In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
# Load the ride records from the CSV file and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="uber.csv")
df.head(n=5)

In [None]:
# Show the dtype pandas assigned to each column of df.
df.dtypes

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of df's numeric columns.
df.describe()

In [None]:
# Check for null values in the data-frame. The counts are printed explicitly
# because a bare expression that is not the last statement of a cell is
# never displayed by the notebook.
print(df.isna().sum())

# The original analysis found a single null value in dropoff_longitude and
# dropoff_latitude; drop every row that contains a null.
df.dropna(inplace=True)

In [None]:
# Feature transformation: discard the 'Unnamed: 0' and 'key' columns so they
# are not carried along with the remaining columns.
df.drop(columns=['Unnamed: 0', 'key'], inplace=True)

In [None]:
# Reduce `pickup_datetime` to the hour of the day (0-23), stored as float32,
# and rename the column to `time`.
# `.dt.hour` reads the hour straight from the parsed timestamps instead of
# formatting each one to a string with strftime("%H") and converting it back.
df["pickup_datetime"] = pd.to_datetime(df["pickup_datetime"]).dt.hour.astype("float32")
df = df.rename(columns={"pickup_datetime": "time"})

In [None]:
import math

# add distance between pickup location and dropoff location
# in the df

def euclidean_distance(lat1, lon1, lat2, lon2):
    """Return the straight-line distance between two (lat, lon) points.

    The result is expressed in the same units as the inputs (coordinate
    degrees when given raw latitude/longitude); it is not a great-circle
    distance.

    Scalars as well as NumPy arrays / pandas Series are accepted; for
    array-like inputs the distance is computed element-wise, so whole
    columns can be passed in a single call.

    Parameters
    ----------
    lat1, lon1 : float or array
        Coordinates of the first point(s).
    lat2, lon2 : float or array
        Coordinates of the second point(s).

    Returns
    -------
    float or array
        sqrt((lat2 - lat1)**2 + (lon2 - lon1)**2).
    """
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    # np.hypot broadcasts over arrays and avoids overflow/underflow that
    # squaring the differences by hand can cause.
    return np.hypot(dlat, dlon)

# Add the pickup -> dropoff distance as a new `distance` column.
# Mapping the function over the four coordinate columns avoids constructing a
# Series object for every row, which is what df.apply(..., axis=1) does.
df['distance'] = list(
    map(
        euclidean_distance,
        df['pickup_latitude'],
        df['pickup_longitude'],
        df['dropoff_latitude'],
        df['dropoff_longitude'],
    )
)

In [None]:
# Outlier Analysis
# Plot fare against passenger count to expose implausible passenger counts,
# then keep only the rides with fewer than 150 passengers.
# The frame is passed via `data=` so the call does not depend on the position
# of the data argument in seaborn's signature.
sns.scatterplot(data=df, x="passenger_count", y="fare_amount")
df = df[df["passenger_count"] < 150]

In [None]:
# remove outliers from pickup/dropoff locations
def remove_outliers(feature, factor=1.5):
    """Drop rows of the global ``df`` whose ``feature`` value is an IQR outlier.

    A row is kept when its value lies within
    ``[q1 - factor * iqr, q3 + factor * iqr]`` (both bounds inclusive), where
    q1 and q3 are the 25th and 75th percentiles of the column as it stands
    when the function is called.

    Parameters
    ----------
    feature : str
        Name of the column of ``df`` to filter on.
    factor : float, default 1.5
        Multiplier applied to the inter-quartile range to set the fence width.

    Returns
    -------
    None
        The module-level ``df`` is rebound to the filtered frame.
    """
    global df
    q1, q3 = np.percentile(df[feature], [25, 75])
    fence = factor * (q3 - q1)
    df = df[df[feature].between(q1 - fence, q3 + fence)]
    
# Filter each coordinate column in turn; every pass works on the rows that
# survived the previous one.
for coordinate_column in (
    "pickup_latitude",
    "pickup_longitude",
    "dropoff_latitude",
    "dropoff_longitude",
):
    remove_outliers(coordinate_column)

In [None]:
# Pairwise Pearson correlation between the columns of df.
df.corr(method="pearson")

In [None]:
# Model Fitting
# Linear Regression

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate the target (fare_amount) from the feature columns, then hold out
# 20% of the rows for testing using a fixed seed.
y = df['fare_amount']
X = df.drop(columns='fare_amount')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics (per-column mean and standard deviation) from
# the training split only, then standardize the training features with them.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

# The test split is only transformed, never fitted: fitting the scaler on it
# would leak information about the test data into the preprocessing step.
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression model on the scaled training data.
model = LinearRegression().fit(X_train_scaled, y_train)

# Predict fares for the held-out test rows.
y_pred = model.predict(X_test_scaled)

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import root_mean_squared_error

# Report goodness of fit (R^2) and error magnitude (RMSE) on the test split.
for label, metric in (("R2 score: ", r2_score), ("RMSE: ", root_mean_squared_error)):
    print(label, metric(y_test, y_pred))

In [15]:
# Random Forest Regression

from sklearn.ensemble import RandomForestRegressor

# Random forest regression model.
# Training takes noticeably longer than for the linear model.
# random_state pins the randomness used while growing the trees so the scores
# are reproducible across runs; n_jobs=-1 trains the trees on all CPU cores.
model = RandomForestRegressor(random_state=42, n_jobs=-1)
model.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = model.predict(X_test_scaled)

In [16]:
from sklearn.metrics import r2_score
from sklearn.metrics import root_mean_squared_error

# Evaluate the most recent predictions in y_pred against the held-out fares.
test_r2 = r2_score(y_test, y_pred)
print("R2 score: ", test_r2)
test_rmse = root_mean_squared_error(y_test, y_pred)
print("RMSE: ", test_rmse)

R2 score:  0.5411579883434852
RMSE:  3.265543568331403
