In [None]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt

import pylab
import math

from sklearn.model_selection import train_test_split
from sklearn import metrics 
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the dataset from "uber.csv" (path is relative to the working directory)
# and preview the first rows.
df = pd.read_csv("uber.csv")
df.head()

In [None]:
# Print a summary of the freshly loaded frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Descriptive statistics for the numeric columns of the raw data.
df.describe()

In [None]:
# Remove the 'Unnamed: 0' and 'key' columns; they are not used in the rest
# of the analysis.
unused_columns = ['Unnamed: 0', 'key']
df = df.drop(columns=unused_columns)
df.head()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Fill the missing drop-off coordinates: latitude with the column mean,
# longitude with the column median.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df[...]: the in-place form operates on a
# temporary Series and is not guaranteed to update `df` (it warns in
# pandas 2.x and has no effect under Copy-on-Write).
df['dropoff_latitude'] = df['dropoff_latitude'].fillna(df['dropoff_latitude'].mean())
df['dropoff_longitude'] = df['dropoff_longitude'].fillna(df['dropoff_longitude'].median())

# Confirm that no missing values remain.
df.isnull().sum()

In [None]:
# Inspect the column dtypes before converting pickup_datetime.
df.dtypes

In [None]:
# Convert pickup_datetime to a datetime dtype; values that cannot be parsed
# become NaT because of errors='coerce'.
parsed_pickup = pd.to_datetime(df['pickup_datetime'], errors='coerce')
df['pickup_datetime'] = parsed_pickup
df.dtypes

In [None]:
# Derive calendar features from the pickup timestamp.
pickup_dt = df.pickup_datetime.dt
df = df.assign(
    hour=pickup_dt.hour,
    day=pickup_dt.day,
    month=pickup_dt.month,
    year=pickup_dt.year,
    dayofweek=pickup_dt.dayofweek,
)
df.head()

In [None]:
# The timestamp has been expanded into hour/day/month/year/dayofweek above,
# so the original 'pickup_datetime' column is removed.
df = df.drop(columns='pickup_datetime')
df.dtypes

In [None]:
# Box plot of every column (one subplot each) to inspect outliers before treatment.
df.plot(kind="box",subplots=True,layout=(7,2),figsize=(15,20))

# Outlier treatment: the helpers below use the inter-quartile range (IQR)
# to cap values at the box-plot whiskers.
def remove_outlier(df1,col):
    """Cap the values of one column at its box-plot whiskers (IQR rule).

    Values below ``Q1 - 1.5*IQR`` or above ``Q3 + 1.5*IQR`` are replaced by
    the respective whisker value; no rows are removed.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame holding the column. It is modified in place.
    col : column label
        Name of the column to clip.

    Returns
    -------
    pandas.DataFrame
        The same ``df1`` object, with ``col`` clipped.
    """
    Q1 = df1[col].quantile(0.25)
    Q3 = df1[col].quantile(0.75)
    IQR = Q3-Q1
    lower_whisker = Q1-1.5*IQR
    upper_whisker = Q3+1.5*IQR
    # Write the clipped column back into the frame that was passed in.
    # Assigning to the global `df` here would leave `df1` untouched and
    # would fail outright when no global `df` exists.
    df1[col] = np.clip(df1[col],lower_whisker,upper_whisker)
    return df1

def treat_outliers_all(df1,col_list):
    """Apply ``remove_outlier`` to every column named in ``col_list``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame whose columns are clipped.
    col_list : iterable of column labels
        Columns to treat. Iterating a DataFrame yields its column labels,
        so a DataFrame can be passed here as well.

    Returns
    -------
    pandas.DataFrame
        The frame returned by the last ``remove_outlier`` call, or ``df1``
        unchanged when ``col_list`` is empty.
    """
    for c in col_list:
        # Operate on the frame passed in, not on a global.
        df1 = remove_outlier(df1,c)
    # Return only after the loop so that every column is processed.
    return df1

# Run the IQR treatment, offering every column of the frame as a candidate.
columns_to_treat = list(df.columns)
df = treat_outliers_all(df, columns_to_treat)

# Re-draw the box plots to inspect the columns after the treatment.
box_plot_options = dict(kind="box", subplots=True, layout=(7, 2), figsize=(15, 20))
df.plot(**box_plot_options)

In [None]:
# Summary of the frame after the outlier treatment step.
df.info()

In [None]:
# Drop rows with impossible coordinates: latitude must lie within [-90, 90]
# and longitude within [-180, 180].
bad_latitude = ((df["pickup_latitude"] > 90) | (df["pickup_latitude"] < -90) |
                (df["dropoff_latitude"] > 90) | (df["dropoff_latitude"] < -90))
bad_longitude = ((df["pickup_longitude"] > 180) | (df["pickup_longitude"] < -180) |
                 (df["dropoff_longitude"] > 180) | (df["dropoff_longitude"] < -180))
incorrect_coordinates = df[bad_latitude | bad_longitude].index
df = df.drop(incorrect_coordinates)

# drop=True discards the old index. Without it the old index is inserted as
# an extra "index" column, which would leak into whole-frame steps such as
# df.corr() and make this cell behave differently when it is re-run.
df = df.reset_index(drop=True)
df.info()

In [None]:
# pip install haversine
import haversine as hs

# Trip length is the great-circle (Haversine) distance between the pickup
# and drop-off points. Euclidean distance is not suitable because it
# assumes a flat surface.
# hs.haversine expects each point as a (latitude, longitude) tuple.
travel_dist = [
    hs.haversine((pickup_lat, pickup_lon), (dropoff_lat, dropoff_lon))
    for pickup_lon, pickup_lat, dropoff_lon, dropoff_lat in zip(
        df['pickup_longitude'], df['pickup_latitude'],
        df['dropoff_longitude'], df['dropoff_latitude'],
    )
]

travel_dist_df = pd.DataFrame(travel_dist)
travel_dist_df.head()

In [None]:
# Attach the computed trip distance as a new column.
df['dist_travel_km'] = travel_dist_df

# Uber doesn't travel over 130 km, so keep only trips between 1 km and 130 km.
# Both bounds must hold at once, hence `&`: with `|` every non-missing
# distance satisfies at least one side and no row is ever removed.
df = df.loc[(df.dist_travel_km>=1) & (df.dist_travel_km<=130)]
print("Remaining observastions in the dataset :", df.shape)

# Correlation heatmap (light values mean highly correlated)
fig, axis = plt.subplots(figsize=(10,6))
sns.heatmap(df.corr(), annot=True)

In [None]:
# Feature matrix: coordinates, passenger count and the calendar features.
# NOTE(review): 'dist_travel_km' is computed earlier but is not part of the
# feature set — confirm whether that is intended.
feature_columns = [
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
    'passenger_count',
    'hour', 'day', 'month', 'year', 'dayofweek',
]
X = df[feature_columns]

# Target: the fare to predict.
y = df['fare_amount']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation. A fixed random_state makes the
# split, and therefore every metric reported below, reproducible across
# "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares baseline on the training split
# (fit() returns the fitted estimator itself).
lin_reg = LinearRegression().fit(X_train, y_train)

# Report the fitted intercept and the per-feature coefficients.
print("Intercept :", lin_reg.intercept_)
print("\nLinear Coefficients :", lin_reg.coef_)

print()
# Predicted fares for the held-out rows, followed by the true values.
prediction = lin_reg.predict(X_test)
print(prediction)
y_test

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the linear model on the test split: R2 first, then the root of the
# mean squared error.
print("R2 Score :", r2_score(y_test, prediction))

MSE = mean_squared_error(y_test, prediction)
RMSE = np.sqrt(MSE)
print("RMSE :", RMSE)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# n_estimators is the number of trees built before their predictions are
# averaged. random_state is fixed so that the bootstrap samples and feature
# choices, and therefore the scores below, are reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predicted fares for the held-out rows.
y_pred = rf.predict(X_test)
y_pred

In [None]:
# Score the random forest on the test split with the same two metrics used
# for the linear model.
R2_Random = r2_score(y_test, y_pred)
MSE_Random = mean_squared_error(y_test, y_pred)
RMSE_Random = np.sqrt(MSE_Random)

print("R2 Score :", R2_Random)
print("RMSE :", RMSE_Random)