In [2]:
"""Predict the price of the Uber ride from a given pickup point to the agreed
drop-off location. Perform following tasks:
1. Pre-process the dataset.
2. Identify outliers.
3. Check the correlation.
4. Implement linear regression
5. Evaluate the models and compare their respective scores like R2, RMSE, etc."""

'Predict the price of the Uber ride from a given pickup point to the agreed\ndrop-off location. Perform following tasks:\n1. Pre-process the dataset.\n2. Identify outliers.\n3. Check the correlation.\n4. Implement linear regression and random forest regression models.\n5. Evaluate the models and compare their respective scores like R2, RMSE, etc.'

In [22]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, precision_score, recall_score
from math import *

In [23]:
dataset = pd.read_csv(r"Dataset\uber.csv")
dataset.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [24]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         200000 non-null  int64  
 1   key                200000 non-null  object 
 2   fare_amount        200000 non-null  float64
 3   pickup_datetime    200000 non-null  object 
 4   pickup_longitude   200000 non-null  float64
 5   pickup_latitude    200000 non-null  float64
 6   dropoff_longitude  199999 non-null  float64
 7   dropoff_latitude   199999 non-null  float64
 8   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(2), object(2)
memory usage: 13.7+ MB


In [25]:
dataset.isnull().sum()

Unnamed: 0           0
key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    1
dropoff_latitude     1
passenger_count      0
dtype: int64

In [26]:
dataset = dataset.drop(["Unnamed: 0","key"], axis = 1)

In [27]:
dataset.dropna(inplace = True)

In [28]:
dataset["pickup_datetime"] = pd.to_datetime(dataset["pickup_datetime"])

In [31]:
distance = []
def convertDistance(longitude1, latitude1, longitude2, latitude2):
    for pos in range(len(longitude1)):
        long1, lat1, long2, lat2 = map(radians, [longitude1[pos], latitude1[pos], longitude2[pos], latitude2[pos]])
        dis_long = long2 - long1
        dis_lat = lat2 - lat1
        a = sin(dis_lat/2)**2 + cos(lat1)*cos(lat2)*sin(dis_long/2)**2
        c = 2*asin(sqrt(a))*6371
        distance.append(c)
    return distance

In [32]:
dataset["distance"] = convertDistance(dataset['pickup_longitude'].to_numpy(), dataset['pickup_latitude'].to_numpy(), 
                                      dataset['dropoff_longitude'].to_numpy(), dataset['dropoff_latitude'].to_numpy())

In [39]:
dataset = dataset.assign(pickup_hour = dataset.pickup_datetime.dt.hour,
                         day_of_week = dataset.pickup_datetime.dt.day_of_week,
                        year = dataset.pickup_datetime.dt.year)

In [40]:
dataset.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance,pickup_hour,day_of_week,year
0,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1,1.683323,19,3,2015
1,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.99471,40.750325,1,2.45759,20,4,2009
2,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.74077,-73.962565,40.772647,1,5.036377,21,0,2009
3,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3,1.661683,8,4,2009
4,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5,4.47545,17,3,2014


In [49]:
def findOutliers(dataset):
    Q1 = dataset.quantile(0.25)
    Q3 = dataset.quantile(0.75)
    IQT = Q3 - Q1
    outliers = dataset[((dataset < (Q1 - 1.5*IQT)) | (dataset > (Q3 + 1.5*IQT)))]
    return outliers

In [53]:
outliers = findOutliers(dataset['distance'])
print("The Outlier Number is ", len(outliers))
print("The Max Value of Outlier is ", outliers.max()) 
print("The Min Value of Outlier is ", outliers.min()) 

The Outlier Number is  16755
The Max Value of Outlier is  16409.239135313168
The Min Value of Outlier is  7.865146221381303


In [54]:
outliers = findOutliers(dataset['fare_amount'])
print("The Outlier Number is ", len(outliers))
print("The Max Value of Outlier is ", outliers.max()) 
print("The Min Value of Outlier is ", outliers.min()) 

The Outlier Number is  17166
The Max Value of Outlier is  499.0
The Min Value of Outlier is  -52.0


In [55]:
outliers = findOutliers(dataset['pickup_hour'])
print("The Outlier Number is ", len(outliers))
print("The Max Value of Outlier is ", outliers.max()) 
print("The Min Value of Outlier is ", outliers.min()) 

The Outlier Number is  0
The Max Value of Outlier is  nan
The Min Value of Outlier is  nan


In [56]:
outliers = findOutliers(dataset['year'])
print("The Outlier Number is ", len(outliers))
print("The Max Value of Outlier is ", outliers.max()) 
print("The Min Value of Outlier is ", outliers.min()) 

The Outlier Number is  0
The Max Value of Outlier is  nan
The Min Value of Outlier is  nan


In [59]:
outliers = findOutliers(dataset['passenger_count'])
print("The Outlier Number is ", len(outliers))
print("The Max Value of Outlier is ", outliers.max()) 
print("The Min Value of Outlier is ", outliers.min()) 

The Outlier Number is  22557
The Max Value of Outlier is  208
The Min Value of Outlier is  4


In [64]:
dataset.drop(dataset[dataset["distance"] == 0].index, inplace = True)
dataset.drop(dataset[dataset["distance"] > 60].index, inplace = True)
dataset.drop(dataset[dataset["fare_amount"] <= 0].index, inplace = True)
dataset.drop(dataset[dataset["fare_amount"] > 200].index, inplace = True)

In [66]:
X = dataset[["distance", "pickup_hour",	"day_of_week", "year"]]
y = dataset["fare_amount"]

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [68]:
reg = LinearRegression()
reg.fit(X_train, y_train)

In [69]:
y_pred = reg.predict(X_test)

In [71]:
print("Accuracy is ", r2_score(y_test, y_pred))

Accuracy is  0.7950086244195735
