In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

In [3]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from pprint import pprint
%matplotlib inline

In [4]:
# Setting up the work directory

os.chdir("D:/Edwisor/Cab Fare Prediction")
print(os.getcwd())

D:\Edwisor\Cab Fare Prediction


In [5]:
# Loading data
train = pd.read_csv("train_cab.csv", na_values={"pickup_datetime":"43"})
test = pd.read_csv("test.csv")


**<h2>Understanding the data:</h2>**

In [6]:
train.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1.0
1,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1.0
2,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2.0
3,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1.0
4,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1.0


In [7]:
test.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24 UTC,-73.97332,40.763805,-73.98143,40.743835,1
1,2015-01-27 13:08:24 UTC,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44 UTC,-73.982524,40.75126,-73.979654,40.746139,1
3,2012-12-01 21:12:12 UTC,-73.98116,40.767807,-73.990448,40.751635,1
4,2012-12-01 21:12:12 UTC,-73.966046,40.789775,-73.988565,40.744427,1


In [8]:
print("Shape of training data is: ",train.shape)
print("Shape of test data is: ",test.shape)

Shape of training data is:  (16067, 7)
Shape of test data is:  (9914, 6)


In [9]:
train.dtypes

fare_amount           object
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count      float64
dtype: object

In [10]:
test.dtypes

pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

In [11]:
train.describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,16067.0,16067.0,16067.0,16067.0,16012.0
mean,-72.462787,39.914725,-72.462328,39.897906,2.62507
std,10.578384,6.826587,10.575062,6.187087,60.844122
min,-74.438233,-74.006893,-74.429332,-74.006377,0.0
25%,-73.992156,40.734927,-73.991182,40.734651,1.0
50%,-73.981698,40.752603,-73.980172,40.753567,1.0
75%,-73.966838,40.767381,-73.963643,40.768013,2.0
max,40.766125,401.083332,40.802437,41.366138,5345.0


In [12]:
test.describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,9914.0,9914.0,9914.0,9914.0,9914.0
mean,-73.974722,40.751041,-73.973657,40.751743,1.671273
std,0.042774,0.033541,0.039072,0.035435,1.278747
min,-74.252193,40.573143,-74.263242,40.568973,1.0
25%,-73.992501,40.736125,-73.991247,40.735254,1.0
50%,-73.982326,40.753051,-73.980015,40.754065,1.0
75%,-73.968013,40.767113,-73.964059,40.768757,2.0
max,-72.986532,41.709555,-72.990963,41.696683,6.0


**<h2>Data Cleaning & Missing Value Analysis: </h2>**

In [13]:
# Converting fare amount to numeric
train["fare_amount"] = pd.to_numeric(train["fare_amount"], errors='coerce') # Using coerce will replace any non numeric value with NaN.

In [14]:
train.dtypes

fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count      float64
dtype: object

In [15]:
train.shape

(16067, 7)

In [16]:
train.dropna(subset=["pickup_datetime"]) # Dropping NA values rfom datetime column.

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.841610,40.712278,1.0
1,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1.0
2,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.761270,-73.991242,40.750562,2.0
3,7.7,2012-04-21 04:30:42 UTC,-73.987130,40.733143,-73.991567,40.758092,1.0
4,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1.0
...,...,...,...,...,...,...,...
16062,6.5,2014-12-12 07:41:00 UTC,-74.008820,40.718757,-73.998865,40.719987,1.0
16063,16.1,2009-07-13 07:58:00 UTC,-73.981310,40.781695,-74.014392,40.715527,2.0
16064,8.5,2009-11-11 11:19:07 UTC,-73.972507,40.753417,-73.979577,40.765495,1.0
16065,8.1,2010-05-11 23:53:00 UTC,-73.957027,40.765945,-73.981983,40.779560,1.0


In [17]:
# Change data type of pickup_datetime from object to datetime.
train['pickup_datetime'] = pd.to_datetime(train['pickup_datetime'], format='%Y-%m-%d %H:%M:%S UTC')

In [18]:
#Separate the pickup_datetime columns to different columns i.e year, month, day 

train['year'] = train['pickup_datetime'].dt.year
train['Month'] = train['pickup_datetime'].dt.month
train['Date'] = train['pickup_datetime'].dt.day
train['Day'] = train['pickup_datetime'].dt.dayofweek
train['Hour'] = train['pickup_datetime'].dt.hour
train['Minute'] = train['pickup_datetime'].dt.minute

In [19]:
train.dtypes

fare_amount                 float64
pickup_datetime      datetime64[ns]
pickup_longitude            float64
pickup_latitude             float64
dropoff_longitude           float64
dropoff_latitude            float64
passenger_count             float64
year                        float64
Month                       float64
Date                        float64
Day                         float64
Hour                        float64
Minute                      float64
dtype: object

In [20]:
test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime'],format= "%Y-%m-%d %H:%M:%S UTC")

In [21]:
#Separate the pickup_datetime columns to different columns i.e year, month, day 

test['year'] = test['pickup_datetime'].dt.year
test['Month'] = test['pickup_datetime'].dt.month
test['Date'] = test['pickup_datetime'].dt.day
test['Day'] = test['pickup_datetime'].dt.dayofweek
test['Hour'] = test['pickup_datetime'].dt.hour
test['Minute'] = test['pickup_datetime'].dt.minute

In [22]:
test.dtypes

pickup_datetime      datetime64[ns]
pickup_longitude            float64
pickup_latitude             float64
dropoff_longitude           float64
dropoff_latitude            float64
passenger_count               int64
year                          int64
Month                         int64
Date                          int64
Day                           int64
Hour                          int64
Minute                        int64
dtype: object

In [23]:
# Removing the missing values from datetime rows
train = train.drop(train[train['pickup_datetime'].isnull()].index, axis=0)
print(train.shape)
print(train['pickup_datetime'].isnull().sum())

(16066, 13)
0


Checking the passenger count variable:

In [24]:
train['passenger_count'].describe()

count    16011.000000
mean         2.625171
std         60.846021
min          0.000000
25%          1.000000
50%          1.000000
75%          2.000000
max       5345.000000
Name: passenger_count, dtype: float64

We can see that the maximum number of passenger count is 5345 which is not possible. So reducing
the passenger number to 6.

In [25]:
train = train.drop(train[train['passenger_count']>6].index, axis=0)

In [26]:
# Removing the values with passenger count of 0.
train = train.drop(train[train['passenger_count']==0].index, axis=0)

In [27]:
train['passenger_count'].describe()

count    15934.000000
mean         1.649581
std          1.265943
min          0.120000
25%          1.000000
50%          1.000000
75%          2.000000
max          6.000000
Name: passenger_count, dtype: float64

In [28]:
train['passenger_count'].sort_values(ascending=True)

8862     0.12
0        1.00
9790     1.00
9791     1.00
9792     1.00
         ... 
8076      NaN
8139      NaN
8259      NaN
8306      NaN
16066     NaN
Name: passenger_count, Length: 15989, dtype: float64

In [29]:
# removing the missing values from passenger count rows
train = train.drop(train[train['passenger_count'].isnull()].index, axis=0)
print(train.shape)
print(train['passenger_count'].isnull().sum())

(15934, 13)
0


Removing the fractional value of passenger count as that is not possible.

In [30]:
train = train.drop(train[train['passenger_count']==0.12].index, axis=0)
train.shape

(15933, 13)

Analysing the pickup lattitude and longitude

In [31]:
#As lattitude range is from (-90 to 90) and Longitude range is (-180 to 180), we need to drop the rows outside these ranges.

train[train['pickup_latitude']<-90]
train[train['pickup_latitude']>90]

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,Month,Date,Day,Hour,Minute
5686,3.3,2011-07-30 11:15:00,-73.947235,401.083332,-73.951392,40.778927,1.0,2011.0,7.0,30.0,5.0,11.0,15.0


In [32]:
# Dropping the only value
train = train.drop(train[train['pickup_latitude']<-90].index, axis=0)
train = train.drop(train[train['pickup_latitude']>90].index, axis=0)

In [33]:
train[train['pickup_longitude']<-180]
train[train['pickup_longitude']>180]

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,Month,Date,Day,Hour,Minute


In [34]:
train[train['dropoff_latitude']<-90]
train[train['dropoff_latitude']>90]

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,Month,Date,Day,Hour,Minute


In [35]:
train[train['dropoff_longitude']<-180]
train[train['dropoff_longitude']>180]

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,Month,Date,Day,Hour,Minute


Only one out of range value is present.

In [36]:
train.shape

(15932, 13)

In [37]:
train.isnull().sum()

fare_amount          24
pickup_datetime       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
passenger_count       0
year                  0
Month                 0
Date                  0
Day                   0
Hour                  0
Minute                0
dtype: int64

In [38]:
test.isnull().sum()

pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
year                 0
Month                0
Date                 0
Day                  0
Hour                 0
Minute               0
dtype: int64

Analysing the Fare Amount Variable:

In [39]:
train['fare_amount'].sort_values(ascending=False)

1015    54343.0
1072     4343.0
607       453.0
980       434.0
1335      180.0
         ...   
1712        NaN
2412        NaN
2458        NaN
8178        NaN
8226        NaN
Name: fare_amount, Length: 15932, dtype: float64

In [40]:
Counter(train["fare_amount"]<0)

Counter({False: 15929, True: 3})

In [41]:
train = train.drop(train[train['fare_amount']<0].index, axis=0)
train.shape

(15929, 13)

In [42]:
#Checking for negative values as prices cannot be in negative!
train['fare_amount'].min()

0.0

In [43]:
# Removing rows where fare is zero
train = train.drop(train[train['fare_amount']<1].index, axis=0)
train.shape

(15927, 13)

In [44]:
# Removing outliers from frame amount.

train = train.drop(train[train['fare_amount']>454].index, axis=0)
train.shape

(15925, 13)

In [45]:
# Removing rows of the missing values
train = train.drop(train[train["fare_amount"].isnull()].index, axis=0)
print(train.shape)
print(train['fare_amount'].isnull().sum())

(15901, 13)
0


In [48]:
train['fare_amount'].describe()

count    15901.000000
mean        11.376864
std         10.815059
min          1.140000
25%          6.000000
50%          8.500000
75%         12.500000
max        453.000000
Name: fare_amount, dtype: float64