In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [2]:
# Load the taxi-fare dataset from a CSV file; the relative path means the file
# is looked up in the current working directory.
data = pd.read_csv('TaxiFare.csv')

In [3]:
# Preview the loaded frame; the rendered output elides the middle rows.
data

Unnamed: 0,unique_id,amount,date_time_of_pickup,longitude_of_pickup,latitude_of_pickup,longitude_of_dropoff,latitude_of_dropoff,no_of_passenger
0,26:21.0,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.841610,40.712278,1
1,52:16.0,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,35:00.0,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.761270,-73.991242,40.750562,2
3,30:42.0,7.7,2012-04-21 04:30:42 UTC,-73.987130,40.733143,-73.991567,40.758092,1
4,51:00.0,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1
...,...,...,...,...,...,...,...,...
49995,25:15.0,15.0,2013-06-12 23:25:15 UTC,-73.999973,40.748531,-74.016899,40.705993,1
49996,19:18.0,7.5,2015-06-22 17:19:18 UTC,-73.984756,40.768211,-73.987366,40.760597,1
49997,53:00.0,6.9,2011-01-30 04:53:00 UTC,-74.002698,40.739428,-73.998108,40.759483,1
49998,09:00.0,4.5,2012-11-06 07:09:00 UTC,-73.946062,40.777567,-73.953450,40.779687,2


In [4]:
# Count missing values per column (`isna` is the canonical spelling of `isnull`).
data.isna().sum()

unique_id               0
amount                  0
date_time_of_pickup     0
longitude_of_pickup     0
latitude_of_pickup      0
longitude_of_dropoff    0
latitude_of_dropoff     0
no_of_passenger         0
dtype: int64

In [5]:
# Inspect the column dtypes. Per the output, `unique_id` and
# `date_time_of_pickup` are `object` columns (the timestamp is not parsed into
# a datetime dtype); every other column is numeric.
data.dtypes

unique_id                object
amount                  float64
date_time_of_pickup      object
longitude_of_pickup     float64
latitude_of_pickup      float64
longitude_of_dropoff    float64
latitude_of_dropoff     float64
no_of_passenger           int64
dtype: object

In [6]:
# Summary statistics for the numeric columns.
# NOTE(review): the output shows implausible values - a negative minimum fare,
# a pickup latitude of about 401, and rides with zero passengers. No visible
# cell filters these rows out before the models are trained.
data.describe()

Unnamed: 0,amount,longitude_of_pickup,latitude_of_pickup,longitude_of_dropoff,latitude_of_dropoff,no_of_passenger
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,11.364171,-72.509756,39.933759,-72.504616,39.926251,1.66784
std,9.685557,10.39386,6.224857,10.40757,6.014737,1.289195
min,-5.0,-75.423848,-74.006893,-84.654241,-74.006377,0.0
25%,6.0,-73.992062,40.73488,-73.991152,40.734372,1.0
50%,8.5,-73.98184,40.752678,-73.980082,40.753372,1.0
75%,12.5,-73.967148,40.76736,-73.963584,40.768167,2.0
max,200.0,40.783472,401.083332,40.851027,43.41519,6.0


In [7]:
# Remove the two non-numeric columns (`unique_id`, `date_time_of_pickup`) so
# that only numeric columns remain for modelling.
# The result is reassigned instead of using `inplace=True`, and
# `errors='ignore'` skips labels that are already gone, so re-running this
# cell on an already-trimmed frame is a no-op rather than a KeyError.
data = data.drop(columns=['unique_id', 'date_time_of_pickup'], errors='ignore')

In [8]:
# Re-check the per-column missing-value counts on the trimmed frame.
data.isna().sum()

amount                  0
longitude_of_pickup     0
latitude_of_pickup      0
longitude_of_dropoff    0
latitude_of_dropoff     0
no_of_passenger         0
dtype: int64

In [9]:
# Separate the regression target (`amount`) from the predictor columns.
y = data['amount']
x = data.drop(columns='amount')

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

In [11]:
# Baseline model: a random forest with a fixed seed and no other
# hyperparameters overridden.
rf = RandomForestRegressor(random_state=42)

In [12]:
# Train the baseline forest on the training split.
rf.fit(x_train, y_train)

In [13]:
# Predict fares for the held-out test split with the baseline model.
y_pred = rf.predict(x_test)

In [14]:
# Error metrics of the baseline predictions on the test split.
# RMSE is derived from MSE so the two always agree.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)

for label, value in (
    ('Mean Absolute Error: ', mae),
    ('Mean Squared Error: ', mse),
    ('Root Mean Squared Error: ', rmse),
):
    print(label, value)

Mean Absolute Error:  2.3545234655463587
Mean Squared Error:  25.000861269493686
Root Mean Squared Error:  5.000086126207596


In [15]:
# R2 score of the baseline predictions on the test split.
r2 = r2_score(y_test, y_pred)
print('R2 Score: ', r2)

R2 Score:  0.731089434077931


In [16]:
# Hyperparameter search space for the random forest:
# 4 x 4 x 4 = 64 candidate combinations.
param_grid = dict(
    n_estimators=[10, 50, 100, 150],
    max_depth=[None, 5, 10, 15],
    min_samples_split=[2, 5, 8, 10],
)

In [17]:
# Fresh estimator to hand to the grid search; same seed as the baseline model.
rf1 = RandomForestRegressor(random_state=42)

In [18]:
# Exhaustive search over `param_grid` with 5-fold cross-validation.
# `n_jobs=-1` requests all available processors; `verbose=2` logs progress.
grid_search = GridSearchCV(
    estimator=rf1,
    param_grid=param_grid,
    cv=5,
    verbose=2,
    n_jobs=-1,
)

In [19]:
# Run the search on the training split only. Per the logged output this is
# 64 candidates x 5 folds = 320 fits, so expect this cell to be slow.
grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


In [20]:
# Show the hyperparameter combination selected by the search.
print('Best Parameters: ', grid_search.best_params_)

Best Parameters:  {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 150}


In [21]:
# Take the search's best estimator and predict the held-out test split with it.
best_model = grid_search.best_estimator_
pred = best_model.predict(x_test)

In [22]:
# Error metrics of the tuned model on the test split.
# This rebinds `mae`, `mse` and `rmse`, replacing the baseline values.
mse = mean_squared_error(y_test, pred)
mae = mean_absolute_error(y_test, pred)
rmse = np.sqrt(mse)

for label, value in (
    ('Mean Absolute Error: ', mae),
    ('Mean Squared Error: ', mse),
    ('Root Mean Squared Error: ', rmse),
):
    print(label, value)

Mean Absolute Error:  2.3357314979818207
Mean Squared Error:  24.42188966216272
Root Mean Squared Error:  4.941850833661688


In [23]:
# R2 score of the tuned model on the test split (rebinds `r2`).
r2 = r2_score(y_test, pred)
print('R2 Score: ', r2)

R2 Score:  0.7373168828406724


In [24]:
import folium

In [25]:
# Base map settings. The centre is given as [latitude, longitude]; it matches
# the drop-off point of the first row shown in the data preview.
zoom_level = 10
map_center = [40.712278, -73.84161]

map_object = folium.Map(zoom_start=zoom_level, location=map_center)

In [26]:
# Marker positions, written as (latitude, longitude) pairs and transposed into
# the two parallel lists that the marker loop zips back together.
# NOTE: `latitutde_list` is misspelled, but the name is kept because the
# marker loop refers to it.
latitutde_list, longitude_list = map(
    list,
    zip(
        (40.712278, -73.84161),   # first marker  (lat, lon)
        (40.782004, -73.979268),  # second marker (lat, lon)
    ),
)

In [27]:
# Drop one marker per (latitude, longitude) pair onto the map; each popup
# shows the marker's coordinates.
for coords in zip(latitutde_list, longitude_list):
    lat, lon = coords
    marker = folium.Marker(list(coords), popup=f'Lat: {lat}, Lon: {lon}')
    marker.add_to(map_object)

# Last expression of the cell, so the notebook renders the map.
map_object