In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [37]:
# Load the raw taxi-fare table from the CSV that sits next to the notebook.
data = pd.read_csv(filepath_or_buffer="TaxiFare.csv")

In [39]:
# Preview the first five rows to sanity-check the columns that were loaded.
data.head(n=5)


Unnamed: 0,unique_id,amount,date_time_of_pickup,longitude_of_pickup,latitude_of_pickup,longitude_of_dropoff,latitude_of_dropoff,no_of_passenger
0,26:21.0,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,52:16.0,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,35:00.0,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,30:42.0,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,51:00.0,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [41]:
# Count missing values per column (isna is the same check as isnull).
data.isna().sum()

unique_id               0
amount                  0
date_time_of_pickup     0
longitude_of_pickup     0
latitude_of_pickup      0
longitude_of_dropoff    0
latitude_of_dropoff     0
no_of_passenger         0
dtype: int64

In [43]:
# Remove the two columns that are not used as model inputs below.
# Rebinding `data` (instead of dropping with inplace=True) together with
# errors='ignore' keeps this cell idempotent: the in-place drop raised KeyError
# when the cell was executed a second time, because the columns were already gone.
data = data.drop(columns=['unique_id', 'date_time_of_pickup'], errors='ignore')

In [45]:
# Target is the `amount` column; the feature matrix is every remaining column.
y = data['amount']
x = data.drop(columns='amount')

In [47]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [49]:
# Baseline model: a random forest with every hyperparameter left at the library
# default; only the seed is pinned so repeated runs build the same forest.
rf = RandomForestRegressor(random_state=42)

In [51]:
# Train the baseline forest on the training split.
rf.fit(X=x_train, y=y_train)

In [60]:
# Baseline predictions for the held-out rows.
y_pred = rf.predict(X=x_test)

In [62]:
# Error metrics for the baseline model on the held-out split.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # same units as the target, unlike the squared error

# Print one "label value" line per metric, in the same order as before.
for metric_label, metric_value in (
    ('Mean Absolute Error: ', mae),
    ('Mean Squared Error: ', mse),
    ('Root Mean Squared Error: ', rmse),
):
    print(metric_label, metric_value)

Mean Absolute Error:  2.3545234655463587
Mean Squared Error:  25.000861269493686
Root Mean Squared Error:  5.000086126207596


In [64]:
# Coefficient of determination for the baseline predictions.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print('R2 Score: ', r2)

R2 Score:  0.731089434077931


In [66]:
# Hyperparameter search space: 4 x 4 x 4 = 64 candidate combinations.
param_grid = dict(
    n_estimators=[10, 50, 100, 150],
    max_depth=[None, 5, 10, 15],
    min_samples_split=[2, 5, 8, 10],
)

In [68]:
# Fresh, unfitted forest used as the template estimator for the grid search,
# kept separate from the already-fitted baseline `rf`.
rf1 = RandomForestRegressor(random_state=42)

In [70]:
# Exhaustive search over param_grid with 5-fold cross-validation.
# n_jobs=-1 requests all available cores; verbose=2 logs progress per fit.
grid_search = GridSearchCV(
    estimator=rf1,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [72]:
# Run the search on the training split only; the test split stays untouched.
grid_search.fit(X=x_train, y=y_train)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


In [73]:
# Show the hyperparameter combination the grid search selected.
print('Best Parameters: ', grid_search.best_params_)

Best Parameters:  {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 150}


In [74]:
# Take the estimator the search selected and score the held-out rows with it.
best_model = grid_search.best_estimator_
pred = best_model.predict(X=x_test)

In [75]:
# Error metrics for the tuned model on the held-out split.
# NOTE: this rebinds mae / mse / rmse, replacing the baseline values.
mae = mean_absolute_error(y_true=y_test, y_pred=pred)
mse = mean_squared_error(y_true=y_test, y_pred=pred)
rmse = np.sqrt(mse)  # same units as the target, unlike the squared error

# Print one "label value" line per metric, in the same order as before.
for metric_label, metric_value in (
    ('Mean Absolute Error: ', mae),
    ('Mean Squared Error: ', mse),
    ('Root Mean Squared Error: ', rmse),
):
    print(metric_label, metric_value)

Mean Absolute Error:  2.3357314979818207
Mean Squared Error:  24.42188966216272
Root Mean Squared Error:  4.941850833661688


In [76]:
# Coefficient of determination for the tuned model's predictions.
# NOTE: this rebinds r2, replacing the baseline value.
r2 = r2_score(y_true=y_test, y_pred=pred)
print('R2 Score: ', r2)

R2 Score:  0.7373168828406724


In [77]:
# folium is a third-party package; the recorded output of this cell shows it is
# not installed in the kernel's environment, so the import fails.
# NOTE(review): install it in the kernel environment (e.g. `%pip install folium`)
# and consider moving this import to the import cell at the top of the notebook.
import folium

ModuleNotFoundError: No module named 'folium'