### Importing Libraries

In [1]:
# Data handling and plotting libraries used by the cells below.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
# Suppresses every warning for the rest of the session, including library
# deprecation notices; disable this line when debugging or upgrading packages.
filterwarnings('ignore')

### Importing Data

In [2]:
# Relative path: the file is looked up in the current working directory.
TAXI_FARE_CSV = "TaxiFare.csv"
data = pd.read_csv(TAXI_FARE_CSV)

FileNotFoundError: [Errno 2] No such file or directory: 'TaxiFare.csv'

### Cleaning Data

In [None]:
# Parse the pickup column to datetimes once, store it back, then expose its
# day / month / year components as separate columns.
pickup_ts = pd.to_datetime(data['date_time_of_pickup'])
data['date_time_of_pickup'] = pickup_ts
for part in ('day', 'month', 'year'):
    data[part] = getattr(pickup_ts.dt, part)

In [None]:
# These two columns are removed from both derived frames.
non_feature_cols = ['unique_id', 'date_time_of_pickup']

# Frame used for the correlation heatmap: keeps the 'amount' column.
heat_df = data.drop(columns=non_feature_cols)

# Target is 'amount'; the feature matrix is every remaining column.
y = data['amount']
x = heat_df.drop(columns=['amount'])

### Splitting Dataset into train and test subsets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
splits = train_test_split(x, y, test_size=0.2, random_state=0)
x_train, x_test, y_train, y_test = splits

### Using Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline forest with default hyperparameters. fit() returns the estimator
# itself, so construction and training are chained.
regr = RandomForestRegressor(random_state=0).fit(x_train, y_train)
y_pred = regr.predict(x_test)

### Metrics

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

# r2_score is the coefficient of determination, not a classification
# accuracy; it is multiplied by 100 only so it can be shown as a percentage.
acc = r2_score(y_test, y_pred) * 100
mse = mean_squared_error(y_test, y_pred)
print("R2 score is {}%".format(round(acc, 2)))
# MSE is in squared units of the target, so it is printed without a '%' sign.
print("Mean Squared Error is {}".format(round(mse, 2)))

### Actual vs Predicted Values

In [None]:
# Side-by-side view of held-out targets and the model's predictions; the
# frame keeps y_test's index and the predictions are attached positionally.
df = y_test.to_frame('Actual').assign(Predicted=y_pred)
print(df.head(10))

### Visualising Feature Importances

In [None]:
# One importance value per feature the forest was trained on.
feature_importances = regr.feature_importances_
print(feature_importances)

In [None]:
# Order features from most to least important before plotting.
importances = regr.feature_importances_
sorted_idx = importances.argsort()[::-1]
y_values = x.columns[sorted_idx].tolist()

# barplot returns the Axes it drew on, so labels are set on it directly.
ax = sns.barplot(x=importances[sorted_idx], y=y_values, palette="Spectral")
ax.set_title('Feature Importances')
ax.set_xlabel('Importance')
ax.set_ylabel('Features')
plt.show()

In [None]:
# Pairwise correlations of every column in heat_df, annotated to 2 decimals.
corr_matrix = heat_df.corr()
sns.heatmap(corr_matrix, annot=True, fmt='0.2f', linewidths=0.5)

### Using GridSearchCV for hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters for the exhaustive search.
# max_features=1.0 (consider every feature at each split) replaces the old
# 'auto' alias: for RandomForestRegressor it meant the same thing, but it was
# deprecated in scikit-learn 1.1 and removed in 1.3, where passing it makes
# those candidates fail to fit.
param_grid = {
    'max_features': [1.0, 'sqrt'],
    'max_depth': [None, 1, 10],
    'min_samples_split': [2, 5, 8, 10],
    'min_samples_leaf': [1, 2, 4, 6],
    'bootstrap': [True, False],
}

# Candidates are ranked by negated MSE (higher is better) using 3-fold
# cross-validation; n_jobs=-1 uses all available cores.
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=20),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
)

In [None]:
# Fit every parameter combination under cross-validation on the training set.
grid_search.fit(X=x_train, y=y_train)

In [None]:
# Parameter combination that scored best during the grid search.
best_grid_params = grid_search.best_params_
print(best_grid_params)

In [None]:
# Score the grid search's predictions on the held-out test set.
grid_predictions = grid_search.predict(x_test)

# r2_score is the coefficient of determination, not a classification
# accuracy; it is multiplied by 100 only so it can be shown as a percentage.
acc = r2_score(y_test, grid_predictions) * 100
grid_mse = mean_squared_error(y_test, grid_predictions)
print("R2 score is {}%".format(round(acc, 2)))
# MSE is in squared units of the target, so it is printed without a '%' sign.
print("Mean Squared Error is {}".format(round(grid_mse, 2)))

### Using RandomizedSearchCV for hyperparameter tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate hyperparameters sampled by the randomized search.
# max_features=1.0 (consider every feature at each split) replaces the old
# 'auto' alias: for RandomForestRegressor it meant the same thing, but it was
# deprecated in scikit-learn 1.1 and removed in 1.3, where passing it makes
# those candidates fail to fit.
param_grid = {
    'max_features': [1.0, 'sqrt'],
    'max_depth': [None, 1, 5],
    'min_samples_split': [2, 5, 8, 10],
    'min_samples_leaf': [1, 2, 4, 6],
    'bootstrap': [True, False],
}

# Samples 100 combinations from the lists above, ranks them by negated MSE
# (higher is better) with 3-fold cross-validation; random_state fixes which
# combinations are drawn.
random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=40),
    param_distributions=param_grid,
    n_iter=100,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=2,
    random_state=40,
    n_jobs=-1,
)

In [None]:
# Fit the sampled parameter combinations under cross-validation.
random_search.fit(X=x_train, y=y_train)

In [None]:
# Parameter combination that scored best during the randomized search.
best_random_params = random_search.best_params_
print(best_random_params)

In [None]:
# Score the randomized search's predictions on the held-out test set.
random_predictions = random_search.predict(x_test)

# r2_score is the coefficient of determination, not a classification
# accuracy; it is multiplied by 100 only so it can be shown as a percentage.
acc = r2_score(y_test, random_predictions) * 100
random_mse = mean_squared_error(y_test, random_predictions)
print("R2 score is {}%".format(round(acc, 2)))
# MSE is in squared units of the target, so it is printed without a '%' sign.
print("Mean Squared Error is {}".format(round(random_mse, 2)))