# Block 6 Exercise 2: finding the best parameters for predicting the fare of taxi rides
We return to our Random Forest Regression and want to automatically optimize all free parameters ...

In [1]:
import pandas as pd
import numpy as np
import folium


In [2]:
# Read the cleaned training table saved after wrangling and pre-processing in block I,
# remove the columns that are not used as model inputs, and one-hot encode
# any categorical columns that remain.
drop_columns = [
    'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1',
    'key', 'pickup_datetime', 'pickup_date',
    'pickup_latitude_round3', 'pickup_longitude_round3',
    'dropoff_latitude_round3', 'dropoff_longitude_round3',
]
X = (
    pd.read_csv('../../DATA/train_cleaned.csv')
    .drop(columns=drop_columns)
    .pipe(pd.get_dummies)
)
# Separate the regression target: pop() removes 'fare_amount' from X and returns it.
y = X.pop('fare_amount')

### Scikit Optimize
Scikit Optimize (https://scikit-optimize.github.io/stable/index.html) is an AutoML toolbox wrapped around Scikit-Learn. It allows us to use state-of-the-art automatic hyper-parameter optimization on top of our learning algorithms.



In [3]:
# install 
!pip install scikit-optimize



### E 2.1 Bayesian Optimization of a Random Forest Regression Model
Use Bayesian Optimization with Cross-Validation (https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html#skopt.BayesSearchCV) to find the best regression model. Compare:
* linear regression (https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression) 
* Random Forest regression (https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor)
* and SVM regression (https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html#sklearn.svm.SVR)

NOTES: this can become quite compute intensive! Hence,
* use a smaller subset of the training data to run the experiments 
* think about the range of your parameters (e.g. a larger number of trees in RF or high C-values in SVM will make models expensive)
* optimize only the following parameters per model type:
    * linear: no parameters to optimize
    * RF: #trees and depth
    * SVM: C and gamma (use RBF kernel)
* parallelize -> n_jobs
* use Colab to run the job for up to 12h


In [4]:
# Report the dimensions of the full feature matrix and of the target vector.
print('X: ', X.shape)
print('y: ', y.shape)

X:  (400000, 31)
y:  (400000,)


In [9]:
# Keep only the first 3000 rows so the hyper-parameter searches below stay affordable.
X_cut = X.iloc[:3000]
y_cut = y.iloc[:3000]

print('X: ', X_cut.shape)
print('y: ', y_cut.shape)

X:  (3000, 31)
y:  (3000,)


### preload

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from skopt import BayesSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from skopt.space import Real, Categorical, Integer
from sklearn.datasets import load_iris
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


# Hold out 10% of the subset as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_cut,
    y_cut,
    test_size=0.1,
    random_state=42,
)

### linear regression

In [11]:
%%time
reg = LinearRegression().fit(X_train, y_train)

print('Train socre: ', reg.score(X_train, y_train))
print('Test socre: ', reg.score(X_test, y_test))

Train socre:  0.7857188744561222
Test socre:  0.7829911081690426
Wall time: 20 ms


### Random Forest regression

In [12]:
%%time
opt = BayesSearchCV(RandomForestRegressor(),
     {
         'n_estimators': Integer(1,1000),
         'max_depth': Integer(2,1000),
     },
     n_iter=50,
     random_state=42,
     n_jobs=-1)

# executes bayesian optimization
print(opt.fit(X_train, y_train))

# model can be saved, used for predictions or scoring
#print(opt.score(X_train, y_train))
print('---Best Score---')
print(opt.best_score_)
print('\n---Best Parameter---')
print(opt.best_params_)

BayesSearchCV(estimator=RandomForestRegressor(), n_jobs=-1, random_state=42,
              search_spaces={'max_depth': Integer(low=2, high=1000, prior='uniform', transform='identity'),
                             'n_estimators': Integer(low=1, high=1000, prior='uniform', transform='identity')})
---Best Score---
0.7240056477411404

---Best Parameter---
OrderedDict([('max_depth', 6), ('n_estimators', 818)])
Wall time: 11min 55s


### SVM

In [13]:
%%time
regr_pip = make_pipeline(StandardScaler(),SVR(kernel='rbf')) 

regr_pip.get_params().keys()

Wall time: 0 ns


dict_keys(['memory', 'steps', 'verbose', 'standardscaler', 'svr', 'standardscaler__copy', 'standardscaler__with_mean', 'standardscaler__with_std', 'svr__C', 'svr__cache_size', 'svr__coef0', 'svr__degree', 'svr__epsilon', 'svr__gamma', 'svr__kernel', 'svr__max_iter', 'svr__shrinking', 'svr__tol', 'svr__verbose'])

In [None]:
%%time
opt = BayesSearchCV(estimator=regr_pip,search_spaces =
     {
         'svr__C': Real(0.001,100),
         'svr__gamma': Categorical(['scale', 'auto']),
     },
     n_iter=50,
     random_state=42,
     n_jobs=-1
     )

opt.fit(X_train, y_train)

#print(regr.score(X_test, y_test))
print('---Best Score---')
print(opt.best_score_)
print('\n---Best Parameter---')
print(opt.best_params_)