### Manual

- Changing Parameters 
- Choose the model to optimize
- search the documentation
- identify possible adjustments
- test combinations one by one by iterating through lists

### GridSearchCV (when we have a lot of time)

- Grid optimization
- Define one or more metrics that we want to optimize
- identify the possible values that the parameters may have
- create a parameter dictionary
- use cross validation


### RandomizedSearchCV (when we don't have much time)

- Random optimization
- Define one or more metrics that we want to optimize
- identify the ranges of values values that can take certain parameters
- create a value range dictionary
- use cross validation


<img src="https://miro.medium.com/max/1004/0*yDmmJmvRowl0cSN8.png">

In [1]:
import pandas as pd 
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [2]:
df = pd.read_csv('../Datasets/Week9/world_happiness.csv')
df.head()

Unnamed: 0,country,rank,score,high,low,gdp,family,lifexp,freedom,generosity,corruption,dystopia
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707
2,Iceland,3,7.504,7.62203,7.38597,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,2.322715
3,Switzerland,4,7.494,7.561772,7.426227,1.56498,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716
4,Finland,5,7.469,7.527542,7.410458,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 12 columns):
country       155 non-null object
rank          155 non-null int64
score         155 non-null float64
high          155 non-null float64
low           155 non-null float64
gdp           155 non-null float64
family        155 non-null float64
lifexp        155 non-null float64
freedom       155 non-null float64
generosity    155 non-null float64
corruption    155 non-null float64
dystopia      155 non-null float64
dtypes: float64(10), int64(1), object(1)
memory usage: 14.7+ KB


In [4]:
X = df.iloc[:, 3:]
y = df.iloc[:, 3]


reg = RandomForestRegressor()

params = {
    'n_estimators': range(4,16),
    'criterion': ['mse', 'mae'],
    'max_depth': range(2,12)
} 


rand_est = RandomizedSearchCV(reg, params, n_iter=10, cv=3, scoring='neg_mean_squared_error', iid=True).fit(X,y)

print(rand_est.best_estimator_)
print(rand_est.best_params_)
print(rand_est.best_score_)
print(rand_est.predict(X.loc[[0]]))

RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=11,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=9, n_jobs=None,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)
{'n_estimators': 9, 'max_depth': 11, 'criterion': 'mae'}
-0.5767009696478449
[7.57753926]


In [5]:
reg = RandomForestRegressor()

params = {
    'n_estimators': [4,8,16],
    'criterion': ['mse', 'mae'],
    'max_depth':[2,3,4,6,8]
} 


grid_est = GridSearchCV(reg, params, cv=3, scoring='neg_mean_squared_error',iid=True).fit(X,y)

print(grid_est.best_estimator_)
print(grid_est.best_params_)
print(grid_est.best_score_)
print(grid_est.predict(X.loc[[0]]))

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=8, n_jobs=None,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)
{'criterion': 'mse', 'max_depth': 6, 'n_estimators': 8}
-0.5807896341214042
[7.59484835]


In [6]:
reg = SVR()

params = { 
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf'],
    'C': [1, 1.1],
    'epsilon': [0.1, 0.5]
} 


grid_est = GridSearchCV(reg, params, cv=3, scoring='neg_mean_squared_error',iid=True).fit(X,y)

print(grid_est.best_estimator_)
print(grid_est.best_params_)
print(grid_est.best_score_)
print(grid_est.predict(X.loc[[0]]))

SVR(C=1, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
{'C': 1, 'epsilon': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
-0.006498826474663803
[7.60467064]


In [7]:
from sklearn.metrics import r2_score
svr = SVR(C= 1, epsilon=0.1, gamma='scale', kernel='linear').fit(X,y)
y_pred=svr.predict(X)
r2_score(y_pred,y)

0.9974924787913797