# Hyperparameter Adjustment of the Models

In [100]:
import pandas as pd
from joblib import dump
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split

df = pd.read_csv('/home/pikeblessed/proyecto_phnan/deploy-project-datascience/data/df_processed.csv')
X = df.drop(['reach', 'date', 'engagement'], axis=1)
y = df['reach']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.3, random_state=42)

### Ridge Model

In [67]:
model = Ridge(max_iter=10000)

In [68]:
#define hyperparameters
param_grid_ridge = {'alpha': [0.1, 1, 10, 100],  #alpha values
              'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']} #solution methods

In [69]:
scoring = {'R2': 'r2', 
           'MAE': 'neg_mean_absolute_error'}

#search of best hyperparameters
search_ridge = GridSearchCV(estimator=model, param_grid=param_grid_ridge, cv=5, scoring=scoring, refit='MAE').fit(X_train,y_train)

In [70]:
results = search_ridge.cv_results_

print('Best hyperparameters found: ', search_ridge.best_params_)
print('-'*10)
print('Best score of cross valdiation with NMAE: ', search_ridge.best_score_)
print('-'*10)
print('Best test score of cross valdiation with R2: ', results['mean_test_R2'].mean())

Best hyperparameters found:  {'alpha': 100, 'solver': 'saga'}
----------
Best score of cross valdiation with NMAE:  -605.3251045810097
----------
Best test score of cross valdiation with R2:  0.8592899085105762


In [104]:
best_estimator_ridge = search_ridge.best_estimator_

### Random Forest Model

In [74]:
rf = RandomForestRegressor()

In [96]:
param_grid_rf = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20],       # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],   # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],     # Minimum number of samples required to be at a leaf node
    'max_features': ['sqrt', 'log2'],  # Number of features to consider when looking for the best split
    'bootstrap': [True, False]         # Method of sampling observations for building trees
}

In [98]:
search_rf = RandomizedSearchCV(estimator=rf, param_distributions=param_grid_rf, cv=5, scoring=scoring, refit='MAE', error_score='raise').fit(X_train, y_train)

In [99]:
results = search_rf.cv_results_

print('Best hyperparameters found: ', search_rf.best_params_)
print('-'*10)
print('Best score of cross valdiation with NMAE: ', search_rf.best_score_)
print('-'*10)
print('Best test score of cross valdiation with R2: ', results['mean_test_R2'].mean())

Best hyperparameters found:  {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 10, 'bootstrap': True}
----------
Best score of cross valdiation with NMAE:  -548.9208639401179
----------
Best test score of cross valdiation with R2:  0.8715930578486073


In [103]:
best_estimator_rf = search_rf.best_estimator_

### Gradient Boosting

In [86]:
gbr = GradientBoostingRegressor()

In [92]:
param_grid = {
    'n_estimators': [100, 200, 300],           
    'learning_rate': [0.05, 0.1, 0.2],         
    'max_depth': [3, 4, 5],                    
    'min_samples_split': [2, 5, 10],          
    'min_samples_leaf': [1, 2, 4],             
    'max_features': ['sqrt', 'log2', None],  
    'subsample': [0.8, 0.9, 1.0]             
}

search_gbr = RandomizedSearchCV(estimator=gbr,
                        param_distributions=param_grid,
                        scoring=scoring,
                        cv=5, refit='MAE', n_iter=10).fit(X_train, y_train)

In [101]:
results = search_gbr.cv_results_

print('Best hyperparameters found: ', search_gbr.best_params_)
print('-'*10)
print('Best score of cross valdiation with NMAE: ', search_gbr.best_score_)
print('-'*10)
print('Best test score of cross valdiation with R2: ', results['mean_test_R2'].mean())

Best hyperparameters found:  {'subsample': 0.9, 'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 5, 'learning_rate': 0.05}
----------
Best score of cross valdiation with NMAE:  -592.1344450763828
----------
Best test score of cross valdiation with R2:  0.8407701920715096


In [102]:
best_estimator_gbr = search_gbr.best_estimator_

In [111]:
dump(best_estimator_ridge, '../data_science_pipeline/models_adj_hyperparams/ridge_model.pkl')

['../data_science_pipeline/models_adj_hyperparams/ridge_model.pkl']

In [112]:
dump(best_estimator_rf, '../data_science_pipeline/models_adj_hyperparams/rf_model.pkl')

['../data_science_pipeline/models_adj_hyperparams/rf_model.pkl']

In [113]:
dump(best_estimator_gbr, '../data_science_pipeline/models_adj_hyperparams/gbr_model.pkl')

['../data_science_pipeline/models_adj_hyperparams/gbr_model.pkl']