# Cross Validation

In [240]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Read the processed dataset (df_processed.csv) into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where that exact directory exists — consider a project-relative path.
df = pd.read_csv(
    '/home/pikeblessed/proyecto_phnan/deploy-project-datascience/data/df_processed.csv'
)

# Target: the `reach` column.
y = df['reach']

# Features: every remaining column except `date` and `engagement`
# (the target `reach` is dropped as well).
X = df.drop(columns=['reach', 'date', 'engagement'])

### Ridge Model

In [241]:
# Baseline: 5-fold cross-validation of Ridge regression (alpha=10) on X / y.
# No `scoring` is passed, so the estimator's default scorer is used.
# Train scores are requested too so they can be compared with test scores.
cross_v_ridge = cross_validate(
    estimator=Ridge(alpha=10),
    X=X,
    y=y,
    cv=5,
    return_train_score=True,
)

In [242]:
# Ridge scores averaged over the CV folds: train on the first line, test on the second.
print(
    cross_v_ridge['train_score'].mean(),
    cross_v_ridge['test_score'].mean(),
    sep='\n',
)

0.9010187594488759
0.7870649569245625


### Decision Tree

In [186]:
# Baseline: 5-fold cross-validation of a decision tree with default
# hyperparameters; train scores are kept so over-fitting is visible.
# random_state is pinned because the tree picks among equally good splits at
# random; without a fixed seed the scores can change from run to run.
cross_v_tree = cross_validate(
    DecisionTreeRegressor(random_state=42),
    X,
    y,
    cv=5,
    return_train_score=True,
)

In [187]:
# Decision tree scores averaged over the CV folds: train first, then test.
print(
    cross_v_tree['train_score'].mean(),
    cross_v_tree['test_score'].mean(),
    sep='\n',
)

0.9999799985520227
0.6893778191115757


### Random Forest

In [188]:
# Baseline: 5-fold cross-validation of a random forest with default
# hyperparameters; train scores are kept so over-fitting is visible.
# random_state is pinned because the forest draws bootstrap samples and
# candidate features at random; without a fixed seed every run of this cell
# reports different scores.
cross_v_rf = cross_validate(
    RandomForestRegressor(random_state=42),
    X,
    y,
    cv=5,
    return_train_score=True,
)

In [189]:
# Random forest scores averaged over the CV folds: train first, then test.
print(
    cross_v_rf['train_score'].mean(),
    cross_v_rf['test_score'].mean(),
    sep='\n',
)

0.9816207796232057
0.8003298535985319


### Gradient Boosting

In [190]:
# Baseline: 5-fold cross-validation of gradient boosting with default
# hyperparameters; train scores are kept so over-fitting is visible.
# random_state is pinned because it seeds the individual trees built at each
# boosting iteration; without a fixed seed the scores are not guaranteed to
# repeat across runs.
cross_v_gbr = cross_validate(
    GradientBoostingRegressor(random_state=42),
    X,
    y,
    cv=5,
    return_train_score=True,
)

In [191]:
# Gradient boosting scores averaged over the CV folds: train first, then test.
print(
    cross_v_gbr['train_score'].mean(),
    cross_v_gbr['test_score'].mean(),
    sep='\n',
)

0.9888342695498945
0.795459464185128


## Applying Cross Validation to Best Models

In [192]:
from joblib import load

In [193]:
# Directory containing the saved models (presumably the hyperparameter-tuned
# ones, judging by the folder name). The trailing slash is required because
# the file names are appended by plain string concatenation.
folder_path = "../data_science_pipeline/models_adj_hyperparams/"

# Model files to load, in order: gradient boosting, random forest, ridge.
model_names = [
    'gbr_model.pkl',
    'rf_model.pkl',
    'ridge_model.pkl',
]

In [194]:
# Load every file named in `model_names` from `folder_path`, preserving order
# so that position i in `loaded_models` corresponds to `model_names[i]`.
# NOTE(review): joblib's `load` unpickles the file, which can execute arbitrary
# code — only load model files that come from a trusted source.
loaded_models = [load(folder_path + model_name) for model_name in model_names]

### Ridge Model

In [203]:
# 5-fold cross-validation of the model loaded from 'ridge_model.pkl'
# (index 2 in `loaded_models`, following the order of `model_names`).
cross_v_ridge_2 = cross_validate(
    estimator=loaded_models[2],
    X=X,
    y=y,
    cv=5,
    return_train_score=True,
)

In [204]:
# Loaded Ridge model: scores averaged over the CV folds, train first, then test.
print(
    cross_v_ridge_2['train_score'].mean(),
    cross_v_ridge_2['test_score'].mean(),
    sep='\n',
)

0.9011085832362342
0.7858874532420254


### Random Forest

In [197]:
# 5-fold cross-validation of the model loaded from 'rf_model.pkl'
# (index 1 in `loaded_models`, following the order of `model_names`).
# NOTE(review): this rebinds `cross_v_rf`, which the baseline Random Forest
# section also assigns, so the baseline results are unreachable after this cell.
cross_v_rf = cross_validate(
    estimator=loaded_models[1],
    X=X,
    y=y,
    cv=5,
    return_train_score=True,
)

In [198]:
# Loaded random forest model: scores averaged over the CV folds, train first, then test.
print(
    cross_v_rf['train_score'].mean(),
    cross_v_rf['test_score'].mean(),
    sep='\n',
)

0.9569286886773926
0.7705868115480831


### Gradient Boosting

In [199]:
# 5-fold cross-validation of the model loaded from 'gbr_model.pkl'
# (index 0 in `loaded_models`, following the order of `model_names`).
# NOTE(review): this rebinds `cross_v_gbr`, which the baseline Gradient Boosting
# section also assigns, so the baseline results are unreachable after this cell.
cross_v_gbr = cross_validate(
    estimator=loaded_models[0],
    X=X,
    y=y,
    cv=5,
    return_train_score=True,
)

In [200]:
# Loaded gradient boosting model: scores averaged over the CV folds, train first, then test.
print(
    cross_v_gbr['train_score'].mean(),
    cross_v_gbr['test_score'].mean(),
    sep='\n',
)

0.9919303793336596
0.7666323995001207
