In [1]:
# Importamos las librerías necesarias
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, KFold

# Implementación de K-Folds Cross Validation

In [2]:
# Cargamos los datos
df = pd.read_csv('../Data/2019.csv')
df.head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341
3,4,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298


In [4]:
# Seleccionamos las variables predictoras y la variable objetivo
X = df.drop(['Country or region', 'Score'], axis=1)
y = df['Score']

In [7]:
# Implementación sencilla
# Creamos el modelo
print('---- Easy Implementation ----')
model = DecisionTreeRegressor()
score = cross_val_score(model, X, y, cv=3, scoring='neg_mean_squared_error')
print(f'''
     Score: {score}
Mean Score: {np.abs(score.mean())}''')
print('-'*30)

---- Easy Implementation ----

     Score: [-0.77182115 -0.1550394  -0.66204137]
Mean Score: 0.5296339743589743
------------------------------


In [9]:
%run "Funciones Auxiliares/Funciones_Auxiliares.ipynb"

In [14]:
# Implementación completa
from sklearn.metrics import mean_squared_error
print('---- Full Implementation ----')
# Creamos el modelo
model = DecisionTreeRegressor()
# Creamos el objeto KFold
kf = KFold(n_splits=3, shuffle=True, random_state=42)
# Iteramos
scores = []
for n_fold, (train, test) in enumerate(kf.split(X)):
    print(f'Fold {n_fold + 1}')
    # Creamos los conjuntos de entrenamiento y validación
    X_train, X_test, y_train, y_test = train_test_split_kf(X.values, y.values, train, test)
    # Evaluamos el modelo
    score = evaluate_model(model, mean_squared_error, X_train, X_test, y_train, y_test)
    scores.append(score)
    print(f'Score: {score}')
    print('-'*30)
print(f'Mean Score: {np.abs(np.mean(scores))}')

---- Full Implementation ----
Fold 1
Score: 0.005480423076923086
------------------------------
Fold 2
Score: 0.006564826923076931
------------------------------
Fold 3
Score: 0.006020153846153837
------------------------------
Mean Score: 0.006021801282051285


# Implementación de Randomized

In [16]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# verificamos nuestro dataset
df

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.769,1.340,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.600,1.383,1.573,0.996,0.592,0.252,0.410
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341
3,4,Iceland,7.494,1.380,1.624,1.026,0.591,0.354,0.118
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298
...,...,...,...,...,...,...,...,...,...
151,152,Rwanda,3.334,0.359,0.711,0.614,0.555,0.217,0.411
152,153,Tanzania,3.231,0.476,0.885,0.499,0.417,0.276,0.147
153,154,Afghanistan,3.203,0.350,0.517,0.361,0.000,0.158,0.025
154,155,Central African Republic,3.083,0.026,0.000,0.105,0.225,0.235,0.035


In [25]:
# Seleccionamos las variables predictoras y la variable objetivo
X = df.drop(['Country or region', 'Overall rank' ,'Score'], axis=1)
y = df['Score']

# Definimos el modelo
reg = RandomForestRegressor()

# Definimos los parámetros
params = {
    'n_estimators': range(4, 16), # número de árboles
    'criterion': ['friedman_mse', 'squared_error', 'poisson', 'absolute_error'], # criterio de división
    'max_depth': range(2, 11) # profundidad máxima
}

# Creamos el objeto RandomizedSearchCV
random_search = RandomizedSearchCV(reg, params, n_iter=10, cv=3, scoring='neg_mean_squared_error')

# Entrenamos el modelo
random_search.fit(X, y)


# imprimimos los mejores estimadores, los mejores parámetros y la predicción
print(f'''
Best Estimator:
{random_search.best_estimator_}
{'-'*50}
Best Params: 
{random_search.best_params_}
{'-'*50}
Prediction:
{random_search.predict(X.loc[[0]])}
Real Value:
{y.loc[0]}''')



Best Estimator:
RandomForestRegressor(max_depth=4, n_estimators=15)
--------------------------------------------------
Best Params: 
{'n_estimators': 15, 'max_depth': 4, 'criterion': 'squared_error'}
--------------------------------------------------
Prediction:
[7.32166143]
Real Value:
7.769
