In [1]:
# Tratamiento de datos
# ==============================================================================
import numpy as np
import pandas as pd

# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt

# Preprocesado y modelado
# ==============================================================================
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.inspection import permutation_importance
import multiprocessing

# Configuración warnings
# ==============================================================================
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Location of the merged dataset (comma-separated CSV).
dataset_dir = 'Dataset_Final/Dataset_unido.csv'
# Read the whole file into a DataFrame.
datos = pd.read_csv(dataset_dir, sep=',')
# Bare last expression so the notebook renders the loaded frame.
datos

Unnamed: 0,CodDepartamento,Ano,Trimestre,PorcentajeDesempleo,Temperatura,Precipitacion,PIB,retail_and_recreation,grocery_and_pharmacy,parks,transit_stations,workplaces,residential,PorcentajeVacunacion,Vulnerabilidad_numero,Incidencia
0,5,2020,1,12.9,24.149651,7.957837,46018.0,-12.489606,-8.493907,-11.392832,-9.280287,-5.809319,4.227240,0.00,2.0,0.000024
1,5,2020,2,25.2,22.286968,5.173339,46018.0,-67.037634,-44.896057,-55.349462,-62.886022,-50.243011,25.524731,0.00,2.0,0.000825
2,5,2020,3,20.0,22.739094,7.672982,46018.0,-50.040860,-28.913620,-39.594624,-51.555556,-34.839427,17.956272,0.00,2.0,0.026520
3,5,2020,4,15.2,23.157370,8.176094,46018.0,-28.945878,-3.579928,-27.370968,-28.567742,-19.315054,10.191756,0.00,2.0,0.033072
4,5,2021,1,17.9,22.285123,6.276447,52347.0,-32.214670,-0.670507,-31.100230,-34.711982,-20.892857,9.510369,0.66,2.0,0.024737
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,88,2020,4,25.3,25.658387,0.000000,1027.0,-57.937993,-29.239427,-60.999642,-41.625090,-32.145878,13.854480,0.00,1.0,0.014034
188,88,2021,1,16.0,26.667411,0.000000,1300.0,-36.820661,-11.887865,-48.219662,-15.671659,-21.868280,9.530722,0.51,1.0,0.005350
189,88,2021,2,16.0,28.111204,0.000000,1300.0,-23.432616,-3.455914,-47.115412,-0.963082,-16.767384,11.880645,13.53,1.0,0.061390
190,88,2021,3,11.9,23.849370,0.000000,1300.0,1.812545,11.936918,-38.234050,31.749821,-7.212903,6.475986,56.77,1.0,0.006413


In [3]:
# Target: the "Incidencia" column.
y = datos["Incidencia"]
# Predictors: every other column. The target is dropped by name rather than by
# position, so the split stays correct even if the column order of the CSV
# changes (and it matches how the feature-importance cell rebuilds the names).
X = datos.drop(columns="Incidencia")
# Bare last expression so the notebook renders the predictor frame.
X

Unnamed: 0,CodDepartamento,Ano,Trimestre,PorcentajeDesempleo,Temperatura,Precipitacion,PIB,retail_and_recreation,grocery_and_pharmacy,parks,transit_stations,workplaces,residential,PorcentajeVacunacion,Vulnerabilidad_numero
0,5,2020,1,12.9,24.149651,7.957837,46018.0,-12.489606,-8.493907,-11.392832,-9.280287,-5.809319,4.227240,0.00,2.0
1,5,2020,2,25.2,22.286968,5.173339,46018.0,-67.037634,-44.896057,-55.349462,-62.886022,-50.243011,25.524731,0.00,2.0
2,5,2020,3,20.0,22.739094,7.672982,46018.0,-50.040860,-28.913620,-39.594624,-51.555556,-34.839427,17.956272,0.00,2.0
3,5,2020,4,15.2,23.157370,8.176094,46018.0,-28.945878,-3.579928,-27.370968,-28.567742,-19.315054,10.191756,0.00,2.0
4,5,2021,1,17.9,22.285123,6.276447,52347.0,-32.214670,-0.670507,-31.100230,-34.711982,-20.892857,9.510369,0.66,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,88,2020,4,25.3,25.658387,0.000000,1027.0,-57.937993,-29.239427,-60.999642,-41.625090,-32.145878,13.854480,0.00,1.0
188,88,2021,1,16.0,26.667411,0.000000,1300.0,-36.820661,-11.887865,-48.219662,-15.671659,-21.868280,9.530722,0.51,1.0
189,88,2021,2,16.0,28.111204,0.000000,1300.0,-23.432616,-3.455914,-47.115412,-0.963082,-16.767384,11.880645,13.53,1.0
190,88,2021,3,11.9,23.849370,0.000000,1300.0,1.812545,11.936918,-38.234050,31.749821,-7.212903,6.475986,56.77,1.0


In [4]:
# Train / test split: 20% of the rows are held out, with a fixed seed
# ==============================================================================
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=117,
)


In [5]:
# Baseline model: AdaBoost with no base estimator passed, seeded for repeatability
# ==============================================================================
modelo = AdaBoostRegressor(random_state=329)

# Fit on the training split
# ==============================================================================
modelo.fit(X_train, y_train)

In [6]:
# Score the baseline model on the held-out test split.
predicciones = modelo.predict(X = X_test)

# RMSE computed as sqrt(MSE). The `squared=False` shortcut of
# mean_squared_error is deprecated in recent scikit-learn releases and later
# removed; taking the square root explicitly yields the same number on every
# version.
rmse = np.sqrt(
        mean_squared_error(
            y_true = y_test,
            y_pred = predicciones
        )
       )
print(f"El error (rmse) de test es: {rmse}")
print("R2: ",r2_score(y_test,predicciones))

El error (rmse) de test es: 0.008446283651972137
R2:  0.7618827701738508


In [7]:
# AdaBoost over an explicitly configured depth-4 regression tree.
# The tree is passed positionally on purpose: the keyword for the base learner
# differs between scikit-learn versions, the position does not.
modelo_1 = AdaBoostRegressor(
        DecisionTreeRegressor(criterion='absolute_error',
                               max_depth=4,
                               # `max_features='auto'` is deprecated and later
                               # removed in scikit-learn; for a regression tree
                               # it meant "consider all features", which is
                               # what None selects.
                               max_features=None,
                               random_state=329,
                               ccp_alpha=7.179487e-06),

            n_estimators=300,random_state=329)
# Fit on the training split; the fitted estimator is the cell's output.
modelo_1.fit(X_train, y_train)

In [8]:
# Score the hand-configured AdaBoost model (modelo_1) on the held-out test split.
predicciones = modelo_1.predict(X = X_test)

# RMSE computed as sqrt(MSE). The `squared=False` shortcut of
# mean_squared_error is deprecated in recent scikit-learn releases and later
# removed; taking the square root explicitly yields the same number on every
# version.
rmse = np.sqrt(
        mean_squared_error(
            y_true = y_test,
            y_pred = predicciones
        )
       )
print(f"El error (rmse) de test es: {rmse}")
print("R2: ",r2_score(y_test,predicciones))

El error (rmse) de test es: 0.007999179743364416
R2:  0.786425003397119


In [9]:
# Search space for the grid search: n_estimators from 220 up to (not including) 250.
parameters = {"n_estimators": range(220, 250)}

In [10]:
# Grid search over `parameters` for an AdaBoost ensemble built on the same
# depth-4 tree configuration used for modelo_1. No cv/scoring arguments are
# passed, so the library defaults apply.
grid = GridSearchCV(
    AdaBoostRegressor(
        DecisionTreeRegressor(
            criterion='absolute_error',
            max_depth=4,
            # `max_features='auto'` is deprecated and later removed in
            # scikit-learn; for a regression tree it meant "consider all
            # features", which is what None selects.
            max_features=None,
            random_state=329,
            ccp_alpha=7.179487e-06,
        ),
        random_state=329,
    ),
    parameters,
)
# The value returned by fit() is kept as `model`; later cells read the search
# results through both `model` and `grid`.
model = grid.fit(X_train,y_train)
print(model.best_params_,'\n')
print(model.best_estimator_,'\n')

{'n_estimators': 234} 

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(ccp_alpha=7.179487e-06,
                                                       criterion='absolute_error',
                                                       max_depth=4,
                                                       max_features='auto',
                                                       random_state=329),
                  n_estimators=234, random_state=329) 



In [11]:
# Grid-search results: four best candidates by mean test score
# ==============================================================================
resultados = pd.DataFrame(model.cv_results_)
(
    resultados
    .filter(regex='(param.*|mean_t|std_t)')   # parameter columns + mean/std test score
    .drop(columns='params')                   # the raw params dict duplicates param_* columns
    .sort_values('mean_test_score', ascending=False)
    .head(4)
)

Unnamed: 0,param_n_estimators,mean_test_score,std_test_score
14,234,0.648693,0.02857
15,235,0.648041,0.029455
20,240,0.647667,0.025342
12,232,0.647617,0.026344


In [12]:
# Keep a handle on the best estimator found by the grid search.
model_n = model.best_estimator_
# Bare last expression so the notebook renders the estimator's configuration.
model_n

In [13]:
# Fit the selected estimator on the training split and score it on the test split.
# NOTE(review): this rebinds `model`, which the earlier cells used to read
# `cv_results_` / `best_estimator_`; re-running those cells after this one
# presumably requires re-running the grid search first. Confirm this is intended.
model = model_n.fit(X_train,y_train)
predicciones = model_n.predict(X = X_test)

# RMSE computed as sqrt(MSE). The `squared=False` shortcut of
# mean_squared_error is deprecated in recent scikit-learn releases and later
# removed; taking the square root explicitly yields the same number on every
# version.
rmse = np.sqrt(
        mean_squared_error(
            y_true = y_test,
            y_pred = predicciones
        )
       )

# Plain string: the original f-string had no placeholders, so the printed text is unchanged.
print("El error (rmse) de test es: ", rmse)
print("R2: ",r2_score(y_test,predicciones))

El error (rmse) de test es:  0.007785043998121836
R2:  0.7977066344982191


In [14]:
# Cross-validation splitter; n_splits sets how many folds are used.
cv = KFold(n_splits=5)

# R2 of the selected estimator on each fold of the training split,
# with n_jobs=-1 passed through to cross_val_score.
scores = cross_val_score(
    model_n,
    X_train,
    y_train,
    scoring='r2',
    cv=cv,
    n_jobs=-1,
)

# Report the average R2 across folds.
print("R2 cv:", scores.mean())

R2 cv: 0.6486925044029578


In [15]:
# Full grid-search results, indexed by rank and ordered by that index.
df_1 = (
    pd.DataFrame(grid.cv_results_)
    .set_index('rank_test_score')
    .sort_index()
)
# Show the three top-ranked candidates.
df_1.head(3)

Unnamed: 0_level_0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,1.546009,0.073016,0.037006,0.004696,234,{'n_estimators': 234},0.685278,0.674894,0.649971,0.617756,0.615563,0.648693,0.02857
2,1.733561,0.188981,0.047117,0.022509,235,{'n_estimators': 235},0.685285,0.674869,0.650337,0.618289,0.611424,0.648041,0.029455
3,1.915505,0.102405,0.043605,0.009391,240,{'n_estimators': 240},0.680222,0.668752,0.651622,0.622888,0.61485,0.647667,0.025342


In [16]:
# Pair each predictor name with the importance reported by modelo_1.
# NOTE(review): importances are read from modelo_1, not from the grid-search
# selected model_n -- confirm that this is the model meant to be explained.
importancia_predictores = pd.DataFrame(
    dict(
        predictor=datos.drop(columns="Incidencia").columns,
        importancia=modelo_1.feature_importances_,
    )
)
print("Importancia de los predictores en el modelo")
print("-------------------------------------------")
# Most important predictors first; bare expression so the notebook renders the table.
importancia_predictores.sort_values(by='importancia', ascending=False)

Importancia de los predictores en el modelo
-------------------------------------------


Unnamed: 0,predictor,importancia
13,PorcentajeVacunacion,0.196731
12,residential,0.186605
7,retail_and_recreation,0.10375
11,workplaces,0.080952
9,parks,0.079101
8,grocery_and_pharmacy,0.06765
5,Precipitacion,0.05969
3,PorcentajeDesempleo,0.052787
4,Temperatura,0.048848
2,Trimestre,0.031923
