In [1]:
# Tratamiento de datos
# ==============================================================================
import pandas as pd
import numpy as np
from numpy import std

# Graficas
# ==============================================================================
import matplotlib.pyplot as plt
import seaborn as sns

# Algoritmos
# ==============================================================================
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

# Metricas de evaluación
# ==============================================================================
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Tiempo
# ==============================================================================
from time import time

# Entrenamiento
# ==============================================================================
from sklearn.model_selection import KFold 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Configuración warnings
# ==============================================================================
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
dataset_dir = 'Dataset_Final/Dataset_unido.csv' #Se carga el dataset
df = pd.read_csv(dataset_dir, sep=(',')) #Se lee el archivo csv el cual está separado por comas
df #Se imprime el dataset

Unnamed: 0,CodDepartamento,Ano,Trimestre,PorcentajeDesempleo,Temperatura,Precipitacion,PIB,retail_and_recreation,grocery_and_pharmacy,parks,transit_stations,workplaces,residential,PorcentajeVacunacion,Vulnerabilidad_numero,Incidencia
0,5,2020,1,12.9,24.149651,7.957837,46018.0,-12.489606,-8.493907,-11.392832,-9.280287,-5.809319,4.227240,0.00,2.0,0.000024
1,5,2020,2,25.2,22.286968,5.173339,46018.0,-67.037634,-44.896057,-55.349462,-62.886022,-50.243011,25.524731,0.00,2.0,0.000825
2,5,2020,3,20.0,22.739094,7.672982,46018.0,-50.040860,-28.913620,-39.594624,-51.555556,-34.839427,17.956272,0.00,2.0,0.026520
3,5,2020,4,15.2,23.157370,8.176094,46018.0,-28.945878,-3.579928,-27.370968,-28.567742,-19.315054,10.191756,0.00,2.0,0.033072
4,5,2021,1,17.9,22.285123,6.276447,52347.0,-32.214670,-0.670507,-31.100230,-34.711982,-20.892857,9.510369,0.66,2.0,0.024737
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,88,2020,4,25.3,25.658387,0.000000,1027.0,-57.937993,-29.239427,-60.999642,-41.625090,-32.145878,13.854480,0.00,1.0,0.014034
188,88,2021,1,16.0,26.667411,0.000000,1300.0,-36.820661,-11.887865,-48.219662,-15.671659,-21.868280,9.530722,0.51,1.0,0.005350
189,88,2021,2,16.0,28.111204,0.000000,1300.0,-23.432616,-3.455914,-47.115412,-0.963082,-16.767384,11.880645,13.53,1.0,0.061390
190,88,2021,3,11.9,23.849370,0.000000,1300.0,1.812545,11.936918,-38.234050,31.749821,-7.212903,6.475986,56.77,1.0,0.006413


In [3]:
#Se separan las columnas para la evaluación
y = df["Incidencia"] #Variable dependiente
X = df.iloc[:,:-1] #Variables independientes
X

Unnamed: 0,CodDepartamento,Ano,Trimestre,PorcentajeDesempleo,Temperatura,Precipitacion,PIB,retail_and_recreation,grocery_and_pharmacy,parks,transit_stations,workplaces,residential,PorcentajeVacunacion,Vulnerabilidad_numero
0,5,2020,1,12.9,24.149651,7.957837,46018.0,-12.489606,-8.493907,-11.392832,-9.280287,-5.809319,4.227240,0.00,2.0
1,5,2020,2,25.2,22.286968,5.173339,46018.0,-67.037634,-44.896057,-55.349462,-62.886022,-50.243011,25.524731,0.00,2.0
2,5,2020,3,20.0,22.739094,7.672982,46018.0,-50.040860,-28.913620,-39.594624,-51.555556,-34.839427,17.956272,0.00,2.0
3,5,2020,4,15.2,23.157370,8.176094,46018.0,-28.945878,-3.579928,-27.370968,-28.567742,-19.315054,10.191756,0.00,2.0
4,5,2021,1,17.9,22.285123,6.276447,52347.0,-32.214670,-0.670507,-31.100230,-34.711982,-20.892857,9.510369,0.66,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,88,2020,4,25.3,25.658387,0.000000,1027.0,-57.937993,-29.239427,-60.999642,-41.625090,-32.145878,13.854480,0.00,1.0
188,88,2021,1,16.0,26.667411,0.000000,1300.0,-36.820661,-11.887865,-48.219662,-15.671659,-21.868280,9.530722,0.51,1.0
189,88,2021,2,16.0,28.111204,0.000000,1300.0,-23.432616,-3.455914,-47.115412,-0.963082,-16.767384,11.880645,13.53,1.0
190,88,2021,3,11.9,23.849370,0.000000,1300.0,1.812545,11.936918,-38.234050,31.749821,-7.212903,6.475986,56.77,1.0


In [4]:
# División de los datos en train 70% y test 30%
# ==============================================================================
X_train, X_test, Y_train, Y_test = train_test_split(
                                        df.drop(columns = "Incidencia"),
                                        df['Incidencia'],
                                        test_size = 0.3 ,
                                        random_state = 117
                                    )


In [5]:
#Vector que contiene los diferentes algorítmos
#Se establece un valor de random_state igual para aquellos que contienen este parámetro
regressors = [
    LinearRegression(),
    DecisionTreeRegressor(max_depth=7,random_state=329),
    KNeighborsRegressor(),
    SVR(),
    RandomForestRegressor(random_state=329),
    GradientBoostingRegressor(random_state=329),
    ExtraTreesRegressor(random_state=329),
    AdaBoostRegressor(random_state=329)
]

In [6]:
head = 8 #Cantidad de algorítmos

#Se crea un ciclo for para evaluar cada algoritmo
for model in regressors[:head]:
    start = time() #Inicialización de la variable tiempo
    model.fit(X_train, Y_train) #Entrenamiento
    train_time = time() - start #Tiempo de entrenamiento
    start = time() #Inicialización de la variable tiempo
    y_pred = model.predict(X_test) #Predicción de la variable dependiente
    
    cv = KFold(n_splits=5) #Metodo de validación cruzada

    #Evaluación del modelo segun el parámetro R^2
    scores = cross_val_score(model, X_train, Y_train, scoring='r2', cv=cv, n_jobs=-1)
    #Evaluación del modelo segun el parámetro RMSE
    scores_r = cross_val_score(model, X_train, Y_train, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1)

    predict_time = time()-start #Tiempo de predicción 
    print(model) #Se imprime el nombre del modelo
    print("\tTraining time: %0.3fs" % train_time) #Se imprime el tiempo de entrenamiento
    print("\tPrediction time: %0.3fs" % predict_time) #Se imprime el tiempo de predicción
    print("\tRMSE:", np.sqrt(mean_squared_error(Y_test, y_pred))) #Se imprime el resultado de RMSE
    print("\tR2 score:", r2_score(Y_test, y_pred)) #Se imprime el resultado de R^2
    print("\tRMSE cv:", np.mean(-1*scores_r)) #Se imprime el resultado de RMSE con validación cruzada
    print("\tR2 score cv:", np.mean(scores)) #Se imprime el resultado de R^2 con validación cruzada
    print()

LinearRegression()
	Training time: 0.088s
	Prediction time: 4.585s
	RMSE: 0.013631036285613305
	R2 score: 0.3197823865579503
	RMSE cv: 0.014300584321755138
	R2 score cv: 0.21841700728546337

DecisionTreeRegressor(max_depth=7, random_state=329)
	Training time: 0.009s
	Prediction time: 0.196s
	RMSE: 0.012042390381554295
	R2 score: 0.46909657265987215
	RMSE cv: 0.01174975482202759
	R2 score cv: 0.45071486695434865

KNeighborsRegressor()
	Training time: 0.005s
	Prediction time: 0.062s
	RMSE: 0.01909837924162562
	R2 score: -0.33531322872710745
	RMSE cv: 0.017598878855123606
	R2 score cv: -0.18521806001308308

SVR()
	Training time: 0.003s
	Prediction time: 0.046s
	RMSE: 0.030904503737867302
	R2 score: -2.496501136639113
	RMSE cv: 0.031303076940417715
	R2 score cv: -3.0876696039395606

RandomForestRegressor(random_state=329)
	Training time: 0.177s
	Prediction time: 0.707s
	RMSE: 0.008733419340594899
	R2 score: 0.720772136929016
	RMSE cv: 0.009825651920312125
	R2 score cv: 0.6226597652481745



In [7]:
# División de los datos en train 80% y test 20%
# ==============================================================================
X_train, X_test, Y_train, Y_test = train_test_split(
                                        df.drop(columns = "Incidencia"),
                                        df['Incidencia'],
                                        test_size = 0.2 ,
                                        random_state = 117
                                    )

In [8]:

#Se crea un ciclo for para evaluar cada algoritmo
for model in regressors[:head]:
    start = time() #Inicialización de la variable tiempo
    model.fit(X_train, Y_train) #Entrenamiento
    train_time = time() - start #Tiempo de entrenamiento
    start = time() #Inicialización de la variable tiempo
    y_pred = model.predict(X_test) #Predicción de la variable dependiente
    
    cv = KFold(n_splits=5) #Metodo de validación cruzada

    #Evaluación del modelo segun el parámetro R^2
    scores = cross_val_score(model, X_train, Y_train, scoring='r2', cv=cv, n_jobs=-1)
    #Evaluación del modelo segun el parámetro RMSE
    scores_r = cross_val_score(model, X_train, Y_train, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1)

    predict_time = time()-start #Tiempo de predicción 
    print(model) #Se imprime el nombre del modelo
    print("\tTraining time: %0.3fs" % train_time) #Se imprime el tiempo de entrenamiento
    print("\tPrediction time: %0.3fs" % predict_time) #Se imprime el tiempo de predicción
    print("\tRMSE:", np.sqrt(mean_squared_error(Y_test, y_pred))) #Se imprime el resultado de RMSE
    print("\tR2 score:", r2_score(Y_test, y_pred)) #Se imprime el resultado de R^2
    print("\tRMSE cv:", np.mean(-1*scores_r)) #Se imprime el resultado de RMSE con validación cruzada
    print("\tR2 score cv:", np.mean(scores)) #Se imprime el resultado de R^2 con validación cruzada
    print()

LinearRegression()
	Training time: 0.006s
	Prediction time: 0.050s
	RMSE: 0.013866120614674165
	R2 score: 0.3582447293871337
	RMSE cv: 0.013994777809401313
	R2 score cv: 0.2456490500613726

DecisionTreeRegressor(max_depth=7, random_state=329)
	Training time: 0.003s
	Prediction time: 0.050s
	RMSE: 0.01078957648790785
	R2 score: 0.6114308716255079
	RMSE cv: 0.014407183325880927
	R2 score cv: 0.1667414855741706

KNeighborsRegressor()
	Training time: 0.005s
	Prediction time: 0.052s
	RMSE: 0.019019737215518116
	R2 score: -0.2074482362169363
	RMSE cv: 0.01831491375471123
	R2 score cv: -0.2819405149183184

SVR()
	Training time: 0.003s
	Prediction time: 0.057s
	RMSE: 0.03328518820718469
	R2 score: -2.697954224219229
	RMSE cv: 0.030932950732732445
	R2 score cv: -2.707580375328355

RandomForestRegressor(random_state=329)
	Training time: 0.201s
	Prediction time: 0.601s
	RMSE: 0.007915229617231359
	R2 score: 0.7908843514969324
	RMSE cv: 0.01016205203692043
	R2 score cv: 0.6061352657787028

Gradien

# Prueba DANE

In [9]:
#Se crea un dataframe con las columnas relevantes para el dataset del DANE
dataset = df.loc[:,['CodDepartamento','Ano','Trimestre','Vulnerabilidad_numero','Incidencia']]
dataset

Unnamed: 0,CodDepartamento,Ano,Trimestre,Vulnerabilidad_numero,Incidencia
0,5,2020,1,2.0,0.000024
1,5,2020,2,2.0,0.000825
2,5,2020,3,2.0,0.026520
3,5,2020,4,2.0,0.033072
4,5,2021,1,2.0,0.024737
...,...,...,...,...,...
187,88,2020,4,1.0,0.014034
188,88,2021,1,1.0,0.005350
189,88,2021,2,1.0,0.061390
190,88,2021,3,1.0,0.006413


In [10]:
#Se separan las columnas para la evaluación
y = dataset["Incidencia"] #Variable dependiente
X = dataset.iloc[:,:-1] #Variables independientes
X

Unnamed: 0,CodDepartamento,Ano,Trimestre,Vulnerabilidad_numero
0,5,2020,1,2.0
1,5,2020,2,2.0
2,5,2020,3,2.0
3,5,2020,4,2.0
4,5,2021,1,2.0
...,...,...,...,...
187,88,2020,4,1.0
188,88,2021,1,1.0
189,88,2021,2,1.0
190,88,2021,3,1.0


In [11]:
# División de los datos en train 80% y test 20%
# ==============================================================================
X_train, X_test, Y_train, Y_test = train_test_split(X, y,
                                                    test_size = 0.2,
                                                    random_state = 117)

In [12]:
#Se crea un ciclo for para evaluar cada algoritmo
for model in regressors[:head]:
    start = time() #Inicialización de la variable tiempo
    model.fit(X_train, Y_train) #Entrenamiento
    train_time = time() - start #Tiempo de entrenamiento
    start = time() #Inicialización de la variable tiempo
    y_pred = model.predict(X_test) #Predicción de la variable dependiente
    
    cv = KFold(n_splits=5) #Metodo de validación cruzada

    #Evaluación del modelo segun el parámetro R^2
    scores = cross_val_score(model, X_train, Y_train, scoring='r2', cv=cv, n_jobs=-1)
    #Evaluación del modelo segun el parámetro RMSE
    scores_r = cross_val_score(model, X_train, Y_train, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1)

    predict_time = time()-start #Tiempo de predicción 
    print(model) #Se imprime el nombre del modelo
    print("\tTraining time: %0.3fs" % train_time) #Se imprime el tiempo de entrenamiento
    print("\tPrediction time: %0.3fs" % predict_time) #Se imprime el tiempo de predicción
    print("\tRMSE:", np.sqrt(mean_squared_error(Y_test, y_pred))) #Se imprime el resultado de RMSE
    print("\tR2 score:", r2_score(Y_test, y_pred)) #Se imprime el resultado de R^2
    print("\tRMSE cv:", np.mean(-1*scores_r)) #Se imprime el resultado de RMSE con validación cruzada
    print("\tR2 score cv:", np.mean(scores)) #Se imprime el resultado de R^2 con validación cruzada
    print()

LinearRegression()
	Training time: 0.005s
	Prediction time: 0.046s
	RMSE: 0.016508335348110314
	R2 score: 0.09036730481071598
	RMSE cv: 0.016161611873050884
	R2 score cv: 0.0132045928382859

DecisionTreeRegressor(max_depth=7, random_state=329)
	Training time: 0.004s
	Prediction time: 0.046s
	RMSE: 0.012028746431194103
	R2 score: 0.5170521501771455
	RMSE cv: 0.013482880504951814
	R2 score cv: 0.24031147526214633

KNeighborsRegressor()
	Training time: 0.003s
	Prediction time: 0.050s
	RMSE: 0.019642517666760933
	R2 score: -0.28781595633420465
	RMSE cv: 0.018145464233140063
	R2 score cv: -0.2562989613217138

SVR()
	Training time: 0.003s
	Prediction time: 0.045s
	RMSE: 0.03328518820718469
	R2 score: -2.697954224219229
	RMSE cv: 0.030932950732732445
	R2 score cv: -2.707580375328355

RandomForestRegressor(random_state=329)
	Training time: 0.158s
	Prediction time: 0.511s
	RMSE: 0.010823434475625539
	R2 score: 0.6089883640812234
	RMSE cv: 0.010779590232006715
	R2 score cv: 0.5605160421927209

G

In [13]:
# División de los datos en train 70% y test 20%
# ==============================================================================
X_train, X_test, Y_train, Y_test = train_test_split(X, y,
                                                    test_size = 0.3,
                                                    random_state = 117)

In [14]:
#Se crea un ciclo for para evaluar cada algoritmo
for model in regressors[:head]:
    start = time() #Inicialización de la variable tiempo
    model.fit(X_train, Y_train) #Entrenamiento
    train_time = time() - start #Tiempo de entrenamiento
    start = time() #Inicialización de la variable tiempo
    y_pred = model.predict(X_test) #Predicción de la variable dependiente
    
    cv = KFold(n_splits=5) #Metodo de validación cruzada

    #Evaluación del modelo segun el parámetro R^2
    scores = cross_val_score(model, X_train, Y_train, scoring='r2', cv=cv, n_jobs=-1)
    #Evaluación del modelo segun el parámetro RMSE
    scores_r = cross_val_score(model, X_train, Y_train, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1)

    predict_time = time()-start #Tiempo de predicción 
    print(model) #Se imprime el nombre del modelo
    print("\tTraining time: %0.3fs" % train_time) #Se imprime el tiempo de entrenamiento
    print("\tPrediction time: %0.3fs" % predict_time) #Se imprime el tiempo de predicción
    print("\tRMSE:", np.sqrt(mean_squared_error(Y_test, y_pred))) #Se imprime el resultado de RMSE
    print("\tR2 score:", r2_score(Y_test, y_pred)) #Se imprime el resultado de R^2
    print("\tRMSE cv:", np.mean(-1*scores_r)) #Se imprime el resultado de RMSE con validación cruzada
    print("\tR2 score cv:", np.mean(scores)) #Se imprime el resultado de R^2 con validación cruzada
    print()

LinearRegression()
	Training time: 0.004s
	Prediction time: 0.047s
	RMSE: 0.01625213009680334
	R2 score: 0.033035108941913305
	RMSE cv: 0.016081121367950823
	R2 score cv: 0.04111941694703296

DecisionTreeRegressor(max_depth=7, random_state=329)
	Training time: 0.004s
	Prediction time: 0.046s
	RMSE: 0.011979489426997234
	R2 score: 0.4746282183018782
	RMSE cv: 0.011927627934692342
	R2 score cv: 0.40450864618015264

KNeighborsRegressor()
	Training time: 0.002s
	Prediction time: 0.047s
	RMSE: 0.01825917166063499
	R2 score: -0.2205407078049726
	RMSE cv: 0.017636664446070777
	R2 score cv: -0.17305704505181926

SVR()
	Training time: 0.003s
	Prediction time: 0.044s
	RMSE: 0.030904503737867302
	R2 score: -2.496501136639113
	RMSE cv: 0.031303076940417715
	R2 score cv: -3.0876696039395606

RandomForestRegressor(random_state=329)
	Training time: 0.150s
	Prediction time: 0.483s
	RMSE: 0.01050489128955867
	R2 score: 0.5960075485811183
	RMSE cv: 0.009897926165008597
	R2 score cv: 0.622134277134781

G