In [58]:
import pandas as pd
import numpy as np
from pycaret.regression import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse
import joblib 

In [59]:
# Read the train, test and evaluation splits (UTF-8 encoded CSV files) in one pass.
DfTrain, DfTest, Dfevalua = (
    pd.read_csv(csv_name, encoding='utf-8')
    for csv_name in ('Atropello_train.csv', 'Atropello_test.csv', 'Atropello_evalua.csv')
)

In [60]:
# Preview the first rows of the test split as loaded, before any feature engineering.
DfTest.head()

Unnamed: 0,FECHA_ACCIDENTE,Total
0,2018-01-01,8
1,2018-01-02,6
2,2018-01-03,14
3,2018-01-04,8
4,2018-01-05,10


In [61]:
# Load the semicolon-separated file whose 'Fecha' column is later used to build the `festivo` flag.
df_festivos = pd.read_csv('festivosm.csv', encoding = 'utf-8', sep =';')

In [62]:
# Parse 'Fecha' into datetimes using a month/day/year layout.
# NOTE(review): assumes festivosm.csv stores dates as MM/DD/YYYY — confirm against the file.
df_festivos['Fecha'] = pd.to_datetime(df_festivos['Fecha'], format="%m/%d/%Y")

In [63]:
# Parse the accident date of every split into datetimes.
# The raw column holds hyphen-separated ISO dates (e.g. "2018-01-01", as shown by the
# preview of the test split), so the format must use '-' rather than '/'. Older pandas
# silently accepted the mismatched "%Y/%m/%d"; strict format matching in pandas 2.x
# rejects it.
# NOTE(review): only the test split is previewed before parsing — confirm the train and
# evaluation files use the same YYYY-MM-DD layout.
DfTrain['FECHA_ACCIDENTE'] = pd.to_datetime(DfTrain['FECHA_ACCIDENTE'], format="%Y-%m-%d")
DfTest['FECHA_ACCIDENTE'] = pd.to_datetime(DfTest['FECHA_ACCIDENTE'], format="%Y-%m-%d")
Dfevalua['FECHA_ACCIDENTE'] = pd.to_datetime(Dfevalua['FECHA_ACCIDENTE'], format="%Y-%m-%d")

In [64]:
# Flag each row with 1 when its accident date appears in the holiday table, else 0.
# `isin` performs the membership test in one vectorized pass instead of rebuilding
# `df_festivos['Fecha'].unique()` and scanning it once per row.
DfTrain['festivo'] = DfTrain['FECHA_ACCIDENTE'].isin(df_festivos['Fecha']).astype(int)
DfTest['festivo'] = DfTest['FECHA_ACCIDENTE'].isin(df_festivos['Fecha']).astype(int)
Dfevalua['festivo'] = Dfevalua['FECHA_ACCIDENTE'].isin(df_festivos['Fecha']).astype(int)

In [65]:
# Calendar features for the training split, derived from the parsed accident date.
DfTrain['Year'] = DfTrain['FECHA_ACCIDENTE'].dt.year
DfTrain['Month'] = DfTrain['FECHA_ACCIDENTE'].dt.month
DfTrain['DayMo'] = DfTrain['FECHA_ACCIDENTE'].dt.day
# day_name() returns English weekday names regardless of the process locale, whereas
# strftime('%A') follows the locale; `quincena` and the dummy columns rely on the
# English names ('Monday', ...).
DfTrain['Dayw'] = DfTrain['FECHA_ACCIDENTE'].dt.day_name()

In [66]:
# Calendar features for the test split, derived from the parsed accident date.
DfTest['Year'] = DfTest['FECHA_ACCIDENTE'].dt.year
DfTest['Month'] = DfTest['FECHA_ACCIDENTE'].dt.month
DfTest['DayMo'] = DfTest['FECHA_ACCIDENTE'].dt.day
# day_name() returns English weekday names regardless of the process locale, whereas
# strftime('%A') follows the locale; `quincena` and the dummy columns rely on the
# English names ('Monday', ...).
DfTest['Dayw'] = DfTest['FECHA_ACCIDENTE'].dt.day_name()

In [67]:
# Calendar features for the evaluation split, derived from the parsed accident date.
Dfevalua['Year'] = Dfevalua['FECHA_ACCIDENTE'].dt.year
Dfevalua['Month'] = Dfevalua['FECHA_ACCIDENTE'].dt.month
Dfevalua['DayMo'] = Dfevalua['FECHA_ACCIDENTE'].dt.day
# day_name() returns English weekday names regardless of the process locale, whereas
# strftime('%A') follows the locale; `quincena` and the dummy columns rely on the
# English names ('Monday', ...).
Dfevalua['Dayw'] = Dfevalua['FECHA_ACCIDENTE'].dt.day_name()

In [68]:
# Preview the test split again to check the newly added holiday and calendar columns.
DfTest.head()

Unnamed: 0,FECHA_ACCIDENTE,Total,festivo,Year,Month,DayMo,Dayw
0,2018-01-01,8,1,2018,1,1,Monday
1,2018-01-02,6,0,2018,1,2,Tuesday
2,2018-01-03,14,0,2018,1,3,Wednesday
3,2018-01-04,8,0,2018,1,4,Thursday
4,2018-01-05,10,0,2018,1,5,Friday


In [69]:
def quincena(f):
    """Build a 0/1 flag per row for mid-month / end-of-month weekdays.

    A row is flagged with 1 when its day of month is 15, 30 or 31 and its weekday
    name is Monday through Friday; every other row gets 0.
    (Presumably these mark fortnightly paydays — confirm with the data owner.)

    Parameters
    ----------
    f : mapping of column name -> iterable (e.g. a DataFrame)
        Must provide 'DayMo' (day of month) and 'Dayw' (English weekday name).

    Returns
    -------
    list of int
        One 0/1 entry per row, in row order.
    """
    flagged_days = (15, 30, 31)
    working_days = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday')
    return [
        int(day in flagged_days and weekday in working_days)
        for day, weekday in zip(f['DayMo'], f['Dayw'])
    ]

In [70]:
# Attach the 0/1 `quincena` flag to each of the three splits.
DfTrain['Quincena'], DfTest['Quincena'], Dfevalua['Quincena'] = map(
    quincena, (DfTrain, DfTest, Dfevalua)
)

In [71]:
# Columns kept for modelling: the engineered features plus the target ('Total').
variab = [
    'DayMo',     # day of month
    'Month',
    'Year',
    'festivo',   # 1 when the date is in the holiday table
    'Dayw',      # weekday name
    'Total',     # target
    'Quincena',  # 0/1 flag produced by quincena()
]

In [72]:
# Restrict every split to the modelling columns, in the order given by `variab`.
DfTrain, DfTest, Dfevalua = (
    split[variab] for split in (DfTrain, DfTest, Dfevalua)
)

In [73]:
# Preview the evaluation split after column selection.
Dfevalua.head()

Unnamed: 0,DayMo,Month,Year,festivo,Dayw,Total,Quincena
0,1,1,2020,1,Wednesday,12,0
1,2,1,2020,0,Thursday,7,0
2,3,1,2020,0,Friday,12,0
3,4,1,2020,0,Saturday,5,0
4,5,1,2020,0,Sunday,10,0


## One-hot encoding y modelo

In [74]:
# One-hot encode the non-numeric columns (the weekday name) of every split.
DfTrain = pd.get_dummies(DfTrain)
# Encoding each split independently only matches by accident: a weekday missing from a
# split would drop (or reorder) a column and misalign the features at predict time.
# Reindex test/evaluation onto the training columns, filling absent dummies with 0.
DfTest = pd.get_dummies(DfTest).reindex(columns=DfTrain.columns, fill_value=0)
Dfevalua = pd.get_dummies(Dfevalua).reindex(columns=DfTrain.columns, fill_value=0)

In [75]:
# Names of every training column except the target; passed to pycaret as numeric features.
num = DfTrain.columns.tolist()
num.remove('Total')

In [76]:
# Initialise the pycaret regression experiment on the training frame:
#   - 'Total' is the target,
#   - every other column (listed in `num`) is declared numeric,
#   - features are normalised (normalize=True),
#   - session_id fixes the random seed so the run is reproducible.
# Per the recorded output, pycaret keeps 892 of the 1275 rows as its own training set.
exp_reg101 = setup(data = DfTrain, target = 'Total',
                   numeric_features = num , normalize = True, session_id = 8301)

Unnamed: 0,Description,Value
0,session_id,8301
1,Target,Total
2,Original Data,"(1275, 13)"
3,Missing Values,False
4,Numeric Features,12
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(892, 12)"


In [77]:
# Benchmark pycaret's regressors; the recorded output is a metrics table (MAE, MSE, RMSE, R2, ...)
# with one row per model, followed by the repr of the top-ranked estimator.
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
br,Bayesian Ridge,2.932,13.3499,3.6487,0.0894,0.3285,0.3351,0.016
ridge,Ridge Regression,2.9282,13.3577,3.6499,0.0881,0.3283,0.3339,0.016
lar,Least Angle Regression,2.9282,13.3581,3.65,0.0881,0.3283,0.3338,0.025
huber,Huber Regressor,2.9281,13.3795,3.6522,0.0873,0.3272,0.3306,0.015
lr,Linear Regression,2.9364,13.3946,3.655,0.0855,0.3286,0.3346,1.707
gbr,Gradient Boosting Regressor,2.9628,13.88,3.7186,0.0523,0.3362,0.3352,0.059
ada,AdaBoost Regressor,3.0388,13.9861,3.7346,0.0465,0.3403,0.3592,0.035
omp,Orthogonal Matching Pursuit,3.0181,14.0362,3.7406,0.0433,0.3368,0.3477,0.014
en,Elastic Net,3.0587,14.3126,3.7765,0.0276,0.3402,0.3534,0.022
lasso,Lasso Regression,3.1207,14.8885,3.8509,-0.0104,0.3459,0.3602,0.023


BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, alpha_init=None,
              compute_score=False, copy_X=True, fit_intercept=True,
              lambda_1=1e-06, lambda_2=1e-06, lambda_init=None, n_iter=300,
              normalize=False, tol=0.001, verbose=False)

In [78]:
# Train a random-forest regressor ('rf'); the recorded output lists metrics for folds 0-9.
rfj = create_model('rf')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2.9662,13.9645,3.7369,0.0453,0.3097,0.2954
1,2.9721,12.3998,3.5213,0.0582,0.3298,0.3456
2,3.1524,15.0915,3.8848,-0.0753,0.3375,0.3471
3,3.2102,15.4655,3.9326,-0.1153,0.3745,0.4046
4,2.8027,13.1933,3.6323,-0.1652,0.3355,0.324
5,2.6926,12.1319,3.4831,0.174,0.3261,0.3211
6,3.1181,16.7058,4.0873,0.0148,0.3678,0.392
7,3.2399,16.7254,4.0897,0.0215,0.3619,0.3579
8,3.1703,16.5002,4.062,0.0555,0.322,0.302
9,3.3063,16.5939,4.0736,-0.1735,0.3695,0.3663


In [79]:
# Tune the random forest's hyper-parameters with pycaret; per-fold metrics are shown in the output.
tuned_rfj = tune_model(rfj)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2.859,12.7814,3.5751,0.1262,0.3047,0.3005
1,2.7754,10.9981,3.3163,0.1646,0.3202,0.3352
2,2.9843,13.5652,3.6831,0.0335,0.321,0.3231
3,2.9925,13.3202,3.6497,0.0394,0.3564,0.3845
4,2.7748,11.9262,3.4534,-0.0533,0.325,0.3307
5,2.7978,12.4685,3.5311,0.1511,0.3217,0.3197
6,2.9844,15.5164,3.9391,0.0849,0.3549,0.3748
7,3.1795,14.7003,3.8341,0.14,0.3413,0.3583
8,3.129,15.9328,3.9916,0.0879,0.3206,0.3078
9,3.0071,13.6306,3.692,0.0361,0.3339,0.3397


In [80]:
# Display the tuned estimator to inspect the selected hyper-parameters.
tuned_rfj

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=6, max_features='log2', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.005,
                      min_impurity_split=None, min_samples_leaf=2,
                      min_samples_split=10, min_weight_fraction_leaf=0.0,
                      n_estimators=190, n_jobs=-1, oob_score=False,
                      random_state=8301, verbose=0, warm_start=False)

In [81]:
# Score the tuned model with pycaret; the recorded output shows the metrics and a frame
# with the (normalised) features, the true 'Total' and the prediction in 'Label'.
# NOTE(review): with no `data` argument this presumably uses pycaret's hold-out split — confirm.
predict_model(tuned_rfj)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Random Forest Regressor,2.9253,13.7597,3.7094,0.1305,0.3294,0.3313


Unnamed: 0,DayMo,Month,Year,festivo,Quincena,Dayw_Friday,Dayw_Monday,Dayw_Saturday,Dayw_Sunday,Dayw_Thursday,Dayw_Tuesday,Dayw_Wednesday,Total,Label
0,-0.662421,0.298207,0.282680,-0.219496,-0.251337,-0.390455,-0.403699,-0.422298,-0.416754,-0.401819,-0.388546,2.307865,15.0,11.417809
1,1.627810,0.005255,-0.679728,-0.219496,3.978717,-0.390455,-0.403699,-0.422298,-0.416754,2.488684,-0.388546,-0.433301,10.0,11.399860
2,-1.005956,-0.873602,0.282680,-0.219496,-0.251337,-0.390455,-0.403699,-0.422298,-0.416754,2.488684,-0.388546,-0.433301,10.0,11.388690
3,0.940740,1.470017,-0.679728,-0.219496,-0.251337,-0.390455,-0.403699,-0.422298,-0.416754,2.488684,-0.388546,-0.433301,18.0,11.381095
4,1.055252,-1.459507,-0.679728,-0.219496,-0.251337,-0.390455,-0.403699,-0.422298,-0.416754,-0.401819,-0.388546,2.307865,20.0,11.625775
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,-0.662421,1.470017,1.245088,-0.219496,-0.251337,-0.390455,-0.403699,-0.422298,2.399495,-0.401819,-0.388546,-0.433301,3.0,10.224995
379,-0.891444,1.470017,-0.679728,4.555886,-0.251337,-0.390455,-0.403699,-0.422298,-0.416754,-0.401819,2.573700,-0.433301,14.0,9.442640
380,-0.891444,1.177064,0.282680,-0.219496,-0.251337,-0.390455,-0.403699,-0.422298,-0.416754,-0.401819,2.573700,-0.433301,10.0,11.277705
381,-0.662421,-1.459507,1.245088,-0.219496,-0.251337,2.561117,-0.403699,-0.422298,-0.416754,-0.401819,-0.388546,-0.433301,15.0,10.526226


In [82]:
# Separate the target from the features for the train and test splits.
labeltr, labeltest = DfTrain['Total'], DfTest['Total']
DfTrain, DfTest = DfTrain.drop(columns='Total'), DfTest.drop(columns='Total')

In [83]:
# Refit the tuned random forest directly on the full training features and labels.
# NOTE(review): the features here are the raw dummy-encoded columns, while the pycaret
# experiment was set up with normalize=True — confirm the mismatch is intended.
# NOTE(review): scikit-learn's fit() returns the estimator itself, so `forest` is
# presumably the same object as `tuned_rfj` (refitted in place) — confirm.
forest = tuned_rfj.fit(DfTrain,labeltr)

## RMSE de entrenamiento

In [84]:
# In-sample predictions on the training features.
ypredTrain = forest.predict(DfTrain)

In [85]:
# Training RMSE: square root of the mean squared error.
# mean_squared_error is documented as (y_true, y_pred), so the labels go first;
# MSE is symmetric, so the reported value does not change.
mse(labeltr, ypredTrain) ** 0.5

3.4613361333844774

## RMSE de test

In [86]:
# Predictions on the test features.
ypredTest = forest.predict(DfTest)

In [87]:
# Test RMSE: square root of the mean squared error.
# mean_squared_error is documented as (y_true, y_pred), so the labels go first;
# MSE is symmetric, so the reported value does not change.
mse(labeltest, ypredTest) ** 0.5

3.4445540593756245

## RMSE de evaluación

In [88]:
# Keep the evaluation target aside before it is dropped from the feature frame.
labelevalua = Dfevalua['Total']

In [89]:
# Leave only the feature columns in the evaluation frame.
Dfevalua = Dfevalua.drop(columns=['Total'])

In [90]:
# Predictions on the evaluation features.
yevalua = forest.predict(Dfevalua)

In [91]:
# Evaluation RMSE: square root of the mean squared error.
# mean_squared_error is documented as (y_true, y_pred), so the labels go first;
# MSE is symmetric, so the reported value does not change.
mse(labelevalua, yevalua) ** 0.5

5.265937402314841

## Se guarda el modelo

In [92]:
# Persist the fitted forest to disk with joblib; dump() returns the list of written file names.
joblib.dump(forest, 'modelo_Atropello_entrenado.pkl') # Save the model.

['modelo_Atropello_entrenado.pkl']