In [38]:
import pandas as pd
import numpy as np
from pycaret.regression import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse
import joblib 

In [39]:
# Read the three Caida CSV files (train / test / evalua) from the working directory.
DfTrain, DfTest, Dfevalua = (
    pd.read_csv(csv_name, encoding='utf-8')
    for csv_name in ('Caida_train.csv', 'Caida_test.csv', 'Caida_evalua.csv')
)

In [40]:
# Preview the raw test rows; FECHA_ACCIDENTE has not been converted to datetime yet.
DfTest.head()

Unnamed: 0,FECHA_ACCIDENTE,Total
0,2018-01-01,7
1,2018-01-02,11
2,2018-01-03,4
3,2018-01-04,12
4,2018-01-05,11


In [41]:
# Semicolon-separated calendar file; its 'Fecha' column is later used to build the 'festivo' flag.
df_festivos = pd.read_csv('festivosm.csv', encoding = 'utf-8', sep =';')

In [42]:
# Convert 'Fecha' from month/day/year text to datetime values.
df_festivos['Fecha'] = pd.to_datetime(df_festivos['Fecha'], format="%m/%d/%Y")

In [43]:
# The raw dates are dash-separated (the DfTest preview shows '2018-01-01'), so the format
# must use '-' as well. The previous "%Y/%m/%d" only parsed because older pandas releases
# accepted any separator for ISO-like formats; strict format matching rejects it.
# NOTE(review): the train and evalua files are assumed to use the same layout as the
# test file shown in the preview - confirm.
DfTrain['FECHA_ACCIDENTE'] = pd.to_datetime(DfTrain['FECHA_ACCIDENTE'], format="%Y-%m-%d")
DfTest['FECHA_ACCIDENTE'] = pd.to_datetime(DfTest['FECHA_ACCIDENTE'], format="%Y-%m-%d")
Dfevalua['FECHA_ACCIDENTE'] = pd.to_datetime(Dfevalua['FECHA_ACCIDENTE'], format="%Y-%m-%d")

In [44]:
# 'festivo' is 1 when the accident date appears in df_festivos['Fecha'], otherwise 0.
# Series.isin performs one vectorised membership test per frame, instead of rebuilding
# df_festivos['Fecha'].unique() and scanning it once for every row.
# Missing dates are dropped first so that a missing accident date can never match.
holiday_dates = df_festivos['Fecha'].dropna()
DfTrain['festivo'] = DfTrain['FECHA_ACCIDENTE'].isin(holiday_dates).astype('int64')
DfTest['festivo'] = DfTest['FECHA_ACCIDENTE'].isin(holiday_dates).astype('int64')
Dfevalua['festivo'] = Dfevalua['FECHA_ACCIDENTE'].isin(holiday_dates).astype('int64')

In [45]:
# Calendar features derived from the accident date (training frame).
DfTrain['Year'] = DfTrain['FECHA_ACCIDENTE'].dt.year
DfTrain['Month'] = DfTrain['FECHA_ACCIDENTE'].dt.month
DfTrain['DayMo'] = DfTrain['FECHA_ACCIDENTE'].dt.day
# day_name() is vectorised and returns English names by default. quincena() compares
# against 'Monday'...'Friday', whereas strftime('%A') follows the process locale and
# would silently produce other spellings under a non-English locale.
DfTrain['Dayw'] = DfTrain['FECHA_ACCIDENTE'].dt.day_name()

In [46]:
# Calendar features derived from the accident date (test frame).
DfTest['Year'] = DfTest['FECHA_ACCIDENTE'].dt.year
DfTest['Month'] = DfTest['FECHA_ACCIDENTE'].dt.month
DfTest['DayMo'] = DfTest['FECHA_ACCIDENTE'].dt.day
# day_name() is vectorised and returns English names by default. quincena() compares
# against 'Monday'...'Friday', whereas strftime('%A') follows the process locale and
# would silently produce other spellings under a non-English locale.
DfTest['Dayw'] = DfTest['FECHA_ACCIDENTE'].dt.day_name()

In [47]:
# Calendar features derived from the accident date (evaluation frame).
Dfevalua['Year'] = Dfevalua['FECHA_ACCIDENTE'].dt.year
Dfevalua['Month'] = Dfevalua['FECHA_ACCIDENTE'].dt.month
Dfevalua['DayMo'] = Dfevalua['FECHA_ACCIDENTE'].dt.day
# day_name() is vectorised and returns English names by default. quincena() compares
# against 'Monday'...'Friday', whereas strftime('%A') follows the process locale and
# would silently produce other spellings under a non-English locale.
Dfevalua['Dayw'] = Dfevalua['FECHA_ACCIDENTE'].dt.day_name()

In [48]:
# Check the newly added festivo / Year / Month / DayMo / Dayw columns on the test frame.
DfTest.head()

Unnamed: 0,FECHA_ACCIDENTE,Total,festivo,Year,Month,DayMo,Dayw
0,2018-01-01,7,1,2018,1,1,Monday
1,2018-01-02,11,0,2018,1,2,Tuesday
2,2018-01-03,4,0,2018,1,3,Wednesday
3,2018-01-04,12,0,2018,1,4,Thursday
4,2018-01-05,11,0,2018,1,5,Friday


In [49]:
def quincena(f):
    """Return a 0/1 list flagging rows that fall on a weekday 15th, 30th or 31st.

    Args:
        f: Mapping-like object (for example a DataFrame) whose 'DayMo' entry
            iterates over day-of-month numbers and whose 'Dayw' entry iterates
            over day names, both in the same row order.

    Returns:
        list: one int per (DayMo, Dayw) pair; 1 when the day of month is 15, 30
        or 31 and the day name is Monday to Friday (English spelling), else 0.
    """
    target_days = (15, 30, 31)
    working_days = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday')
    return [
        1 if day_of_month in target_days and day_name in working_days else 0
        for day_of_month, day_name in zip(f['DayMo'], f['Dayw'])
    ]

In [50]:
# Add the 0/1 'Quincena' flag computed by quincena() to each of the three frames.
for _split in (DfTrain, DfTest, Dfevalua):
    _split['Quincena'] = quincena(_split)

In [51]:
# Columns kept for modelling: the engineered features plus 'Total', which is later
# passed to setup() as the target.
variab = ['DayMo','Month','Year','festivo','Dayw','Total','Quincena']

In [52]:
# Keep only the columns listed in `variab`, in that order, for every split.
DfTrain, DfTest, Dfevalua = (
    frame[variab] for frame in (DfTrain, DfTest, Dfevalua)
)

In [53]:
# Preview the evaluation frame after column selection.
Dfevalua.head()

Unnamed: 0,DayMo,Month,Year,festivo,Dayw,Total,Quincena
0,1,1,2020,1,Wednesday,8,0
1,2,1,2020,0,Thursday,14,0
2,3,1,2020,0,Friday,12,0
3,4,1,2020,0,Saturday,7,0
4,5,1,2020,0,Sunday,7,0


## One-hot encoding y modelo

In [54]:
# One-hot encode the non-numeric 'Dayw' column of the training frame.
DfTrain = pd.get_dummies(DfTrain)
# Encoding each split on its own only creates dummies for the categories present in that
# split. Force test and evaluation onto the training columns (same set, same order;
# dummies absent from a split become 0) so the fitted model always receives the feature
# layout it was trained on.
DfTest = pd.get_dummies(DfTest).reindex(columns=DfTrain.columns, fill_value=0)
Dfevalua = pd.get_dummies(Dfevalua).reindex(columns=DfTrain.columns, fill_value=0)

In [55]:
# Every column except the 'Total' target is handed to setup() as a numeric feature.
num = DfTrain.columns.tolist()
num.remove('Total')

In [56]:
# Initialise the PyCaret regression experiment on the training frame with 'Total' as target.
# All remaining columns are declared numeric, normalisation is enabled, and
# session_id fixes the seed (it reappears as random_state=8301 in the tuned model below).
exp_reg101 = setup(data = DfTrain, target = 'Total',
                   numeric_features = num , normalize = True, session_id = 8301)

Unnamed: 0,Description,Value
0,session_id,8301
1,Target,Total
2,Original Data,"(1275, 13)"
3,Missing Values,False
4,Numeric Features,12
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(892, 12)"


In [57]:
# Compare PyCaret's candidate regressors; the ranked metrics table and best model are shown below.
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
br,Bayesian Ridge,3.0764,15.3859,3.911,0.1022,0.3836,0.4173,0.015
ridge,Ridge Regression,3.0769,15.3953,3.9128,0.1008,0.3839,0.4162,0.016
lar,Least Angle Regression,3.0769,15.3958,3.9128,0.1008,0.3839,0.4162,0.02
lr,Linear Regression,3.0752,15.4234,3.9165,0.0991,0.3842,0.4161,1.459
huber,Huber Regressor,3.0749,15.516,3.9285,0.0934,0.3818,0.4063,0.015
gbr,Gradient Boosting Regressor,3.1156,15.9932,3.9909,0.0619,0.3876,0.4132,0.085
omp,Orthogonal Matching Pursuit,3.1904,16.3697,4.0291,0.0503,0.3955,0.4362,0.016
ada,AdaBoost Regressor,3.2066,16.4676,4.0423,0.0422,0.4031,0.4574,0.034
rf,Random Forest Regressor,3.1875,16.5832,4.0638,0.028,0.3961,0.4264,0.268
en,Elastic Net,3.2284,16.7819,4.0805,0.0259,0.4007,0.4435,0.014


BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, alpha_init=None,
              compute_score=False, copy_X=True, fit_intercept=True,
              lambda_1=1e-06, lambda_2=1e-06, lambda_init=None, n_iter=300,
              normalize=False, tol=0.001, verbose=False)

In [58]:
# Build a random forest regressor ('rf'); the table below lists one metrics row per fold.
rfj = create_model('rf')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2.9151,14.0151,3.7437,0.0638,0.3459,0.3216
1,3.446,18.5931,4.312,0.1475,0.3956,0.4228
2,3.1509,15.7617,3.9701,0.0496,0.3986,0.4375
3,2.8563,14.9834,3.8708,-0.0431,0.3368,0.3152
4,3.1208,15.0096,3.8742,-0.0194,0.4121,0.4745
5,3.0289,15.781,3.9725,-0.0116,0.3817,0.3818
6,3.4416,20.8648,4.5678,0.1301,0.4103,0.423
7,3.2628,17.415,4.1731,0.051,0.4623,0.5613
8,3.088,14.3282,3.7853,0.0101,0.3803,0.422
9,3.5643,19.0795,4.368,-0.0976,0.4369,0.5045


In [59]:
# Hyper-parameter tuning of the random forest; per-fold metrics of the tuned model follow.
tuned_rfj = tune_model(rfj)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2.8156,12.5277,3.5394,0.1631,0.3299,0.3266
1,3.408,18.6841,4.3225,0.1433,0.3935,0.4171
2,3.0298,14.977,3.87,0.0969,0.3867,0.4252
3,2.7447,13.5821,3.6854,0.0544,0.3203,0.3073
4,2.9845,13.7349,3.7061,0.0671,0.3997,0.4696
5,2.88,13.8779,3.7253,0.1104,0.3573,0.3614
6,3.4866,21.2597,4.6108,0.1137,0.4137,0.4339
7,3.2186,16.2354,4.0293,0.1153,0.4476,0.5523
8,2.857,13.0007,3.6056,0.1018,0.3709,0.402
9,3.3485,16.7313,4.0904,0.0375,0.4194,0.4861


In [60]:
# Display the hyper-parameters selected by tune_model().
tuned_rfj

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=6, max_features='log2', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.005,
                      min_impurity_split=None, min_samples_leaf=2,
                      min_samples_split=10, min_weight_fraction_leaf=0.0,
                      n_estimators=190, n_jobs=-1, oob_score=False,
                      random_state=8301, verbose=0, warm_start=False)

In [61]:
# Score the tuned model with PyCaret (presumably on its internal hold-out split - TODO confirm);
# the 'Label' column in the output holds the predictions.
predict_model(tuned_rfj)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Random Forest Regressor,3.3113,18.3278,4.2811,0.0707,0.3841,0.3937


Unnamed: 0,DayMo,Month,Year,festivo,Quincena,Dayw_Friday,Dayw_Monday,Dayw_Saturday,Dayw_Sunday,Dayw_Thursday,Dayw_Tuesday,Dayw_Wednesday,Total,Label
0,-0.558240,0.297406,0.281662,-0.225051,-0.253849,-0.399935,-0.413042,-0.407447,-0.420453,2.388884,-0.405575,-0.392359,18.0,11.825092
1,1.732239,0.004596,-0.680953,-0.225051,3.939355,2.500407,-0.413042,-0.407447,-0.420453,-0.418606,-0.405575,-0.392359,7.0,10.305963
2,-0.901812,-0.873836,0.281662,-0.225051,-0.253849,2.500407,-0.413042,-0.407447,-0.420453,-0.418606,-0.405575,-0.392359,12.0,9.926635
3,1.045095,1.468649,-0.680953,4.443443,-0.253849,2.500407,-0.413042,-0.407447,-0.420453,-0.418606,-0.405575,-0.392359,3.0,8.594306
4,1.045095,-1.459458,-0.680953,-0.225051,-0.253849,-0.399935,-0.413042,-0.407447,-0.420453,-0.418606,-0.405575,2.548686,7.0,10.541789
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,-0.672764,1.468649,1.244277,-0.225051,-0.253849,-0.399935,-0.413042,-0.407447,2.378385,-0.418606,-0.405575,-0.392359,5.0,7.448042
379,-0.787288,1.468649,-0.680953,-0.225051,-0.253849,-0.399935,-0.413042,-0.407447,-0.420453,-0.418606,-0.405575,2.548686,9.0,10.499437
380,-0.787288,1.175838,0.281662,-0.225051,-0.253849,-0.399935,-0.413042,-0.407447,-0.420453,-0.418606,-0.405575,2.548686,9.0,10.689385
381,-0.558240,-1.459458,1.244277,-0.225051,-0.253849,-0.399935,-0.413042,2.454307,-0.420453,-0.418606,-0.405575,-0.392359,9.0,8.736029


In [62]:
# Separate the 'Total' target from the features for the train and test frames.
# Not idempotent: re-running this cell fails because 'Total' has already been dropped.
labeltr, labeltest = DfTrain['Total'], DfTest['Total']
DfTrain, DfTest = (
    frame.drop(columns='Total') for frame in (DfTrain, DfTest)
)

In [63]:
# Refit the tuned random forest directly on the one-hot encoded training frame.
# NOTE(review): this frame is not normalised, unlike the data used inside the PyCaret
# experiment (normalize=True) - confirm this is intended.
forest = tuned_rfj.fit(DfTrain,labeltr)

## RMSE de entrenamiento

In [64]:
# In-sample predictions on the training features.
ypredTrain = forest.predict(DfTrain)

In [65]:
# Training RMSE. mean_squared_error is documented as (y_true, y_pred); the arguments were
# swapped. The value is identical because squared error is symmetric, but the documented
# order stays correct if sample weights or a non-symmetric metric are used later.
mse(labeltr, ypredTrain) ** 0.5

3.8117078678529572

## RMSE de test

In [66]:
# Predictions on the test features.
ypredTest = forest.predict(DfTest)

In [67]:
# Test RMSE. mean_squared_error is documented as (y_true, y_pred); the arguments were
# swapped. The value is identical because squared error is symmetric, but the documented
# order stays correct if sample weights or a non-symmetric metric are used later.
mse(labeltest, ypredTest) ** 0.5

4.119099809065017

## RMSE de evaluación

In [68]:
# Keep the evaluation target before it is dropped from the feature frame.
labelevalua = Dfevalua['Total']

In [69]:
# Remove the target so only feature columns are passed to predict().
Dfevalua = Dfevalua.drop(columns=['Total'])

In [70]:
# Predictions on the evaluation features.
yevalua = forest.predict(Dfevalua)

In [71]:
# Evaluation RMSE. mean_squared_error is documented as (y_true, y_pred); the arguments were
# swapped. The value is identical because squared error is symmetric, but the documented
# order stays correct if sample weights or a non-symmetric metric are used later.
mse(labelevalua, yevalua) ** 0.5

5.294891245143799

## Se guarda el modelo

In [72]:
joblib.dump(forest, 'modelo_Caida_entrenado.pkl') # Persist the fitted model to disk.

['modelo_Caida_entrenado.pkl']