In [1]:
import pandas as pd
import numpy as np

In [2]:
from pycaret.regression import *

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse

In [4]:
import joblib  

In [5]:
# Load the three CSV splits (train, test, evaluation) into DataFrames.
ChoqueTrain, ChoqueTest, Choqueevalua = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ('Choque_train.csv', 'Choque_test.csv', 'Choque_evalua.csv')
)

In [6]:
# Preview the first five rows of the evaluation split as loaded from disk.
Choqueevalua.head(n=5)

Unnamed: 0,FECHA_ACCIDENTE,Total
0,2020-01-01,39
1,2020-01-02,51
2,2020-01-03,71
3,2020-01-04,59
4,2020-01-05,46


In [7]:
# Holiday table; the file is semicolon-delimited.
df_festivos = pd.read_csv('festivosm.csv', sep=';', encoding='utf-8')

In [8]:
# Parse the 'Fecha' column from month/day/year text into datetime values so
# it can be compared against the parsed FECHA_ACCIDENTE dates further down.
df_festivos['Fecha'] = pd.to_datetime(df_festivos['Fecha'], format="%m/%d/%Y")

In [9]:
# Parse the accident date of every split into datetime values.
# The evaluation preview shows the raw text as dash-separated ISO dates
# (e.g. "2020-01-01"), so the format must use dashes. The previous
# "%Y/%m/%d" only worked on pandas versions that parse ISO-like formats
# leniently; strict format matching rejects it.
# NOTE(review): only the evaluation file's layout is visible in this notebook;
# confirm the train and test files use the same dash-separated layout.
ChoqueTrain['FECHA_ACCIDENTE'] = pd.to_datetime(ChoqueTrain['FECHA_ACCIDENTE'], format="%Y-%m-%d")
ChoqueTest['FECHA_ACCIDENTE'] = pd.to_datetime(ChoqueTest['FECHA_ACCIDENTE'], format="%Y-%m-%d")
Choqueevalua['FECHA_ACCIDENTE'] = pd.to_datetime(Choqueevalua['FECHA_ACCIDENTE'], format="%Y-%m-%d")

In [10]:
# Flag rows whose date appears in the holiday table (1 = listed, 0 = not).
# Series.isin performs one vectorised membership test per frame, instead of
# rebuilding df_festivos['Fecha'].unique() and scanning it for every row.
# The cast keeps the column as 0/1 integers, as the row-wise version produced.
ChoqueTrain['festivo'] = ChoqueTrain['FECHA_ACCIDENTE'].isin(df_festivos['Fecha']).astype('int64')
ChoqueTest['festivo'] = ChoqueTest['FECHA_ACCIDENTE'].isin(df_festivos['Fecha']).astype('int64')
Choqueevalua['festivo'] = Choqueevalua['FECHA_ACCIDENTE'].isin(df_festivos['Fecha']).astype('int64')

In [11]:
# Calendar features derived from the accident date (training split).
ChoqueTrain['Year'] = ChoqueTrain['FECHA_ACCIDENTE'].dt.year
ChoqueTrain['Month'] = ChoqueTrain['FECHA_ACCIDENTE'].dt.month
ChoqueTrain['DayMo'] = ChoqueTrain['FECHA_ACCIDENTE'].dt.day
# day_name() returns English names by default, whereas strftime('%A') follows
# the process locale; quincena() compares against English day names.
ChoqueTrain['Dayw'] = ChoqueTrain['FECHA_ACCIDENTE'].dt.day_name()

In [12]:
# Calendar features derived from the accident date (test split).
ChoqueTest['Year'] = ChoqueTest['FECHA_ACCIDENTE'].dt.year
ChoqueTest['Month'] = ChoqueTest['FECHA_ACCIDENTE'].dt.month
ChoqueTest['DayMo'] = ChoqueTest['FECHA_ACCIDENTE'].dt.day
# day_name() returns English names by default, whereas strftime('%A') follows
# the process locale; quincena() compares against English day names.
ChoqueTest['Dayw'] = ChoqueTest['FECHA_ACCIDENTE'].dt.day_name()

In [13]:
# Calendar features derived from the accident date (evaluation split).
Choqueevalua['Year'] = Choqueevalua['FECHA_ACCIDENTE'].dt.year
Choqueevalua['Month'] = Choqueevalua['FECHA_ACCIDENTE'].dt.month
Choqueevalua['DayMo'] = Choqueevalua['FECHA_ACCIDENTE'].dt.day
# day_name() returns English names by default, whereas strftime('%A') follows
# the process locale; quincena() compares against English day names.
Choqueevalua['Dayw'] = Choqueevalua['FECHA_ACCIDENTE'].dt.day_name()

In [14]:
# Preview the evaluation split after adding the holiday flag and the
# calendar columns (first five rows).
Choqueevalua.head(n=5)

Unnamed: 0,FECHA_ACCIDENTE,Total,festivo,Year,Month,DayMo,Dayw
0,2020-01-01,39,1,2020,1,1,Wednesday
1,2020-01-02,51,0,2020,1,2,Thursday
2,2020-01-03,71,0,2020,1,3,Friday
3,2020-01-04,59,0,2020,1,4,Saturday
4,2020-01-05,46,0,2020,1,5,Sunday


In [15]:
def quincena(f):
    """Flag rows dated the 15th, 30th or 31st that fall on Monday-Friday.

    Parameters
    ----------
    f : DataFrame-like
        Must provide a 'DayMo' column (day of month) and a 'Dayw' column
        (English weekday name).

    Returns
    -------
    list of int
        One entry per row, in row order: 1 when the day of month is 15, 30
        or 31 and the weekday is Monday through Friday, otherwise 0.
    """
    flagged_days = (15, 30, 31)
    working_days = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday')
    return [
        int(day_of_month in flagged_days and weekday in working_days)
        for day_of_month, weekday in zip(f['DayMo'], f['Dayw'])
    ]

In [16]:
# Attach the quincena() flag to every split as a 'Quincena' column.
ChoqueTrain = ChoqueTrain.assign(Quincena=quincena(ChoqueTrain))
ChoqueTest = ChoqueTest.assign(Quincena=quincena(ChoqueTest))
Choqueevalua = Choqueevalua.assign(Quincena=quincena(Choqueevalua))

In [17]:
# Columns kept for modelling, in this order. 'Total' is the target passed to
# setup() below; the remaining columns are the features.
variab = ['DayMo','Month','Year','festivo','Dayw','Total','Quincena']

In [18]:
# Keep only the modelling columns listed in `variab`, in that order.
ChoqueTrain = ChoqueTrain.loc[:, variab]
ChoqueTest = ChoqueTest.loc[:, variab]
Choqueevalua = Choqueevalua.loc[:, variab]

In [19]:
# Preview the evaluation split after restricting it to the modelling columns
# (first five rows).
Choqueevalua.head(n=5)

Unnamed: 0,DayMo,Month,Year,festivo,Dayw,Total,Quincena
0,1,1,2020,1,Wednesday,39,0
1,2,1,2020,0,Thursday,51,0
2,3,1,2020,0,Friday,71,0
3,4,1,2020,0,Saturday,59,0
4,5,1,2020,0,Sunday,46,0


# One hot encoding

In [20]:
# One-hot encode the non-numeric columns of each split.
# Encoding each frame independently only yields matching columns when every
# split happens to contain the same categories. The test and evaluation
# frames are therefore reindexed to the training columns: same set, same
# order, a category missing from a split is filled with 0, and a category
# never seen in training is dropped.
ChoqueTrain = pd.get_dummies(ChoqueTrain)
ChoqueTest = pd.get_dummies(ChoqueTest).reindex(columns=ChoqueTrain.columns, fill_value=0)
Choqueevalua = pd.get_dummies(Choqueevalua).reindex(columns=ChoqueTrain.columns, fill_value=0)

In [21]:
# Every training column except the target is declared numeric for PyCaret.
num = ChoqueTrain.columns.tolist()
del num[num.index('Total')]

In [22]:
# Initialise the PyCaret regression experiment on the training frame:
# 'Total' is the target, all other columns are treated as numeric features,
# features are normalised, and the session seed is fixed at 8301.
exp_reg101 = setup(
    data=ChoqueTrain,
    target='Total',
    numeric_features=num,
    normalize=True,
    session_id=8301,
)

Unnamed: 0,Description,Value
0,session_id,8301
1,Target,Total
2,Original Data,"(1277, 13)"
3,Missing Values,False
4,Numeric Features,12
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(893, 12)"


In [23]:
# Let PyCaret evaluate its candidate regressors; the output below shows the
# ranking table followed by the repr of the estimator the call returned.
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,9.5048,146.9733,12.0825,0.6781,0.1714,0.1358,0.067
catboost,CatBoost Regressor,9.5532,147.0877,12.1004,0.6755,0.1685,0.1354,0.558
lightgbm,Light Gradient Boosting Machine,9.6873,150.0586,12.2253,0.6679,0.1675,0.1364,0.688
rf,Random Forest Regressor,9.6694,152.6163,12.3083,0.6641,0.1705,0.1367,0.263
et,Extra Trees Regressor,9.7865,157.9228,12.5235,0.6536,0.1767,0.1391,0.245
ridge,Ridge Regression,10.1609,171.5932,13.0525,0.6285,0.2347,0.15,0.019
lar,Least Angle Regression,10.1602,171.5981,13.0526,0.6285,0.2346,0.15,0.025
br,Bayesian Ridge,10.1655,171.5902,13.0528,0.6285,0.2371,0.1502,0.02
huber,Huber Regressor,10.1196,171.8472,13.0544,0.6283,0.221,0.1496,0.017
lr,Linear Regression,10.1785,172.9415,13.1001,0.6256,0.2303,0.1504,1.552


GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=8301, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [24]:
# Build a random forest ('rf') through PyCaret; the table below lists the
# metrics for ten rows (0-9), presumably one per cross-validation fold.
rfj = create_model('rf')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,9.5159,147.0853,12.1279,0.6976,0.1613,0.1281
1,11.2729,221.5942,14.886,0.4644,0.1972,0.1507
2,8.9489,131.1445,11.4518,0.7408,0.1693,0.1367
3,9.7875,154.1344,12.4151,0.6892,0.1615,0.1325
4,10.8826,171.7288,13.1045,0.6494,0.1642,0.1408
5,8.9533,125.292,11.1934,0.7546,0.1559,0.1293
6,9.2549,160.5808,12.672,0.6584,0.195,0.1449
7,9.3947,154.496,12.4296,0.7077,0.2004,0.1571
8,9.1457,124.0891,11.1395,0.5814,0.1521,0.1207
9,9.5378,136.0174,11.6626,0.6977,0.148,0.1263


In [25]:
# Tune the random forest's hyper-parameters with PyCaret's tune_model; the
# table below reports the metrics of the tuned model.
tuned_rfj = tune_model(rfj)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,9.3782,133.8598,11.5698,0.7248,0.1561,0.1288
1,9.8975,157.0753,12.533,0.6204,0.1684,0.1358
2,9.7812,149.9653,12.246,0.7036,0.1773,0.1453
3,9.7209,145.6558,12.0688,0.7063,0.165,0.1381
4,11.2096,175.9882,13.2661,0.6408,0.1698,0.1494
5,10.273,170.0783,13.0414,0.6669,0.2021,0.1639
6,9.9312,184.1046,13.5685,0.6084,0.2139,0.1636
7,10.2576,193.038,13.8938,0.6348,0.2332,0.183
8,8.9505,125.9216,11.2215,0.5753,0.1518,0.118
9,9.958,143.3731,11.9739,0.6814,0.1608,0.1375


In [26]:
# Display the tuned estimator together with its selected hyper-parameters.
tuned_rfj

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                      max_depth=9, max_features='log2', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.4,
                      min_impurity_split=None, min_samples_leaf=2,
                      min_samples_split=7, min_weight_fraction_leaf=0.0,
                      n_estimators=80, n_jobs=-1, oob_score=False,
                      random_state=8301, verbose=0, warm_start=False)

In [27]:
# Score the tuned model with PyCaret's predict_model (no data passed, so
# presumably on PyCaret's own hold-out split - confirm). The output shows a
# metrics row and the normalised feature rows with a 'Label' prediction column.
predict_model(tuned_rfj)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Random Forest Regressor,9.9656,161.1876,12.696,0.6321,0.1819,0.1465


Unnamed: 0,DayMo,Month,Year,festivo,Quincena,Dayw_Friday,Dayw_Monday,Dayw_Saturday,Dayw_Sunday,Dayw_Thursday,Dayw_Tuesday,Dayw_Wednesday,Total,Label
0,-0.655115,0.285810,1.236388,-0.213699,-0.227653,-0.41648,-0.397789,-0.434835,-0.422019,2.550333,-0.412771,-0.38062,87.0,87.288566
1,1.188913,-1.779622,0.273795,-0.213699,-0.227653,-0.41648,-0.397789,-0.434835,-0.422019,-0.392106,2.422650,-0.38062,83.0,83.771580
2,-1.116122,1.170995,0.273795,-0.213699,-0.227653,-0.41648,-0.397789,-0.434835,2.369560,-0.392106,-0.412771,-0.38062,49.0,46.221061
3,0.036395,-0.304313,-0.688798,-0.213699,-0.227653,-0.41648,-0.397789,-0.434835,-0.422019,-0.392106,2.422650,-0.38062,98.0,89.472354
4,-0.194108,-1.779622,1.236388,-0.213699,-0.227653,-0.41648,-0.397789,2.299724,-0.422019,-0.392106,-0.412771,-0.38062,80.0,74.374451
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,-0.309360,0.580872,1.236388,-0.213699,-0.227653,-0.41648,-0.397789,-0.434835,-0.422019,-0.392106,-0.412771,2.62729,81.0,87.524023
380,0.036395,0.285810,0.273795,-0.213699,-0.227653,-0.41648,-0.397789,-0.434835,-0.422019,-0.392106,2.422650,-0.38062,97.0,89.490966
381,1.534668,-0.304313,0.273795,-0.213699,-0.227653,-0.41648,-0.397789,-0.434835,-0.422019,-0.392106,-0.412771,2.62729,75.0,86.036221
382,-0.885619,1.466057,-0.688798,4.679470,-0.227653,-0.41648,-0.397789,-0.434835,-0.422019,-0.392106,2.422650,-0.38062,66.0,59.994366


In [28]:
# Separate the target from the features for the train and test splits.
labeltr, labeltest = ChoqueTrain['Total'], ChoqueTest['Total']
ChoqueTrain = ChoqueTrain.drop(columns='Total')
ChoqueTest = ChoqueTest.drop(columns='Total')

In [29]:
# Refit the tuned random forest on the full training frame.
# NOTE(review): scikit-learn's fit() returns the estimator itself, so `forest`
# should be the same object as `tuned_rfj`, whose PyCaret-fitted state is
# overwritten here.
# NOTE(review): these features are the raw one-hot frame, not the normalised
# values PyCaret used inside setup(normalize=True); later predictions must
# therefore be fed raw features with the same columns.
forest = tuned_rfj.fit(ChoqueTrain,labeltr)

## RMSE para el conjunto de entrenamiento.

In [30]:
# In-sample predictions: the same training features the forest was fit on.
ypredTrain = forest.predict(ChoqueTrain)

In [31]:
# Training RMSE: square root of the mean squared error.
# Arguments follow scikit-learn's (y_true, y_pred) order; the value is the
# same either way for MSE, but the order matters for asymmetric metrics.
mse(labeltr, ypredTrain) ** 0.5

11.92663384614956

## RMSE para el conjunto de test.

In [32]:
# Predictions for the test split, using the forest refit on the training frame.
ypredTest = forest.predict(ChoqueTest)

In [33]:
# Test RMSE: square root of the mean squared error.
# Arguments follow scikit-learn's (y_true, y_pred) order; the value is the
# same either way for MSE, but the order matters for asymmetric metrics.
mse(labeltest, ypredTest) ** 0.5

13.289767138439892

## Modelo en evaluación

In [34]:
# Keep the evaluation target aside before it is dropped from the features.
labelevalua = Choqueevalua['Total']

In [35]:
# Remove the target so only feature columns remain in the evaluation frame.
Choqueevalua = Choqueevalua.drop(columns=['Total'])

In [36]:
# Predictions for the evaluation split (the preview above shows 2020 dates).
yevalua = forest.predict(Choqueevalua)

In [37]:
# Evaluation RMSE: square root of the mean squared error.
# Arguments follow scikit-learn's (y_true, y_pred) order; the value is the
# same either way for MSE, but the order matters for asymmetric metrics.
# NOTE(review): the recorded result (~40.9) is roughly three times the test
# RMSE (~13.3). The evaluation rows are dated 2020, so the gap may reflect a
# shift in the data rather than a code problem - worth investigating.
mse(labelevalua, yevalua) ** 0.5

40.872946356819604

## Se guarda el modelo

In [38]:
# Persist the fitted forest to disk with joblib. Only the estimator is saved:
# it was fit on the raw one-hot training frame, so whoever loads it must
# supply features with the same columns.
joblib.dump(forest, 'modelo_choque_entrenado.pkl') # Save the model.