In [36]:
import pandas as pd
import numpy as np
from pycaret.regression import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse
import joblib 

In [37]:
# Load the three pre-split datasets (train / test / evaluation) with identical reader settings.
DfTrain, DfTest, Dfevalua = [
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in (
        'Volcamiento_train.csv',
        'Volcamiento_test.csv',
        'Volcamiento_evalua.csv',
    )
]

In [38]:
# Preview the first rows of the test split as loaded (dates are still raw strings here).
DfTest.head()

Unnamed: 0,FECHA_ACCIDENTE,Total
0,2018-01-01,1
1,2018-01-02,6
2,2018-01-03,4
3,2018-01-04,2
4,2018-01-05,1


In [39]:
# Load the holiday ("festivos") calendar; the file is semicolon-delimited.
df_festivos = pd.read_csv('festivosm.csv', encoding = 'utf-8', sep =';')

In [40]:
# Parse the holiday dates, which are written month/day/year, into datetimes so they
# can be compared against the accident dates.
df_festivos['Fecha'] = pd.to_datetime(df_festivos['Fecha'], format="%m/%d/%Y")

In [41]:
# Parse the accident dates into datetimes.
# The raw values are dash-separated ISO dates (e.g. "2018-01-01", as shown by DfTest.head()),
# so the format must use dashes. The previous "%Y/%m/%d" only worked because older pandas
# ignored the separator for ISO-like formats; strict format matching rejects it.
# NOTE(review): assumes the train and evaluation CSVs use the same date layout as the test CSV.
DATE_FORMAT = "%Y-%m-%d"
DfTrain['FECHA_ACCIDENTE'] = pd.to_datetime(DfTrain['FECHA_ACCIDENTE'], format=DATE_FORMAT)
DfTest['FECHA_ACCIDENTE'] = pd.to_datetime(DfTest['FECHA_ACCIDENTE'], format=DATE_FORMAT)
Dfevalua['FECHA_ACCIDENTE'] = pd.to_datetime(Dfevalua['FECHA_ACCIDENTE'], format=DATE_FORMAT)

In [42]:
# Flag each row whose accident date appears in the holiday calendar (1 = holiday, 0 = not).
# Series.isin performs one vectorised membership test per frame, instead of rebuilding
# df_festivos['Fecha'].unique() and scanning it once for every single row.
DfTrain['festivo'] = DfTrain['FECHA_ACCIDENTE'].isin(df_festivos['Fecha']).astype('int64')
DfTest['festivo'] = DfTest['FECHA_ACCIDENTE'].isin(df_festivos['Fecha']).astype('int64')
Dfevalua['festivo'] = Dfevalua['FECHA_ACCIDENTE'].isin(df_festivos['Fecha']).astype('int64')

In [43]:
# Calendar features derived from the accident date (training split).
DfTrain['Year'] = DfTrain['FECHA_ACCIDENTE'].dt.year
DfTrain['Month'] = DfTrain['FECHA_ACCIDENTE'].dt.month
DfTrain['DayMo'] = DfTrain['FECHA_ACCIDENTE'].dt.day
# day_name() always yields English weekday names, whereas strftime('%A') follows the
# process locale. quincena() matches against English names, so a non-English locale
# would silently zero out that feature.
DfTrain['Dayw'] = DfTrain['FECHA_ACCIDENTE'].dt.day_name()

In [44]:
# Calendar features derived from the accident date (test split).
DfTest['Year'] = DfTest['FECHA_ACCIDENTE'].dt.year
DfTest['Month'] = DfTest['FECHA_ACCIDENTE'].dt.month
DfTest['DayMo'] = DfTest['FECHA_ACCIDENTE'].dt.day
# day_name() always yields English weekday names, whereas strftime('%A') follows the
# process locale. quincena() matches against English names, so a non-English locale
# would silently zero out that feature.
DfTest['Dayw'] = DfTest['FECHA_ACCIDENTE'].dt.day_name()

In [45]:
# Calendar features derived from the accident date (evaluation split).
Dfevalua['Year'] = Dfevalua['FECHA_ACCIDENTE'].dt.year
Dfevalua['Month'] = Dfevalua['FECHA_ACCIDENTE'].dt.month
Dfevalua['DayMo'] = Dfevalua['FECHA_ACCIDENTE'].dt.day
# day_name() always yields English weekday names, whereas strftime('%A') follows the
# process locale. quincena() matches against English names, so a non-English locale
# would silently zero out that feature.
Dfevalua['Dayw'] = Dfevalua['FECHA_ACCIDENTE'].dt.day_name()

In [46]:
# Sanity-check the test split after adding the holiday flag and calendar columns.
DfTest.head()

Unnamed: 0,FECHA_ACCIDENTE,Total,festivo,Year,Month,DayMo,Dayw
0,2018-01-01,1,1,2018,1,1,Monday
1,2018-01-02,6,0,2018,1,2,Tuesday
2,2018-01-03,4,0,2018,1,3,Wednesday
3,2018-01-04,2,0,2018,1,4,Thursday
4,2018-01-05,1,0,2018,1,5,Friday


In [47]:
def quincena(f):
    """Build the 0/1 'Quincena' flag for every row of ``f``.

    A row is flagged 1 when its day of month ('DayMo') is 15, 30 or 31 AND its
    weekday name ('Dayw') is Monday through Friday (English names); every other
    row gets 0.

    Parameters
    ----------
    f : mapping of column name -> iterable (e.g. a DataFrame)
        Must provide 'DayMo' and 'Dayw' of equal length.

    Returns
    -------
    list of int
        One flag per row, in row order.
    """
    flagged_days = (15, 30, 31)
    working_days = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday')
    return [
        1 if (day_of_month in flagged_days and weekday in working_days) else 0
        for day_of_month, weekday in zip(f['DayMo'], f['Dayw'])
    ]

In [48]:
# Attach the Quincena flag to each split, computed from that split's own rows.
for _frame in (DfTrain, DfTest, Dfevalua):
    _frame['Quincena'] = quincena(_frame)

In [49]:
# Columns kept for modelling: the engineered features plus 'Total'.
# The raw FECHA_ACCIDENTE column is intentionally left out of this list.
variab = ['DayMo','Month','Year','festivo','Dayw','Total','Quincena']

In [50]:
# Restrict every split to the modelling columns listed in `variab`, in that order.
DfTrain, DfTest, Dfevalua = (
    split[variab] for split in (DfTrain, DfTest, Dfevalua)
)

In [51]:
# Preview the evaluation split after column selection.
Dfevalua.head()

Unnamed: 0,DayMo,Month,Year,festivo,Dayw,Total,Quincena
0,1,1,2020,1,Wednesday,3,0
1,2,1,2020,0,Thursday,7,0
2,3,1,2020,0,Friday,4,0
3,4,1,2020,0,Saturday,6,0
4,6,1,2020,1,Monday,4,0


## One hot encoding y Modelo

In [52]:
# One-hot encode the categorical weekday column.
# Each split is encoded independently, so a split that lacks some category would end up
# with fewer columns (or a different order) than the training frame. Reindexing on the
# training columns guarantees the schema the model is fitted on: missing dummies are
# filled with 0, and columns unseen during training are dropped.
DfTrain = pd.get_dummies(DfTrain)
DfTest = pd.get_dummies(DfTest).reindex(columns=DfTrain.columns, fill_value=0)
Dfevalua = pd.get_dummies(Dfevalua).reindex(columns=DfTrain.columns, fill_value=0)

In [53]:
# Every encoded column except the target is declared to pycaret as a numeric feature.
num = DfTrain.columns.tolist()
del num[num.index('Total')]  # raises ValueError if the target column is missing

In [54]:
# Initialise the pycaret regression experiment on the training frame:
#   - 'Total' is the prediction target,
#   - every other column (listed in `num`) is forced to be treated as numeric,
#   - normalize=True asks pycaret to scale the features,
#   - session_id fixes the experiment seed (it shows up as random_state in the tuned model).
exp_reg101 = setup(data = DfTrain, target = 'Total',
                   numeric_features = num , normalize = True, session_id = 8301)

Unnamed: 0,Description,Value
0,session_id,8301
1,Target,Total
2,Original Data,"(1220, 13)"
3,Missing Values,False
4,Numeric Features,12
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(853, 12)"


In [55]:
# Evaluate pycaret's candidate regressors and display the comparison table.
# The returned estimator is only displayed here, not stored in a variable.
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
br,Bayesian Ridge,1.7961,5.7829,2.3922,0.0231,0.0,0.6623,0.02
lr,Linear Regression,1.7994,5.778,2.3917,0.023,0.0,0.6587,1.384
ridge,Ridge Regression,1.7992,5.8054,2.3972,0.0185,0.0,0.6606,0.016
lar,Least Angle Regression,1.7992,5.8057,2.3973,0.0185,0.0,0.6606,0.025
huber,Huber Regressor,1.7579,5.9006,2.4144,0.0057,0.0,0.5966,0.025
lightgbm,Light Gradient Boosting Machine,0.0,0.0,0.0,0.0,0.0,0.0,0.02
omp,Orthogonal Matching Pursuit,1.8419,5.9545,2.4282,-0.0071,0.0,0.6835,0.024
en,Elastic Net,1.8437,6.0053,2.4384,-0.0152,0.0,0.6888,0.021
llar,Lasso Least Angle Regression,1.8437,6.0053,2.4384,-0.0152,0.0,0.6888,0.017
lasso,Lasso Regression,1.8437,6.0053,2.4384,-0.0152,0.0,0.6888,0.021


BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, alpha_init=None,
              compute_score=False, copy_X=True, fit_intercept=True,
              lambda_1=1e-06, lambda_2=1e-06, lambda_init=None, n_iter=300,
              normalize=False, tol=0.001, verbose=False)

In [56]:
# Train a random forest regressor through pycaret; per-fold metrics are displayed below.
rfj = create_model('rf')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,1.678,4.2587,2.0637,-0.0987,-0.0,0.734
1,2.076,6.4241,2.5346,-0.0053,-0.0,0.7104
2,1.8673,5.052,2.2477,-0.0272,-0.0,0.7548
3,1.9981,6.485,2.5466,-0.0434,-0.0,0.7225
4,1.894,6.8176,2.6111,-0.1553,-0.0,0.6965
5,1.8001,6.2332,2.4966,-0.0138,-0.0,0.5916
6,2.2587,8.7216,2.9532,-0.2612,-0.0,0.8231
7,2.1061,6.9438,2.6351,-0.1434,-0.0,0.6365
8,1.9549,5.7424,2.3963,-0.2745,-0.0,0.7179
9,2.0578,8.8763,2.9793,-0.0747,-0.0,0.8668


In [57]:
# Let pycaret search for better hyperparameters for the random forest;
# the per-fold metrics of the tuned model are displayed below.
tuned_rfj = tune_model(rfj)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,1.5731,3.6532,1.9113,0.0575,-0.0,0.6962
1,1.9973,5.9715,2.4437,0.0655,-0.0,0.6957
2,1.6497,4.5566,2.1346,0.0736,-0.0,0.6763
3,1.8348,5.8149,2.4114,0.0644,-0.0,0.6572
4,1.7195,5.8198,2.4124,0.0138,-0.0,0.6589
5,1.7418,5.8648,2.4217,0.0461,-0.0,0.5559
6,1.9736,7.1482,2.6736,-0.0337,-0.0,0.7338
7,1.9911,6.5334,2.5561,-0.0758,-0.0,0.6152
8,1.6872,4.3454,2.0846,0.0356,-0.0,0.5797
9,1.8174,7.9274,2.8156,0.0402,-0.0,0.7742


In [58]:
# Display the tuned estimator to inspect the hyperparameters that were selected.
tuned_rfj

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=6, max_features='log2', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.005,
                      min_impurity_split=None, min_samples_leaf=2,
                      min_samples_split=10, min_weight_fraction_leaf=0.0,
                      n_estimators=190, n_jobs=-1, oob_score=False,
                      random_state=8301, verbose=0, warm_start=False)

In [59]:
# Score the tuned model. With no `data` argument pycaret presumably uses the hold-out
# split created by setup() — confirm against the pycaret docs. Predictions appear in 'Label'.
predict_model(tuned_rfj)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Random Forest Regressor,1.9288,6.4612,2.5419,0.0675,0.4809,0.7321


Unnamed: 0,DayMo,Month,Year,festivo,Quincena,Dayw_Friday,Dayw_Monday,Dayw_Saturday,Dayw_Sunday,Dayw_Thursday,Dayw_Tuesday,Dayw_Wednesday,Total,Label
0,1.605123,0.905161,1.214362,-0.230405,4.050699,-0.420181,2.347648,-0.400680,-0.398707,-0.398707,-0.408527,-0.404612,1,4.992981
1,-0.874236,0.028104,0.239902,-0.230405,-0.246871,2.379929,-0.425958,-0.400680,-0.398707,-0.398707,-0.408527,-0.404612,1,4.492951
2,-1.212331,-1.141305,-0.734557,-0.230405,-0.246871,-0.420181,-0.425958,-0.400680,-0.398707,2.508106,-0.408527,-0.404612,2,4.200570
3,0.703538,-0.556601,-0.734557,-0.230405,-0.246871,2.379929,-0.425958,-0.400680,-0.398707,-0.398707,-0.408527,-0.404612,5,4.114639
4,-1.437727,-1.433658,-0.734557,-0.230405,-0.246871,-0.420181,-0.425958,-0.400680,-0.398707,-0.398707,2.447816,-0.404612,1,3.579218
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,-0.423444,0.905161,-0.734557,4.340186,-0.246871,-0.420181,2.347648,-0.400680,-0.398707,-0.398707,-0.408527,-0.404612,1,4.132985
363,-1.099633,0.028104,0.239902,-0.230405,-0.246871,-0.420181,-0.425958,-0.400680,-0.398707,-0.398707,-0.408527,2.471504,4,4.507517
364,0.703538,1.489866,-1.709017,-0.230405,-0.246871,-0.420181,2.347648,-0.400680,-0.398707,-0.398707,-0.408527,-0.404612,3,3.297429
365,-0.085349,-0.556601,0.239902,-0.230405,-0.246871,-0.420181,-0.425958,-0.400680,2.508106,-0.398707,-0.408527,-0.404612,3,3.238361


In [60]:
# Separate the target from the features for the train and test splits.
# NOTE: this cell overwrites DfTrain/DfTest, so re-running it raises KeyError on 'Total'.
labeltr, labeltest = DfTrain['Total'], DfTest['Total']
DfTrain, DfTest = DfTrain.drop(columns='Total'), DfTest.drop(columns='Total')

In [61]:
# Refit the tuned estimator on the full training frame. fit() is called on tuned_rfj itself.
# NOTE(review): the features passed here are the raw (unscaled) columns, whereas the
# pycaret experiment was set up with normalize=True — confirm this difference is intended.
forest = tuned_rfj.fit(DfTrain,labeltr)

## RMSE de entrenamiento

In [62]:
# In-sample predictions: the model predicts the same rows it was fitted on.
ypredTrain = forest.predict(DfTrain)

In [63]:
# Training RMSE = sqrt(MSE). Arguments follow sklearn's (y_true, y_pred) order; the value
# is the same either way for MSE, but the correct order keeps the call safe to copy for
# metrics that are not symmetric.
mse(labeltr, ypredTrain) ** 0.5

2.34690415274137

## RMSE de test

In [64]:
# Predictions for the test split (its target column has already been dropped).
ypredTest = forest.predict(DfTest)

In [65]:
# Test RMSE = sqrt(MSE). Arguments follow sklearn's (y_true, y_pred) order; the value
# is the same either way for MSE, but the correct order keeps the call safe to copy for
# metrics that are not symmetric.
mse(labeltest, ypredTest) ** 0.5

2.5265546845240756

## RMSE de evaluación

In [66]:
# Keep the true target values of the evaluation split before the column is dropped.
labelevalua = Dfevalua['Total']

In [67]:
# Leave only the feature columns in the evaluation frame.
# NOTE: overwrites Dfevalua, so re-running this cell raises KeyError on 'Total'.
Dfevalua = Dfevalua.drop(columns=['Total'])

In [68]:
# Predictions for the evaluation split.
# The feature columns must match, in name and order, those the model was fitted on.
yevalua = forest.predict(Dfevalua)

In [69]:
# Evaluation RMSE = sqrt(MSE). Arguments follow sklearn's (y_true, y_pred) order; the value
# is the same either way for MSE, but the correct order keeps the call safe to copy for
# metrics that are not symmetric.
mse(labelevalua, yevalua) ** 0.5

3.235968430492066

## Se guarda modelo

In [70]:
# Persist the refitted random forest so it can be reloaded later with joblib.load.
joblib.dump(forest, 'modelo_Volcamiento_entrenado.pkl') # Save the fitted model to disk.

['modelo_Volcamiento_entrenado.pkl']