<a href="https://colab.research.google.com/github/Rogerio-mack/work/blob/main/GLM_versus_ML_GAS_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predições GLM $\times$ ML  

1. Para as quantidades de pedidos
2. Para o elapse time (tempo da entrega)



## Conclusão

Para as quantidades de pedidos, o modelo de ML (MAPE $39$%) mostra-se bastante superior ao modelo GLM (MAPE $154$%). Para o elapse time, ambos os modelos apresentam um resultado limitado (MAPE de cerca de $60$%).

**Modelagem da QUANTIDADE DE PEDIDOS**
```
GLM             ML
------------------------------
RMSE: 14.18     RMSE: 7.47
MSE: 201.18     MSE: 55.79
MAE: 10.54      MAE: 5.65
MAPE: 154.12%   MAPE: 39.36%
```
**Modelagem do ELAPSE TIME**
```
GLM             ML
------------------------------
MAPE: 60.63%    MAPE: 60.20%
```

Para a modelagem do ELAPSE TIME, a sugestão é empregar janelas de tempo nas predições, eliminando a tendência de longo prazo de queda dos tempos de entrega observada no último ano.

# Imports

In [152]:
import pandas as pd

In [153]:
from sklearn.linear_model import LinearRegression  # Regressão Linear Simples
from sklearn.linear_model import Ridge  # Regressão Ridge
from sklearn.linear_model import Lasso  # Regressão Lasso
from sklearn.linear_model import ElasticNet  # Regressão ElasticNet
from sklearn.linear_model import BayesianRidge  # Regressão Bayesiana Ridge
from sklearn.linear_model import SGDRegressor  # Regressão por Gradiente Descendente Estocástico
from sklearn.linear_model import HuberRegressor  # Regressão de Huber
from sklearn.linear_model import PassiveAggressiveRegressor  # Regressão Passiva-Agressiva
from sklearn.linear_model import TheilSenRegressor  # Regressão Theil-Sen
from sklearn.linear_model import ARDRegression  # Regressão de Divergência Automática Relevante
from sklearn.linear_model import OrthogonalMatchingPursuit  # Regressão Orthogonal Matching Pursuit

from sklearn.svm import SVR  # Máquinas de Vetores de Suporte para Regressão
from sklearn.neighbors import KNeighborsRegressor  # Regressão dos k-vizinhos mais próximos
from sklearn.tree import DecisionTreeRegressor  # Regressão com Árvores de Decisão
from sklearn.ensemble import RandomForestRegressor  # Regressão com Florestas Aleatórias
from sklearn.ensemble import GradientBoostingRegressor  # Regressão com Gradient Boosting
from sklearn.neural_network import MLPRegressor  # Regressão com Redes Neurais MLP

In [154]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

def stats_error(y_test, y_pred):
  """Print RMSE, MSE, MAE and MAPE for a set of regression predictions.

  Parameters
  ----------
  y_test : array-like
      Observed target values.
  y_pred : array-like
      Predicted values, in the same order as ``y_test``.

  Returns
  -------
  None
      The four metrics are printed, one per line, and nothing is returned.

  Raises
  ------
  ValueError
      If the inputs are empty or do not have the same number of elements.

  Notes
  -----
  Both inputs are flattened to NumPy float arrays and compared by position,
  so a pandas index mismatch between them cannot silently introduce NaNs.
  MAPE is averaged only over observations whose observed value is non-zero
  (the percentage error is undefined at zero); it is ``nan`` when every
  observed value is zero.
  """
  actual = np.asarray(y_test, dtype=float).ravel()
  predicted = np.asarray(y_pred, dtype=float).ravel()

  if actual.size == 0:
    raise ValueError('stats_error requires at least one observation')
  if actual.shape != predicted.shape:
    raise ValueError(
        f'y_test and y_pred must have the same length '
        f'({actual.size} != {predicted.size})')

  residuals = actual - predicted

  # Mean Squared Error and its root; MSE is computed once and reused for RMSE.
  mse = np.mean(residuals ** 2)
  rmse = np.sqrt(mse)

  # Mean Absolute Error.
  mae = np.mean(np.abs(residuals))

  # Mean Absolute Percentage Error, skipping zero targets to avoid inf/nan.
  nonzero = actual != 0
  if nonzero.any():
    mape = np.mean(np.abs(residuals[nonzero] / actual[nonzero])) * 100
  else:
    mape = np.nan

  # Report the metrics.
  print(f'RMSE: {rmse:.2f}')
  print(f'MSE: {mse:.2f}')
  print(f'MAE: {mae:.2f}')
  print(f'MAPE: {mape:.2f}%')

  return


# Modelando e estimando a QUANTIDADE DE PEDIDOS


In [155]:
# Load the 'baseModelagem' sheet of the gas dataset workbook straight from the GitHub repository.
df = pd.read_excel('https://github.com/Rogerio-mack/work/raw/main/dataset_gas.xlsx',sheet_name='baseModelagem')
# Preview the first rows.
df.head()

Unnamed: 0,ano,mes,periodo,semestre,dia_semana,quantidade
0,2022,9,manha,2sem,friday,2
1,2022,9,manha,2sem,monday,2
2,2022,9,manha,2sem,saturday,5
3,2022,9,manha,2sem,sunday,3
4,2022,9,manha,2sem,thursday,10


# Data Preparation

In [156]:
# Remove 'ano' and 'semestre' so they do not enter the model as predictors.
# NOTE(review): `df` is overwritten, so running this cell twice fails because the columns are already gone.
df = df.drop(columns=['ano','semestre'])

In [157]:
from sklearn.preprocessing import OneHotEncoder

# Non-numeric predictors only; the target 'quantidade' is left out of the encoding.
categorical = df.drop(columns='quantidade').select_dtypes(exclude='number')

# Dense one-hot encoding that drops the first level of each categorical column.
hot_encode = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')
hot_encode.fit(categorical)

encoded_values = hot_encode.transform(categorical)
df_hot_encode = pd.DataFrame(encoded_values, columns=hot_encode.get_feature_names_out())
df_hot_encode.head()

# Put the dummy columns in front of the numeric columns (which include the target).
numeric = df.select_dtypes('number')
df = pd.concat([df_hot_encode, numeric], axis=1)
df.head()

Unnamed: 0,periodo_noite,periodo_tarde,dia_semana_monday,dia_semana_saturday,dia_semana_sunday,dia_semana_thursday,dia_semana_tuesday,dia_semana_wednesday,mes,quantidade
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9,2
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,9,2
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,9,5
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,9,3
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9,10


In [158]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Every column except the target is standardised.
features = df.drop(columns='quantidade')

scaler = StandardScaler()
scaler.fit(features)

# Rebuild a labelled frame from the scaled array, then append the untouched target.
df_scaled = pd.DataFrame(scaler.transform(features), columns=features.columns)
df_scaled = pd.concat([df_scaled, df[['quantidade']]], axis=1)

df = df_scaled
df.head()

Unnamed: 0,periodo_noite,periodo_tarde,dia_semana_monday,dia_semana_saturday,dia_semana_sunday,dia_semana_thursday,dia_semana_tuesday,dia_semana_wednesday,mes,quantidade
0,-0.644981,-0.734255,-0.414952,-0.424264,-0.32596,-0.424264,-0.414952,-0.424264,0.684486,2
1,-0.644981,-0.734255,2.409915,-0.424264,-0.32596,-0.424264,-0.414952,-0.424264,0.684486,2
2,-0.644981,-0.734255,-0.414952,2.357023,-0.32596,-0.424264,-0.414952,-0.424264,0.684486,5
3,-0.644981,-0.734255,-0.414952,-0.424264,3.06786,-0.424264,-0.414952,-0.424264,0.684486,3
4,-0.644981,-0.734255,-0.414952,-0.424264,-0.32596,2.357023,-0.414952,-0.424264,0.684486,10


In [159]:
# train_test_split is imported here because this is its first use in the notebook;
# without it a fresh "Restart & Run All" stops at this cell with a NameError.
from sklearn.model_selection import train_test_split

# Predictors are every column except the target 'quantidade'.
X = df.drop(columns='quantidade')
y = df.quantidade

# Hold out 30% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# GLM

In [160]:
import statsmodels.api as sm

# Design matrix for training: the predictors plus an intercept column.
X_train_const = sm.add_constant(X_train)

# Fit the Generalized Linear Model with statsmodels' default family.
glm_spec = sm.GLM(y_train, X_train_const)
glm_model = glm_spec.fit()

# Show the fitted model's summary table.
print(glm_model.summary())


                 Generalized Linear Model Regression Results                  
Dep. Variable:             quantidade   No. Observations:                  123
Model:                            GLM   Df Residuals:                      113
Model Family:                Gaussian   Df Model:                            9
Link Function:               Identity   Scale:                          164.39
Method:                          IRLS   Log-Likelihood:                -483.10
Date:                Fri, 22 Sep 2023   Deviance:                       18576.
Time:                        01:27:35   Pearson chi2:                 1.86e+04
No. Iterations:                     3   Pseudo R-squ. (CS):             0.6320
Covariance Type:            nonrobust                                         
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   27.6534 

## Results

In [161]:
# Build the test design matrix with an explicit intercept column.
# has_constant='add' forces the column to be added even if some test feature
# happens to be constant; the default ('skip') would then omit it and leave
# the matrix one column short of what the fitted model expects.
X_test_const = sm.add_constant(X_test, has_constant='add')

# Error metrics of the GLM on the hold-out test set.
stats_error(y_test, glm_model.predict(X_test_const))

RMSE: 14.18
MSE: 201.18
MAE: 10.54
MAPE: 154.12%


# ML

In [162]:
# Candidate regressors compared by cross-validation, all with default hyper-parameters.
# The estimators that use randomness are seeded (same seed as the train/test split)
# so the ranking and the reported metrics are reproducible across runs.
models = [ ElasticNet(),
           BayesianRidge(),
           SGDRegressor(random_state=1),
           ARDRegression(),
           SVR(),
           KNeighborsRegressor(),
           DecisionTreeRegressor(random_state=1),
           RandomForestRegressor(random_state=1),
           GradientBoostingRegressor(random_state=1),
#           MLPRegressor(max_iter=5000),
           ]

In [163]:
from sklearn.model_selection import cross_val_score, train_test_split

# Maps each candidate estimator to its array of 10-fold cross-validation scores.
# NOTE(review): the scores come from each estimator's default scorer, not an accuracy - confirm the name.
acc_scores = {}

for model in models:
  reg = model
  fold_scores = cross_val_score(reg, X_train, y_train, cv=10)
  acc_scores[model] = fold_scores


In [164]:
# Mean cross-validation score per model, best first.
pd.DataFrame(acc_scores).mean().sort_values(ascending=False)

GradientBoostingRegressor()    0.645464
RandomForestRegressor()        0.588869
ARDRegression()                0.415784
BayesianRidge()                0.392964
ElasticNet()                   0.383857
SGDRegressor()                 0.383149
KNeighborsRegressor()          0.301103
DecisionTreeRegressor()        0.275171
SVR()                          0.213681
dtype: float64

In [165]:
# Average the fold scores per estimator and keep the estimator with the highest mean.
mean_cv_scores = pd.DataFrame(acc_scores).mean()
ranked_scores = mean_cv_scores.sort_values(ascending=False)
ml_model = ranked_scores.index[0]

# Fit the winning estimator on the full training split.
ml_model.fit(X_train, y_train)

## Results

In [166]:
# Error metrics of the selected ML model on the hold-out test set.
stats_error(y_test, ml_model.predict(X_test))

RMSE: 7.47
MSE: 55.79
MAE: 5.65
MAPE: 39.36%


# Modelando e estimando o ELAPSE TIME (tempo da entrega)

In [125]:
# Load the 'dataset' sheet of the same workbook; note that this overwrites `df` from the previous section.
df = pd.read_excel('https://github.com/Rogerio-mack/work/raw/main/dataset_gas.xlsx',sheet_name='dataset')
# Preview the first rows.
df.head()

Unnamed: 0,day,month,year,hour,hora,periodo,data,semestre,mes,week_day,...,address_latitude,address_longitude,company_latitude,company_longitude,ongoing_orders,average_delivery_time,reseller,driver,waze_avg_time,waze_avg_distance
0,12,5,2023,19:11:00,19,noite,2023-05-12,1sem,maio,friday,...,-23.478041,-46.677703,-23.483915,-46.687316,3,40.831067,617bffde26c21e0004ba123f,617d251f1157df00045e48ce,2.683333,1.658
1,21,5,2023,10:29:00,10,manha,2023-05-21,1sem,maio,sunday,...,-23.483071,-46.726242,-23.483915,-46.687316,2,41.2771,617bffde26c21e0004ba123f,617d251f1157df00045e48ce,14.266667,5.654
2,21,5,2023,08:58:00,8,manha,2023-05-21,1sem,maio,sunday,...,-23.475527,-46.688663,-23.483915,-46.687316,1,41.28305,617bffde26c21e0004ba123f,617d251f1157df00045e48ce,2.65,1.328
3,20,5,2023,17:13:00,17,tarde,2023-05-20,1sem,maio,saturday,...,-23.482847,-46.670429,-23.483915,-46.687316,2,41.287183,617bffde26c21e0004ba123f,617d251f1157df00045e48ce,5.9,3.004
4,20,5,2023,16:58:00,16,tarde,2023-05-20,1sem,maio,saturday,...,-23.48884,-46.71356,-23.483915,-46.687316,2,41.296483,617bffde26c21e0004ba123f,617d251f1157df00045e48ce,10.35,3.64


# Data Preparation

In [123]:
# Show the first record vertically so every column name and value is visible.
df.iloc[0]

day                                            12
month                                           5
year                                         2023
hour                                     19:11:00
hora                                           19
periodo                                     noite
data                          2023-05-12 00:00:00
semestre                                     1sem
mes                                          maio
week_day                                   friday
quantity                                        1
discount                                     True
holiday                                     False
pre_holiday                                 False
post_holiday                                False
time_elapsed                              16.3716
distance                                 1.178088
address_latitude                       -23.478041
address_longitude                      -46.677703
company_latitude                       -23.483915


In [126]:
# Candidate predictors followed by the target 'time_elapsed'.
elapse_columns = [
    'day', 'month', 'year', 'hora', 'ongoing_orders', 'week_day', 'quantity',
    'reseller', 'distance', 'holiday', 'pre_holiday', 'post_holiday',
    'time_elapsed',
]
df = df[elapse_columns]
df.head()

Unnamed: 0,day,month,year,hora,ongoing_orders,week_day,quantity,reseller,distance,holiday,pre_holiday,post_holiday,time_elapsed
0,12,5,2023,19,3,friday,1,617bffde26c21e0004ba123f,1.178088,False,False,False,16.3716
1,21,5,2023,10,2,sunday,1,617bffde26c21e0004ba123f,3.970976,False,False,False,8.3323
2,21,5,2023,8,1,sunday,1,617bffde26c21e0004ba123f,0.94277,False,False,False,14.8499
3,20,5,2023,17,2,saturday,1,617bffde26c21e0004ba123f,1.726385,False,False,False,13.449717
4,20,5,2023,16,2,saturday,1,617bffde26c21e0004ba123f,2.731804,False,False,False,18.72685


In [128]:
from sklearn.preprocessing import OneHotEncoder

# Non-numeric predictors only; the target 'time_elapsed' is left out of the encoding.
categorical = df.drop(columns='time_elapsed').select_dtypes(exclude='number')

# Dense one-hot encoding that drops the first level of each categorical column.
hot_encode = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')
hot_encode.fit(categorical)

encoded_values = hot_encode.transform(categorical)
df_hot_encode = pd.DataFrame(encoded_values, columns=hot_encode.get_feature_names_out())
df_hot_encode.head()

# Put the dummy columns in front of the numeric columns (which include the target).
numeric = df.select_dtypes('number')
df = pd.concat([df_hot_encode, numeric], axis=1)
df.head()

Unnamed: 0,week_day_monday,week_day_saturday,week_day_sunday,week_day_thursday,week_day_tuesday,week_day_wednesday,reseller_61fbc08bd22f1a00040c0af0,reseller_624c7e4995b998000410fc06,reseller_6303c2fe17bbb50004b45dd5,reseller_63321fdace40cf0004297511,...,pre_holiday_True,post_holiday_True,day,month,year,hora,ongoing_orders,quantity,distance,time_elapsed
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,12,5,2023,19,3,1,1.178088,16.3716
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,21,5,2023,10,2,1,3.970976,8.3323
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,21,5,2023,8,1,1,0.94277,14.8499
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,20,5,2023,17,2,1,1.726385,13.449717
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,20,5,2023,16,2,1,2.731804,18.72685


In [130]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Every column except the target is standardised.
features = df.drop(columns='time_elapsed')

scaler = StandardScaler()
scaler.fit(features)

# Rebuild a labelled frame from the scaled array, then append the untouched target.
df_scaled = pd.DataFrame(scaler.transform(features), columns=features.columns)
df_scaled = pd.concat([df_scaled, df[['time_elapsed']]], axis=1)

df = df_scaled
df.head()

Unnamed: 0,week_day_monday,week_day_saturday,week_day_sunday,week_day_thursday,week_day_tuesday,week_day_wednesday,reseller_61fbc08bd22f1a00040c0af0,reseller_624c7e4995b998000410fc06,reseller_6303c2fe17bbb50004b45dd5,reseller_63321fdace40cf0004297511,...,pre_holiday_True,post_holiday_True,day,month,year,hora,ongoing_orders,quantity,distance,time_elapsed
0,-0.415828,-0.418837,-0.395208,-0.397811,-0.414204,-0.405223,-0.242877,-0.187958,-0.581557,-0.574761,...,-0.191329,-0.173914,-0.395893,-0.266964,0.830298,1.366285,1.539572,-0.192106,-0.947061,16.3716
1,-0.415828,-0.418837,2.530314,-0.397811,-0.414204,-0.405223,-0.242877,-0.187958,-0.581557,-0.574761,...,-0.191329,-0.173914,0.671611,-0.266964,0.830298,-0.981276,0.752948,-0.192106,0.067428,8.3323
2,-0.415828,-0.418837,2.530314,-0.397811,-0.414204,-0.405223,-0.242877,-0.187958,-0.581557,-0.574761,...,-0.191329,-0.173914,0.671611,-0.266964,0.830298,-1.502956,-0.033676,-0.192106,-1.032538,14.8499
3,-0.415828,2.387566,-0.395208,-0.397811,-0.414204,-0.405223,-0.242877,-0.187958,-0.581557,-0.574761,...,-0.191329,-0.173914,0.553,-0.266964,0.830298,0.844605,0.752948,-0.192106,-0.747898,13.449717
4,-0.415828,2.387566,-0.395208,-0.397811,-0.414204,-0.405223,-0.242877,-0.187958,-0.581557,-0.574761,...,-0.191329,-0.173914,0.553,-0.266964,0.830298,0.583765,0.752948,-0.192106,-0.382689,18.72685


In [134]:
# Predictors are every column except the target 'time_elapsed'.
X = df.drop(columns='time_elapsed')
y = df.time_elapsed

# Hold out 30% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# GLM

In [135]:
import statsmodels.api as sm

# Design matrix for training: all predictors plus an intercept column.
X_train_const = sm.add_constant(X_train)

# Fit the Generalized Linear Model with statsmodels' default family.
glm_spec = sm.GLM(y_train, X_train_const)
glm_model = glm_spec.fit()

# Show the fitted model's summary table.
print(glm_model.summary())


                 Generalized Linear Model Regression Results                  
Dep. Variable:           time_elapsed   No. Observations:                 9990
Model:                            GLM   Df Residuals:                     9967
Model Family:                Gaussian   Df Model:                           22
Link Function:               Identity   Scale:                          145.93
Method:                          IRLS   Log-Likelihood:                -39054.
Date:                Fri, 22 Sep 2023   Deviance:                   1.4545e+06
Time:                        01:11:22   Pearson chi2:                 1.45e+06
No. Iterations:                     3   Pseudo R-squ. (CS):            0.07559
Covariance Type:            nonrobust                                         
                                        coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 

In [137]:
# Coefficients of the fitted GLM whose p-value is below 0.05.
glm_model.pvalues[glm_model.pvalues < 0.05]

const                                0.000000e+00
week_day_sunday                      3.056103e-07
reseller_61fbc08bd22f1a00040c0af0    5.072576e-03
reseller_6303c2fe17bbb50004b45dd5    5.483851e-11
reseller_63402732bc3b9f0004842ed3    3.720688e-33
post_holiday_True                    2.802108e-04
hora                                 2.181098e-08
ongoing_orders                       3.508257e-78
distance                             1.554277e-10
dtype: float64

In [141]:
# Names of the terms with p-value below 0.05 in the first GLM fit.
significant_terms = list(glm_model.pvalues[glm_model.pvalues < 0.05].index)

# Keep only the data columns among them, in the frame's own column order
# (terms that are not columns of `df`, such as the intercept, fall out here).
sel_columns = []
for column in df.columns:
  if column in significant_terms:
    sel_columns.append(column)

# Reduced data set: selected predictors followed by the target.
df_glm = pd.concat([df[sel_columns], df[['time_elapsed']]], axis=1)
df_glm.head()

Unnamed: 0,week_day_sunday,reseller_61fbc08bd22f1a00040c0af0,reseller_6303c2fe17bbb50004b45dd5,reseller_63402732bc3b9f0004842ed3,post_holiday_True,hora,ongoing_orders,distance,time_elapsed
0,-0.395208,-0.242877,-0.581557,-0.260813,-0.173914,1.366285,1.539572,-0.947061,16.3716
1,2.530314,-0.242877,-0.581557,-0.260813,-0.173914,-0.981276,0.752948,0.067428,8.3323
2,2.530314,-0.242877,-0.581557,-0.260813,-0.173914,-1.502956,-0.033676,-1.032538,14.8499
3,-0.395208,-0.242877,-0.581557,-0.260813,-0.173914,0.844605,0.752948,-0.747898,13.449717
4,-0.395208,-0.242877,-0.581557,-0.260813,-0.173914,0.583765,0.752948,-0.382689,18.72685


In [144]:
# Same split as before, now restricted to the predictors kept in df_glm.
X = df_glm.drop(columns='time_elapsed')
y = df_glm.time_elapsed

# Hold out 30% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [145]:
import statsmodels.api as sm

# Design matrix for training: the selected predictors plus an intercept column.
X_train_const = sm.add_constant(X_train)

# Refit the Generalized Linear Model on the reduced feature set.
glm_spec = sm.GLM(y_train, X_train_const)
glm_model = glm_spec.fit()

# Show the fitted model's summary table.
print(glm_model.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:           time_elapsed   No. Observations:                 9990
Model:                            GLM   Df Residuals:                     9981
Model Family:                Gaussian   Df Model:                            8
Link Function:               Identity   Scale:                          146.15
Method:                          IRLS   Log-Likelihood:                -39069.
Date:                Fri, 22 Sep 2023   Deviance:                   1.4588e+06
Time:                        01:19:06   Pearson chi2:                 1.46e+06
No. Iterations:                     3   Pseudo R-squ. (CS):            0.07276
Covariance Type:            nonrobust                                         
                                        coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 

## Results

In [150]:
# Build the test design matrix with an explicit intercept column.
# has_constant='add' forces the column to be added even if some test feature
# happens to be constant; the default ('skip') would then omit it and leave
# the matrix one column short of what the fitted model expects.
X_test_const = sm.add_constant(X_test, has_constant='add')

# Error metrics of the GLM on the hold-out test set.
stats_error(y_test, glm_model.predict(X_test_const))

RMSE: 12.35
MSE: 152.47
MAE: 10.18
MAPE: 60.63%


# ML

In [146]:
# Candidate regressors compared by cross-validation, all with default hyper-parameters.
# The estimators that use randomness are seeded (same seed as the train/test split)
# so the ranking and the reported metrics are reproducible across runs.
models = [ ElasticNet(),
           BayesianRidge(),
           SGDRegressor(random_state=1),
           ARDRegression(),
           SVR(),
           KNeighborsRegressor(),
           DecisionTreeRegressor(random_state=1),
           RandomForestRegressor(random_state=1),
           GradientBoostingRegressor(random_state=1),
#           MLPRegressor(max_iter=5000),
           ]

In [147]:
from sklearn.model_selection import cross_val_score, train_test_split

# Maps each candidate estimator to its array of 10-fold cross-validation scores.
# NOTE(review): the scores come from each estimator's default scorer, not an accuracy - confirm the name.
acc_scores = {}

for model in models:
  reg = model
  fold_scores = cross_val_score(reg, X_train, y_train, cv=10)
  acc_scores[model] = fold_scores


In [148]:
# Mean cross-validation score per model, best first.
pd.DataFrame(acc_scores).mean().sort_values(ascending=False)

GradientBoostingRegressor()    0.073977
SGDRegressor()                 0.067536
BayesianRidge()                0.067195
ARDRegression()                0.067098
SVR()                          0.058287
ElasticNet()                   0.043938
KNeighborsRegressor()         -0.102507
RandomForestRegressor()       -0.133228
DecisionTreeRegressor()       -0.863619
dtype: float64

In [149]:
# Average the fold scores per estimator and keep the estimator with the highest mean.
mean_cv_scores = pd.DataFrame(acc_scores).mean()
ranked_scores = mean_cv_scores.sort_values(ascending=False)
ml_model = ranked_scores.index[0]

# Fit the winning estimator on the full training split.
ml_model.fit(X_train, y_train)

## Results

In [151]:
# Error metrics of the selected ML model on the hold-out test set.
stats_error(y_test, ml_model.predict(X_test))

RMSE: 12.27
MSE: 150.59
MAE: 10.09
MAPE: 60.20%
