In [1]:
import pyodbc
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import MaxAbsScaler

import warnings
warnings.filterwarnings("ignore")

from pycaret.time_series import *
from datetime import datetime

##### Funcion para descargar los datos y EDA del cluster

In [2]:
def df_cluster(nits_clientes, fecha_final):
    # Conexion al dwh
    cnxn = pyodbc.connect(
        driver='{SQL Server}',
        server='192.168.100.58',
        uid='bilectura',
        pwd='D1sp@p3l3s')
    cursor = cnxn.cursor()

    df_SQL_nits = pd.DataFrame()

    for nit in nits_clientes:
        #Consulta SQL
        # consulta_SQL = f"SELECT DATEFROMPARTS(VTAANO, VTAMES, 1) AS 'Fecha', CONCAT(CONCAT(VTANIT, '-'), VTASUC) AS 'Nitcliente-sucursal', SUM(VTAVLRVTA) AS 'Ventas' FROM V_VTA_VTAHEC WHERE CONCAT(CONCAT(VTANIT, '-'), VTASUC) = '{nit}' AND VTAFCH BETWEEN '2021-01-01' AND '{fecha_final}' GROUP BY DATEFROMPARTS(VTAANO, VTAMES, 1), CONCAT(CONCAT(VTANIT, '-'), VTASUC)"
        consulta_SQL = f"SELECT DATEFROMPARTS(VTAANO, VTAMES, 1) AS 'Fecha', CONCAT(CONCAT(VTANIT, '-'), VTASUC) AS 'Nitcliente-sucursal', SUM(VTAVLRVTA) AS 'Ventas' FROM V_VTA_VTAHEC WHERE CONCAT(CONCAT(VTANIT, '-'), VTASUC) = '{nit}' AND VTAFCH < '{fecha_final}' GROUP BY DATEFROMPARTS(VTAANO, VTAMES, 1), CONCAT(CONCAT(VTANIT, '-'), VTASUC)"

        #Carga de la data desde el dwh de Dispapeles y se guarda en df
        cursor.execute(consulta_SQL)
        rows = cursor.fetchall()
        df_SQL_int = pd.DataFrame.from_records(rows, columns=[col[0] for col in cursor.description])
        df_SQL_int["Ventas"] = df_SQL_int["Ventas"].astype(int)
        df_SQL_int["Fecha"] = pd.to_datetime(df_SQL_int["Fecha"])

        df_SQL_nits = pd.concat([df_SQL_nits, df_SQL_int], ignore_index= True)

    df_SQL = df_SQL_nits.groupby("Fecha").sum().reset_index()
    df_SQL_nits = df_SQL_nits.groupby("Nitcliente-sucursal").sum().reset_index()

    return df_SQL, df_SQL_nits

In [3]:
def lineplot(bd):
    x = bd["Fecha"]
    x_n = np.arange(0, len(bd))
    y = bd["Ventas"]   
    coeficientes = np.polyfit(x_n, y, 1)
    poli = np.poly1d(coeficientes)

    trace1 = go.Scatter(x=x, y=y, mode='lines+markers', name='Ventas')
    trace2 = go.Scatter(x=x, y=poli(x_n), mode='lines', name='Línea de Tendencia')

    layout = go.Layout(
            title='Ventas por mes',
            xaxis=dict(title='Fecha'),
            yaxis=dict(title='Ventas'),
            legend=dict(x=1, y=1)
    )

    fig = go.Figure()
    fig.add_trace(trace1)
    fig.add_trace(trace2)
    fig.update_layout(layout)
    fig.show()

def EDA_cluster(bd, bd_nits):
    #Variables
    bd = bd.reset_index()
    bd_nits = bd_nits
    primer_fecha = datetime.utcfromtimestamp(bd.iloc[0, 1].timestamp())
    ultima_fecha = datetime.utcfromtimestamp(bd.iloc[-1, 1].timestamp())
    describe_bd = bd.describe().applymap("{:,.0f}".format)
    describe_bd_nits = bd_nits.describe().applymap("{:,.0f}".format)
    
    print(f"Esta base de datos tiene ventas de {len(bd)} meses,")
    print(f"empezando desde el {primer_fecha.strftime('%d-%m-%Y')}")
    print(f"y terminando el {ultima_fecha.strftime('%d-%m-%Y')}")
    print("La composicion estadistica de la base de datos es la siguiente:")
    print(describe_bd["Ventas"][1:])
    print(" ")

    print(f"Por otro lado, esta compuesto por ventas de {len(bd_nits)} clientes")
    print("Y asi se comporta estadisticamente asi:")
    print(describe_bd_nits["Ventas"][1:])
    print(" ")


    lineplot(bd)

##### Carga de los clusters

In [4]:
df_clusters = pd.read_csv("C:/Users/tcardenas/OneDrive/OneDrive - Grupo DISPAPELES/Documents/ML-Dispapeles-TomasCaLo/Clustering/Clustering 12-04-23.csv",
                            encoding= 'utf-8', decimal= ",", sep= ";")
col_eliminar = ["Escala R", "Escala M", "Escala F", "Distrito-Nombretipozona", "Cluster"]
df_clusters = df_clusters.drop(col_eliminar, axis= 1)

#El mejor modelo es 10-Institucional-A
filtro_distrito = 10
filtro_tipozona = "Institucional"
filtro_cluster = "A"

df_clusters_f = df_clusters[
                            (df_clusters["Codigo distrito"] == filtro_distrito) &
                            (df_clusters["Nombre tipo zona"] == filtro_tipozona) &
                            (df_clusters["Letra cluster"] == filtro_cluster)
                            ]

##### EDA del cluster elegido

In [5]:
df_clusters_EDA = df_clusters.groupby(["Codigo distrito", "Nombre tipo zona", "Letra cluster"]).agg({"Nit cliente-sucursal": np.size}).reset_index()

In [6]:
df_clusters_EDA

Unnamed: 0,Codigo distrito,Nombre tipo zona,Letra cluster,Nit cliente-sucursal
0,10,Artes graficas,A,145
1,10,Artes graficas,B,94
2,10,Artes graficas,C,120
3,10,Artes graficas,D,79
4,10,Artes graficas,E,85
...,...,...,...,...
330,90,Mayorista,A,14
331,90,Mayorista,B,23
332,90,Mayorista,C,33
333,90,Mayorista,D,40


In [7]:
lista_nits = df_clusters_f["Nit cliente-sucursal"].tolist()
fecha_final = '2023-03-31'
fecha_final = datetime.strptime(fecha_final, '%Y-%m-%d').strftime('%Y-%m-%d')

In [8]:
ventas_cluster, ventas_nits = df_cluster(nits_clientes= lista_nits, fecha_final= fecha_final)

In [9]:
EDA_cluster(ventas_cluster, ventas_nits)

Esta base de datos tiene ventas de 63 meses,
empezando desde el 01-01-2018
y terminando el 01-03-2023
La composicion estadistica de la base de datos es la siguiente:
mean    1,398,473,915
std       383,494,469
min       589,893,056
25%     1,155,690,522
50%     1,409,209,637
75%     1,638,192,058
max     2,221,545,582
Name: Ventas, dtype: object
 
Por otro lado, esta compuesto por ventas de 221 clientes
Y asi se comporta estadisticamente asi:
mean       398,659,985
std      1,867,746,054
min          1,273,370
25%         11,126,108
50%         26,961,701
75%         98,284,865
max     24,380,149,839
Name: Ventas, dtype: object
 


##### Pruebas

In [10]:
fig_kwargs = {
    # "renderer": "notebook",
    "renderer": "png",
    "width": 1000,
    "height": 600,}

In [11]:
s = setup(
            ventas_cluster, #df
            target= "Ventas",
            index= "Fecha",
            session_id = 42, #id para mantener replicabilidad
            transform_target= 'exp', #transformador del target, "box-cox", "log", "sqrt", "exp", "cos"
            coverage= 0.9, #intervalos
            fh = 6,
            use_gpu= True,
            verbose= True,
            hyperparameter_split= 'train'
            # seasonal_period= 'Q'
            )

In [12]:
s.plot_model(plot= "acf")

In [13]:
s.plot_model(plot= "pacf", data_kwargs= {"nlags": 24})

In [14]:
s.plot_model(
    plot="diff",
    data_kwargs={"lags_list": [[1], [1, 3]], "acf": True, "pacf": True, "periodogram": True},
    fig_kwargs={"height": 800, "width": 1500}
)

In [15]:
s.check_stats(test='stationarity')

Unnamed: 0,Test,Test Name,Data,Property,Setting,Value
0,Stationarity,ADF,Transformed,Stationarity,{'alpha': 0.05},False
1,Stationarity,ADF,Transformed,p-value,{'alpha': 0.05},0.387728
2,Stationarity,ADF,Transformed,Test Statistic,{'alpha': 0.05},-1.785413
3,Stationarity,ADF,Transformed,Critical Value 1%,{'alpha': 0.05},-3.548494
4,Stationarity,ADF,Transformed,Critical Value 5%,{'alpha': 0.05},-2.912837
5,Stationarity,ADF,Transformed,Critical Value 10%,{'alpha': 0.05},-2.594129
6,Stationarity,KPSS,Transformed,Trend Stationarity,{'alpha': 0.05},True
7,Stationarity,KPSS,Transformed,p-value,{'alpha': 0.05},0.1
8,Stationarity,KPSS,Transformed,Test Statistic,{'alpha': 0.05},0.084538
9,Stationarity,KPSS,Transformed,Critical Value 10%,{'alpha': 0.05},0.119


In [68]:
# Aplicar suavización exponencial simple
alpha = 0.4
ventas_cluster["Suavizado"] = ventas_cluster["Ventas"].ewm(alpha=alpha).mean() 

In [69]:
# Visualizar la serie original y suavizada
x = ventas_cluster["Fecha"]
y1 = ventas_cluster["Ventas"]
y2 = ventas_cluster["Suavizado"]
trace1 = go.Scatter(x= x, y= y1, mode= "lines+markers", name= "Ventas")
trace2 = go.Scatter(x= x, y= y2, mode= "lines+markers", name= "Suavizado")

layout = go.Layout(
        title='Ventas por mes',
        xaxis=dict(title='Fecha'),
        yaxis=dict(title='Ventas'),
        legend=dict(x=1, y=1)
        )
fig = go.Figure()
fig.add_trace(trace1)
fig.add_trace(trace2)
fig.update_layout(layout)
fig.show()

##### Configuracion y prediccion

In [70]:
s = setup(
            ventas_cluster, #df
            target= "Suavizado",
            ignore_features= "Ventas",
            index= "Fecha",
            session_id = 42, #id para mantener replicabilidad
            transform_target= None, #transformador del target, "box-cox", "log", "sqrt", "exp", "cos"
            coverage= 0.9, #intervalos
            fh = 6,
            use_gpu= True,
            verbose= True,
            hyperparameter_split= 'all'
            # seasonal_period= 'Q'
            )

In [71]:
# remove_metric('RMSE')
# remove_metric('RMSSE')
# remove_metric('SMAPE')
# remove_metric('MAE')
top_3 = compare_models(
                        n_select= 5,
                        sort= "r2"
                        )
metricas_completas = pull()

In [72]:
# Establecer el formato de visualización de números en notación decimal
pd.options.display.float_format = '{:,.3f}'.format
metricas_completas
# metricas_completas[:3]

Unnamed: 0,Model,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,R2,TT (Sec)
arima,ARIMA,0.467,0.407,28977925.548,36137388.745,0.017,0.017,0.682,0.11
auto_arima,Auto ARIMA,0.474,0.412,29393586.35,36614018.54,0.018,0.017,0.669,0.9
huber_cds_dt,Huber w/ Cond. Deseasonalize & Detrending,0.759,0.654,47132305.915,58015794.044,0.028,0.028,0.177,0.21
omp_cds_dt,Orthogonal Matching Pursuit w/ Cond. Deseasona...,0.867,0.721,54110480.433,64294645.569,0.032,0.032,-0.186,0.153
rf_cds_dt,Random Forest w/ Cond. Deseasonalize & Detrending,0.879,0.75,54990766.932,66928920.922,0.033,0.033,-0.355,0.35
lasso_cds_dt,Lasso w/ Cond. Deseasonalize & Detrending,0.924,0.747,57778685.476,66731499.136,0.035,0.035,-0.415,0.197
lr_cds_dt,Linear w/ Cond. Deseasonalize & Detrending,0.924,0.747,57778685.476,66731499.136,0.035,0.035,-0.415,0.213
en_cds_dt,Elastic Net w/ Cond. Deseasonalize & Detrending,0.924,0.747,57778685.476,66731499.136,0.035,0.035,-0.415,0.207
llar_cds_dt,Lasso Least Angular Regressor w/ Cond. Deseaso...,0.924,0.747,57778685.476,66731499.136,0.035,0.035,-0.415,0.19
lar_cds_dt,Least Angular Regressor w/ Cond. Deseasonalize...,0.924,0.747,57778685.476,66731499.136,0.035,0.035,-0.415,0.207


In [73]:
plot_model(top_3, plot = 'forecast')

In [74]:
# Get model weights to use
top_model_metrics = metricas_completas.iloc[0:4]['R2']
display(top_model_metrics)

top_model_weights = 1 - top_model_metrics/top_model_metrics.sum()
display(top_model_weights)

arima           0.682
auto_arima      0.669
huber_cds_dt    0.177
omp_cds_dt     -0.186
Name: R2, dtype: object

arima          0.492
auto_arima     0.502
huber_cds_dt   0.868
omp_cds_dt     1.139
Name: R2, dtype: object

In [75]:
plot_model(top_3, plot = 'forecast', data_kwargs = {'fh': 9})

ValueError: X array dims (n_rows) != n_periods. Received n_rows=6 and n_periods=9

### Prediction

In [76]:
# predicciones
predicciones_modelo1 = predict_model(top_3[0], fh=  9)
predicciones_modelo2 = predict_model(top_3[1], fh=  9)
predicciones_modelo3 = predict_model(top_3[2], fh=  9)
predicciones = pd.DataFrame({metricas_completas.index[0]: np.squeeze(predicciones_modelo1.values),
                            metricas_completas.index[1]: np.squeeze(predicciones_modelo2.values),
                            metricas_completas.index[2]: np.squeeze(predicciones_modelo3.values)
                            },
                            index= predicciones_modelo1.index)

# predicciones[metricas_completas.index[0]] = (predicciones[metricas_completas.index[0]] * (1 - 0.2)).diff() + ventas_cluster["Suavizado"].iloc[0]
# predicciones[metricas_completas.index[1]] = (predicciones[metricas_completas.index[1]] * (1 - 0.2)).diff() + ventas_cluster["Suavizado"].iloc[0]
# predicciones[metricas_completas.index[2]] = (predicciones[metricas_completas.index[2]] * (1 - 0.2)).diff() + ventas_cluster["Suavizado"].iloc[0]

ValueError: X array dims (n_rows) != n_periods. Received n_rows=6 and n_periods=9

In [None]:
pd.options.display.float_format = '{:,.0f}'.format
predicciones

Unnamed: 0,auto_arima,huber_cds_dt,theta
2022-10,1752543472,1778684857,1778375343
2022-11,1780575227,1796900702,1787207133
2022-12,1789857658,1815035284,1796038923
2023-01,1808942582,1833100560,1804870714
2023-02,1822902582,1851106728,1813702504
2023-03,1839541987,1869062485,1822534294
2023-04,1854780549,1886975250,1831366084
2023-05,1870751498,1904851349,1840197875
2023-06,1886339542,1922696176,1849029665
