In [1]:
import os
import warnings
from getpass import getpass

import pyodbc
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import MaxAbsScaler

warnings.filterwarnings("ignore")

# Kept after the star import, as in the original, so `datetime` can never be shadowed by it.
from pycaret.time_series import *
from datetime import datetime

##### Función para descargar los datos y EDA del cluster

In [109]:
def df_cluster(nits_clientes, fecha_final, cnxn=None):
    """Download monthly sales for a group of customers and aggregate them.

    One query per customer is run against the V_VTA_VTAHEC view; the partial
    results are combined and summarised in two ways.

    Parameters
    ----------
    nits_clientes : iterable
        Customer identifiers in the "<VTANIT>-<VTASUC>" form used by the query.
    fecha_final : str
        Upper bound for VTAFCH. The comparison is strict (`VTAFCH < fecha_final`),
        so sales dated exactly on `fecha_final` are NOT included.
        NOTE(review): an earlier version of the query used an inclusive BETWEEN
        starting at 2021-01-01 -- confirm the exclusive bound is intended.
    cnxn : DB-API connection, optional
        Connection to reuse. When omitted, a pyodbc connection is opened and
        closed by this function. Server and user can be overridden with the
        DWH_SERVER / DWH_UID environment variables; the password is read from
        DWH_PWD, or prompted for when that variable is not set.

    Returns
    -------
    (df_SQL, df_SQL_nits) : tuple of pandas.DataFrame
        df_SQL has columns ["Fecha", "Ventas"] (total sales per month).
        df_SQL_nits has columns ["Nitcliente-sucursal", "Ventas"] (total sales
        per customer). Both are empty, with those columns, when there is
        nothing to aggregate.
    """
    # Values are bound as parameters (qmark style) instead of being pasted into
    # the SQL text, so an odd identifier cannot alter the statement.
    consulta_SQL = (
        "SELECT DATEFROMPARTS(VTAANO, VTAMES, 1) AS 'Fecha', "
        "CONCAT(CONCAT(VTANIT, '-'), VTASUC) AS 'Nitcliente-sucursal', "
        "SUM(VTAVLRVTA) AS 'Ventas' "
        "FROM V_VTA_VTAHEC "
        "WHERE CONCAT(CONCAT(VTANIT, '-'), VTASUC) = ? AND VTAFCH < ? "
        "GROUP BY DATEFROMPARTS(VTAANO, VTAMES, 1), CONCAT(CONCAT(VTANIT, '-'), VTASUC)"
    )

    nits = list(nits_clientes)
    parciales = []

    if nits:
        conexion_propia = cnxn is None
        if conexion_propia:
            # The password is no longer stored in the notebook: it comes from the
            # environment or is typed in interactively.
            cnxn = pyodbc.connect(
                driver='{SQL Server}',
                server=os.environ.get("DWH_SERVER", "192.168.100.58"),
                uid=os.environ.get("DWH_UID", "bilectura"),
                pwd=os.environ.get("DWH_PWD") or getpass("Contraseña DWH: "))
        try:
            cursor = cnxn.cursor()
            try:
                for nit in nits:
                    cursor.execute(consulta_SQL, (str(nit), str(fecha_final)))
                    # Plain tuples keep DataFrame construction independent of the driver's row type.
                    filas = [tuple(fila) for fila in cursor.fetchall()]
                    if not filas:
                        continue  # customer without sales in the period
                    parcial = pd.DataFrame.from_records(
                        filas, columns=[col[0] for col in cursor.description])
                    parcial["Ventas"] = parcial["Ventas"].astype(int)
                    parcial["Fecha"] = pd.to_datetime(parcial["Fecha"])
                    parciales.append(parcial)
            finally:
                cursor.close()
        finally:
            # Only close what this function opened; a caller-supplied connection stays usable.
            if conexion_propia:
                cnxn.close()

    if parciales:
        # Single concat instead of growing a DataFrame inside the loop.
        ventas = pd.concat(parciales, ignore_index=True)
    else:
        ventas = pd.DataFrame({
            "Fecha": pd.Series(dtype="datetime64[ns]"),
            "Nitcliente-sucursal": pd.Series(dtype="object"),
            "Ventas": pd.Series(dtype="int64"),
        })

    # Sum only "Ventas": summing every column breaks on pandas >= 2
    # (datetime columns cannot be summed and string columns get concatenated).
    df_SQL = ventas.groupby("Fecha")["Ventas"].sum().reset_index()
    df_SQL_nits = ventas.groupby("Nitcliente-sucursal")["Ventas"].sum().reset_index()

    return df_SQL, df_SQL_nits

In [110]:
def lineplot(bd):
    """Show an interactive chart of sales over time with a straight trend line.

    `bd` must provide a "Fecha" column (x axis) and a "Ventas" column (y axis).
    The trend is a degree-1 least-squares fit of "Ventas" against the row
    position (0, 1, 2, ...), so rows are treated as equally spaced.
    Nothing is returned; the figure is displayed with `fig.show()`.
    """
    fechas = bd["Fecha"]
    ventas = bd["Ventas"]
    posiciones = np.arange(len(bd))

    # Fit the line on row positions and evaluate it at those same positions.
    tendencia = np.polyval(np.polyfit(posiciones, ventas, 1), posiciones)

    fig = go.Figure(
        data=[
            go.Scatter(x=fechas, y=ventas, mode='lines+markers', name='Ventas'),
            go.Scatter(x=fechas, y=tendencia, mode='lines', name='Línea de Tendencia'),
        ],
        layout=go.Layout(
            title='Ventas por mes',
            xaxis=dict(title='Fecha'),
            yaxis=dict(title='Ventas'),
            legend=dict(x=1, y=1),
        ),
    )
    fig.show()

def EDA_cluster(bd, bd_nits):
    """Print a short statistical summary of a cluster's sales and plot the series.

    Parameters
    ----------
    bd : pandas.DataFrame
        Sales per month, with a datetime "Fecha" column and a "Ventas" column.
    bd_nits : pandas.DataFrame
        Sales per customer, one row per customer, with a "Ventas" column.

    Raises
    ------
    ValueError
        If `bd` has no rows (there is nothing to summarise or plot).
    """
    if len(bd) == 0:
        raise ValueError("EDA_cluster: 'bd' no contiene filas")

    # Work on a chronologically ordered copy so that "first" and "last" really are
    # the earliest and latest months, whatever order the caller's rows are in.
    bd = bd.sort_values("Fecha").reset_index(drop=True)
    primer_fecha = bd["Fecha"].iloc[0]
    ultima_fecha = bd["Fecha"].iloc[-1]

    # Describe only the "Ventas" column: the thousands-separated float format
    # cannot be applied to the date statistics a whole-frame describe() may include.
    formato = "{:,.0f}".format
    describe_bd = bd["Ventas"].describe().map(formato)
    describe_bd_nits = bd_nits["Ventas"].describe().map(formato)

    print(f"Esta base de datos tiene ventas de {len(bd)} meses,")
    print(f"empezando desde el {primer_fecha.strftime('%d-%m-%Y')}")
    print(f"y terminando el {ultima_fecha.strftime('%d-%m-%Y')}")
    print("La composicion estadistica de la base de datos es la siguiente:")
    print(describe_bd.iloc[1:])  # skip the "count" row
    print(" ")

    print(f"Por otro lado, esta compuesto por ventas de {len(bd_nits)} clientes")
    print("Y asi se comporta estadisticamente asi:")
    print(describe_bd_nits.iloc[1:])  # skip the "count" row
    print(" ")


    lineplot(bd)

##### Carga de los clusters

In [111]:
# Load the clustering export: semicolon-separated CSV with comma as decimal mark.
df_clusters = pd.read_csv(
    "C:/Users/tcardenas/OneDrive/OneDrive - Grupo DISPAPELES/Documents/ML-Dispapeles-TomasCaLo/Clustering/Clustering 12-04-23.csv",
    sep=";",
    decimal=",",
    encoding='utf-8',
)

# Columns dropped right after loading; they are not used by the cells below.
col_eliminar = ["Escala R", "Escala M", "Escala F", "Distrito-Nombretipozona", "Cluster"]
df_clusters = df_clusters.drop(columns=col_eliminar)

# The best model is 10-Institucional-A
filtro_distrito = 10
filtro_tipozona = "Institucional"
filtro_cluster = "A"

# Keep only the customers of the selected district / zone type / cluster letter.
df_clusters_f = df_clusters.loc[
    (df_clusters["Codigo distrito"] == filtro_distrito)
    & (df_clusters["Nombre tipo zona"] == filtro_tipozona)
    & (df_clusters["Letra cluster"] == filtro_cluster)
]

##### EDA del cluster elegido

In [112]:
# Number of rows (customer-branch ids) in every district / zone type / cluster letter combination.
df_clusters_EDA = (
    df_clusters
    .groupby(["Codigo distrito", "Nombre tipo zona", "Letra cluster"])
    .agg({"Nit cliente-sucursal": "size"})
    .reset_index()
)

In [113]:
# Display the per-combination customer counts computed in the previous cell.
df_clusters_EDA

Unnamed: 0,Codigo distrito,Nombre tipo zona,Letra cluster,Nit cliente-sucursal
0,10,Artes graficas,A,145
1,10,Artes graficas,B,94
2,10,Artes graficas,C,120
3,10,Artes graficas,D,79
4,10,Artes graficas,E,85
...,...,...,...,...
330,90,Mayorista,A,14
331,90,Mayorista,B,23
332,90,Mayorista,C,33
333,90,Mayorista,D,40


In [114]:
# Customer-branch ids that belong to the selected cluster.
lista_nits = df_clusters_f["Nit cliente-sucursal"].tolist()

# Cut-off date for the sales query. Parsing and re-formatting the literal makes an
# invalid date fail here rather than inside the SQL call. df_cluster compares with
# `VTAFCH < fecha_final`, so sales dated on the cut-off day itself are excluded.
fecha_final = datetime.strptime('2023-03-31', '%Y-%m-%d').strftime('%Y-%m-%d')

In [115]:
# Query the data warehouse (one query per customer) and get back two frames:
# total sales per month for the cluster, and total sales per customer.
ventas_cluster, ventas_nits = df_cluster(nits_clientes= lista_nits, fecha_final= fecha_final)

In [265]:
# Print the summary statistics of the cluster (per month and per customer) and plot the monthly series.
EDA_cluster(ventas_cluster, ventas_nits)

Esta base de datos tiene ventas de 63 meses,
empezando desde el 01-01-2018
y terminando el 01-03-2023
La composicion estadistica de la base de datos es la siguiente:
mean    1,398,473,915
std       383,494,469
min       589,893,056
25%     1,155,690,522
50%     1,409,209,637
75%     1,638,192,058
max     2,221,545,582
Name: Ventas, dtype: object
 
Por otro lado, esta compuesto por ventas de 221 clientes
Y asi se comporta estadisticamente asi:
mean       398,659,985
std      1,867,746,054
min          1,273,370
25%         11,126,108
50%         26,961,701
75%         98,284,865
max     24,380,149,839
Name: Ventas, dtype: object
 


##### Pruebas

In [117]:
# Figure options: static PNG renderer at 1000x600.
# Switch renderer to "notebook" for interactive output.
# NOTE(review): not referenced by any of the visible cells -- presumably meant to be
# passed as `fig_kwargs=` to plot_model; confirm or remove.
fig_kwargs = dict(
    renderer="png",
    width=1000,
    height=600,
)

In [118]:
# Time-series experiment on the raw monthly sales ("Ventas").
# NOTE(review): the "Suavizado" column is only added to ventas_cluster in a later
# cell, so on a fresh top-to-bottom run it does not exist yet at this point.
setup_ventas = setup(
    ventas_cluster,  # data frame with the monthly series
    target="Ventas",
    ignore_features=["Suavizado"],  # pycaret documents this argument as a list of column names
    index="Fecha",
    session_id=42,  # fixed id for reproducibility
    transform_target=None,  # target transformation: "box-cox", "log", "sqrt", "exp", "cos"
    coverage=0.9,  # prediction interval coverage
    fh=6,  # forecast horizon, in periods
    use_gpu=True,
    verbose=True,
    hyperparameter_split='all',  # 'all' or 'train'
    # seasonal_period= 'Q'
)

In [266]:
# Autocorrelation (ACF) plot for the series configured in setup_ventas.
setup_ventas.plot_model(plot= "acf")

In [267]:
# Partial autocorrelation (PACF) plot, limited to 12 lags through data_kwargs.
setup_ventas.plot_model(plot= "pacf", data_kwargs= {"nlags": 12})

In [121]:
# Differencing diagnostics, with ACF, PACF and periodogram panels enabled.
# Each entry of lags_list is one differencing scheme -- presumably [1] is a first
# difference and [1, 3] a first difference followed by a lag-3 difference; confirm
# against the pycaret plot_model documentation.
setup_ventas.plot_model(
    plot="diff",
    data_kwargs={"lags_list": [[1], [1, 3]], "acf": True, "pacf": True, "periodogram": True},
    fig_kwargs={"height": 800, "width": 1900}
)

In [122]:
# Stationarity tests for the series in setup_ventas (the output table reports ADF and KPSS results).
setup_ventas.check_stats(test='stationarity')

Unnamed: 0,Test,Test Name,Data,Property,Setting,Value
0,Stationarity,ADF,Transformed,Stationarity,{'alpha': 0.05},False
1,Stationarity,ADF,Transformed,p-value,{'alpha': 0.05},1
2,Stationarity,ADF,Transformed,Test Statistic,{'alpha': 0.05},-1
3,Stationarity,ADF,Transformed,Critical Value 1%,{'alpha': 0.05},-4
4,Stationarity,ADF,Transformed,Critical Value 5%,{'alpha': 0.05},-3
5,Stationarity,ADF,Transformed,Critical Value 10%,{'alpha': 0.05},-3
6,Stationarity,KPSS,Transformed,Trend Stationarity,{'alpha': 0.05},True
7,Stationarity,KPSS,Transformed,p-value,{'alpha': 0.05},0
8,Stationarity,KPSS,Transformed,Test Statistic,{'alpha': 0.05},0
9,Stationarity,KPSS,Transformed,Critical Value 10%,{'alpha': 0.05},0


In [123]:
# Apply simple exponential smoothing to the monthly sales.
# This adds a "Suavizado" column to ventas_cluster in place; the smoothed-series
# experiment further down depends on this cell having run.
# NOTE(review): ewm is called with alpha only, so pandas' defaults apply to every
# other option (including how the weights are normalised) -- confirm that matches
# the smoothing formula intended here.
alpha = 0.25
ventas_cluster["Suavizado"] = ventas_cluster["Ventas"].ewm(alpha=alpha).mean() 

In [268]:
# Plot the original series and its smoothed version on the same axes.
x = ventas_cluster["Fecha"]
y1 = ventas_cluster["Ventas"]
y2 = ventas_cluster["Suavizado"]

trace1 = go.Scatter(x=x, y=y1, mode="lines+markers", name="Ventas")
trace2 = go.Scatter(x=x, y=y2, mode="lines+markers", name="Suavizado")

layout = go.Layout(
    title='Ventas por mes',
    xaxis=dict(title='Fecha'),
    yaxis=dict(title='Ventas'),
    legend=dict(x=1, y=1),
)

# Build the figure in one step from both traces and the shared layout.
fig = go.Figure(data=[trace1, trace2], layout=layout)
fig.show()

##### Configuración y predicción

In [260]:
# Time-series experiment on the exponentially smoothed series ("Suavizado");
# the raw "Ventas" column is excluded from the features.
setup_suavizado = setup(
    ventas_cluster,  # data frame with the monthly series
    target="Suavizado",
    ignore_features=["Ventas"],  # pycaret documents this argument as a list of column names
    index="Fecha",
    session_id=42,  # fixed id for reproducibility
    transform_target=None,  # target transformation: "box-cox", "log", "sqrt", "exp", "cos"
    coverage=0.9,  # prediction interval coverage
    fh=6,  # forecast horizon, in periods
    use_gpu=True,
    verbose=True,
    hyperparameter_split='train',  # 'all' or 'train'
    # seasonal_period= 'Q',
)

In [261]:
# Optional: uncomment to drop metrics from the comparison before running it.
# remove_metric('RMSE')
# remove_metric('RMSSE')
# remove_metric('SMAPE')
# remove_metric('MAE')
# Compare the candidate models, rank them by R2 and keep the five best.
# NOTE(review): this uses the functional API, so it presumably runs against the most
# recent setup() call (the smoothed-series experiment) -- confirm.
top_models = compare_models(
                        n_select= 5,
                        sort= "r2"
                        )
# Keep the comparison table for the filtering cell below
# (pull() presumably returns the last score grid displayed -- confirm).
metricas_completas = pull()

In [262]:
# Display floats in plain decimal notation: 3 decimals with thousands separators.
pd.set_option("display.float_format", '{:,.3f}'.format)
# Show only the models whose R2 is positive.
metricas_completas[metricas_completas["R2"] > 0]
# metricas_completas[:3]

Unnamed: 0,Model,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,R2,TT (Sec)
auto_arima,Auto ARIMA,0.748,0.674,30501686.431,41389085.204,0.018,0.018,0.328,0.763
huber_cds_dt,Huber w/ Cond. Deseasonalize & Detrending,0.779,0.664,32261371.366,41407870.539,0.02,0.02,0.158,0.173
theta,Theta Forecaster,0.914,0.779,37316838.689,47990990.467,0.022,0.022,0.075,0.043


### Prediction

In [269]:
# Forecast plot for the selected models; data_kwargs requests a 9-period horizon
# (longer than the fh = 6 used in the setup calls).
plot_model(top_models, plot = 'forecast', data_kwargs = {'fh': 9})

In [264]:
# Show the forecasts as whole numbers with thousands separators.
pd.options.display.float_format = '{:,.0f}'.format
# Alternative kept for reference: finalize and predict inside the smoothed-series experiment.
# final_model = setup_suavizado.finalize_model(top_models[0])
# print(setup_suavizado.predict_model(final_model))
# setup_suavizado.plot_model(final_model)

# Finalize the first entry of top_models and forecast with the raw-sales experiment.
# NOTE(review): top_models was produced after setup_suavizado (target "Suavizado"),
# yet it is finalized and used for prediction through setup_ventas (target "Ventas")
# -- confirm that selecting on the smoothed series and fitting on raw sales is intended.
final_model = setup_ventas.finalize_model(top_models[0])
print(setup_ventas.predict_model(final_model))
setup_ventas.plot_model(final_model)

               y_pred
2023-04 2,011,459,694
2023-05 1,891,669,883
2023-06 2,015,338,348
2023-07 2,087,894,584
2023-08 2,054,776,782
2023-09 2,076,240,791
