In [104]:
import pandas as pd # type: ignore
from sklearn.ensemble import RandomForestRegressor # type: ignore
from sklearn.model_selection import train_test_split # type: ignore
from sklearn.metrics import root_mean_squared_error, r2_score # type: ignore

In [105]:
df_historic_order_demand = pd.read_excel('datos/raw_data/df_historic_order_demand.xlsx')

In [106]:
# No hay duplicados
df_historic_order_demand[df_historic_order_demand.duplicated()]

Unnamed: 0,cliente,mes_anio,order_demand


In [107]:
df_historic_order_demand.isna().sum()

cliente          0
mes_anio         0
order_demand    48
dtype: int64

In [108]:
df_historic_order_demand.dropna(inplace=True)

In [109]:
df_historic_order_demand.dtypes

cliente          object
mes_anio         object
order_demand    float64
dtype: object

In [110]:
#20 clientes
df_historic_order_demand["cliente"].unique()

array(['Cliente_1', 'Cliente_2', 'Cliente_3', 'Cliente_4', 'Cliente_5',
       'Cliente_6', 'Cliente_7', 'Cliente_8', 'Cliente_9', 'Cliente_10',
       'Cliente_11', 'Cliente_12', 'Cliente_13', 'Cliente_14',
       'Cliente_15', 'Cliente_16', 'Cliente_17', 'Cliente_18',
       'Cliente_19', 'Cliente_20'], dtype=object)

In [111]:
# Todos los valores son únicos
df_historic_order_demand["mes_anio"].unique

<bound method Series.unique of 0      12-2020
1      12-2020
2      12-2020
3      12-2020
4      12-2020
        ...   
975    12-2024
976    12-2024
977    12-2024
978    12-2024
979    12-2024
Name: mes_anio, Length: 932, dtype: object>

In [112]:
# Como la columna fecha son todo valores únicos, voy a descomponer la columna  en mes y año
df_historic_order_demand["anio"] = df_historic_order_demand["mes_anio"].str.split("-").str[1]
df_historic_order_demand["mes"] = df_historic_order_demand["mes_anio"].str.split("-").str[0]

In [113]:
# Hago una copia del dataframe
df_historico = df_historic_order_demand.copy()

In [114]:
# Como ya he obtenido lo que necesitaba, elimino la columna
df_historico.drop(columns=["mes_anio"], inplace = True)

In [115]:
# Convierto a numéricos
df_historico["anio"]=df_historico["anio"].astype(int)
df_historico["mes"]=df_historico["mes"].astype(int)

In [116]:
# Como la columna cliente es categórica, hago un get_dummies para que por cada cliente haga una columna numérica
df_historico = pd.get_dummies(df_historico, columns = ['cliente'], drop_first=True, dtype=int)

## Random Forest Regressor

In [117]:
X = df_historico.drop(columns=["order_demand"])  
y = df_historico["order_demand"]

In [118]:
# División de datos
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2517)

In [None]:
# Esto lo hago para encontrar el algoritmo con los mejores parámetros 
# El último que me salga será el mejor

n_arboles = [2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,50,100,500]
criterion = ["squared_error", "absolute_error", "friedman_mse", "poisson"]
division = [2,3,4,5] # Min samples split
num = 99999

for arbol in n_arboles:
    for criterio in criterion:
        for d in division:
            
            #n_estimators es el numero de arboles
            regr = RandomForestRegressor(n_estimators=arbol, criterion=criterio, max_depth=5, min_samples_split=d,random_state=2517)
            regr.fit(X_train, y_train)
            predicciones = regr.predict(X_test)
            rmse = root_mean_squared_error(y_test, predicciones)
            r2 = r2_score(y_test, predicciones)
            if rmse < num:
                print(f"""El random con:
                    criterio: {criterio}
                    n_arboles: {arbol},
                    min_samples_split: {d},
                    rmse: {rmse},
                    r2= {r2}
                              """)
                num = rmse
    
    

El random con:
                    criterio: squared_error
                    n_arboles: 2,
                    min_samples_split: 2,
                    rmse: 86.85211140205078,
                    r2= 0.6094354760634837
                              
El random con:
                    criterio: absolute_error
                    n_arboles: 2,
                    min_samples_split: 2,
                    rmse: 81.6952199165854,
                    r2= 0.654438521517525
                              


In [120]:
# Árbol con los mejores parametros 
regr = RandomForestRegressor(n_estimators=2, criterion="absolute_error", max_depth=5, min_samples_split=2,random_state=2517)
regr.fit(X_train, y_train)
predicciones = regr.predict(X_test)

## Prediccion enero 2025

In [126]:
# Nuevo dataframe con los datos de enero de 2025 y sus clientes
clientes = ["Cliente_1", "Cliente_2", "Cliente_3", "Cliente_4", "Cliente_5", "Cliente_6","Cliente_7", "Cliente_8", "Cliente_9",
            "Cliente_10", "Cliente_11", "Cliente_12","Cliente_13", "Cliente_14", "Cliente_15","Cliente_16", "Cliente_17", 
            "Cliente_18","Cliente_19","Cliente_20"]
df_prediccion = pd.DataFrame({"cliente":clientes})

In [127]:
# Defino las nuevas columnas del dataframe
df_prediccion["anio"]= 2025
df_prediccion["mes"]= 1
df_prediccion = pd.get_dummies(df_prediccion, columns = ['cliente'], drop_first=True, dtype=int)

In [128]:
# Hago la predicción
X_pred = df_prediccion  

predicciones_enero = regr.predict(X_pred)

df_prediccion["order_demand"] = predicciones_enero

df_prediccion['order_demand'] = df_prediccion['order_demand'].round(0)  



In [129]:
# Predicciones de enero de 2025
df_aux = df_prediccion.copy()

df_aux["cliente"] = clientes
df_aux[["cliente","anio","mes","order_demand"]]


Unnamed: 0,cliente,anio,mes,order_demand
0,Cliente_1,2025,1,732.0
1,Cliente_2,2025,1,726.0
2,Cliente_3,2025,1,732.0
3,Cliente_4,2025,1,757.0
4,Cliente_5,2025,1,732.0
5,Cliente_6,2025,1,732.0
6,Cliente_7,2025,1,713.0
7,Cliente_8,2025,1,732.0
8,Cliente_9,2025,1,732.0
9,Cliente_10,2025,1,732.0


In [130]:
# Para comparar con los resultados
df_historic_order_demand["order_demand"].describe()

count     932.000000
mean      700.636266
std       177.138557
min       555.000000
25%       601.000000
50%       641.000000
75%       768.000000
max      2776.000000
Name: order_demand, dtype: float64