In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tslearn.clustering import TimeSeriesKMeans
from tslearn.preprocessing import TimeSeriesScalerMinMax
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Reshape, Bidirectional, LSTM, Dense
import joblib
import warnings

# Suppress UserWarning messages raised from keras modules.
warnings.filterwarnings(action="ignore", category=UserWarning, module="keras")

# Step 1: load the tab-separated dataset into `df` (preprocessing follows in the next cells).
file_path = "C:/Users/Usuario/desktop/vero2/final_dataset_completo_con_ceros.csv"
df = pd.read_csv(file_path, sep="\t")

In [None]:
# Replace the 201908 value of every (product_id, customer_id) pair with the
# average of its 201907 and 201909 values.
df['periodo'] = df['periodo'].astype(str).str.strip()

key_cols = ['product_id', 'customer_id']
window_periods = ['201907', '201908', '201909']

df_filtered = df[df['periodo'].isin(window_periods)]
pivoted_sales = (
    df_filtered
    .pivot_table(index=key_cols, columns='periodo', values='tn')
    .reset_index()
    .reindex(columns=key_cols + window_periods)
)
# Row-wise mean of the two neighbouring months (pandas skips NaN by default).
pivoted_sales['201908'] = pivoted_sales[['201907', '201909']].mean(axis=1)
updated_sales = pivoted_sales.melt(
    id_vars=key_cols,
    value_vars=window_periods,
    var_name='periodo',
    value_name='tn',
)

# DataFrame.update aligns on the index, so both frames are keyed by
# (product_id, customer_id, periodo) before the values are written back.
full_key = key_cols + ['periodo']
df = df.set_index(full_key)
df.update(updated_sales.set_index(full_key))
df = df.reset_index()

# Label-encode the categorical columns in place; each fitted encoder is kept
# in `label_encoders` under its column name.
categorical_cols = ['cat1', 'cat2', 'cat3', 'brand', 'descripcion']
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col])

# Total tn per (periodo, cat1, cat2, cat3, brand, customer_id).
group_cols = ['cat1', 'cat2', 'cat3', 'brand', 'customer_id']
grouped_df = df.groupby(['periodo'] + group_cols)['tn'].sum().reset_index()

# Standardize tn on a copy so grouped_df keeps the original scale.
scaler = StandardScaler()
scaled_df = grouped_df.copy()
scaled_df['tn'] = scaler.fit_transform(scaled_df[['tn']])

# Keep the fitted scaler in a dict keyed by column name and persist it for later use.
scalers = {'tn': scaler}
joblib.dump(scalers, 'scalers.pkl')

# One row per group, one column per periodo: the matrix fed to the clustering.
# NOTE(review): fill_value=0 is applied to standardized values, so a missing
# (group, periodo) cell is filled with the overall mean of tn rather than with
# zero tonnes; confirm this is intended.
pivot_df = scaled_df.pivot_table(index=group_cols, columns='periodo', values='tn', fill_value=0)
pivot_df_values = pivot_df.to_numpy()

# Elbow diagram: fit a DTW k-means for every candidate k (6..14) and record its inertia.
K = range(6, 15)
distortions = []
for k in K:
    kmeans_model = TimeSeriesKMeans(n_clusters=k, metric="dtw", verbose=0, random_state=42)
    distortions.append(kmeans_model.fit(pivot_df_values).inertia_)

# Plot inertia against k.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(K, distortions, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('Distortion')
ax.set_title('The Elbow Method showing the optimal k')
plt.show()

In [None]:


# Fit the final clustering with the chosen number of clusters.
# NOTE(review): 15 lies outside the range(6, 15) evaluated by the elbow loop;
# confirm this is the intended value.
optimal_k = 15
kmeans_model = TimeSeriesKMeans(n_clusters=optimal_k, metric="dtw", verbose=0, random_state=42)
clusters = kmeans_model.fit_predict(pivot_df_values)

# `clusters` is aligned with the rows of pivot_df, so it can be attached directly.
pivot_df['cluster'] = clusters
cluster_labels = pivot_df['cluster'].reset_index()
cluster_key_cols = ['cat1', 'cat2', 'cat3', 'brand', 'customer_id']

# Attach the label to BOTH long-format frames: grouped_df (original scale) and
# scaled_df (standardized). The per-cluster training step groups scaled_df by
# 'cluster', so it needs the column too. Any 'cluster' column left from a
# previous run is dropped first so re-running this cell does not produce
# cluster_x / cluster_y duplicates.
grouped_df = pd.merge(
    grouped_df.drop(columns='cluster', errors='ignore'),
    cluster_labels,
    on=cluster_key_cols,
)
scaled_df = pd.merge(
    scaled_df.drop(columns='cluster', errors='ignore'),
    cluster_labels,
    on=cluster_key_cols,
)

# Step 2: share of each product within its (cat1, cat2, cat3, brand, customer_id)
# group, measured on December 2019 tn.
df['periodo'] = pd.to_datetime(df['periodo'], format='%Y%m')
df_diciembre_2019 = df[(df['periodo'].dt.year == 2019) & (df['periodo'].dt.month == 12)]

ratio_group_cols = ['cat1', 'cat2', 'cat3', 'brand', 'customer_id']
grouped_sales_2019 = df_diciembre_2019.groupby(ratio_group_cols + ['product_id'])['tn'].sum().reset_index()
group_totals_2019 = df_diciembre_2019.groupby(ratio_group_cols)['tn'].sum().reset_index()
ratios_2019 = pd.merge(grouped_sales_2019, group_totals_2019, on=ratio_group_cols, suffixes=('', '_total'))

# A group whose December total is 0 would give 0/0 = NaN (or +/-inf), and that
# value would flow into the saved predictions. Mask zero denominators and
# treat the share as 0 in that case.
nonzero_total = ratios_2019['tn_total'].where(ratios_2019['tn_total'] != 0)
ratios_2019['ratio'] = (ratios_2019['tn'] / nonzero_total).fillna(0.0)

# Keyed by (cat1, cat2, cat3, brand, customer_id, product_id).
ratio_dict = ratios_2019.set_index(ratio_group_cols + ['product_id'])['ratio'].to_dict()

# Step 4: build the time-series windows
def crear_secuencias(data, n_steps, step_ahead=1):
    """Slice the ``tn`` column of ``data`` into (window, target) pairs.

    Each sample takes ``n_steps`` consecutive rows as input; its target is the
    ``tn`` value located ``step_ahead`` rows after the last row of the window.
    Rows are used in the order in which they appear in ``data``.

    Args:
        data: DataFrame with a ``tn`` column.
        n_steps: number of rows in each input window.
        step_ahead: offset of the target past the end of the window
            (1 = the row right after the window).

    Returns:
        Tuple ``(X, y)`` of numpy arrays. With at least one sample, ``X`` has
        shape ``(n_samples, n_steps, 1)`` and ``y`` has shape ``(n_samples,)``.
        Both are empty when ``data`` is too short for a single sample.
    """
    window_values = data[['tn']].values
    target_values = data['tn'].values

    # The last usable start index is len(data) - n_steps - step_ahead (its
    # target is the final row), so there is one more sample than that index.
    n_samples = len(data) - n_steps - step_ahead + 1

    X, y = [], []
    for i in range(n_samples):
        X.append(window_values[i:i + n_steps])
        y.append(target_values[i + n_steps + step_ahead - 1])
    return np.array(X), np.array(y)

# Step 5: build and train the LSTM models
def build_lstm_model(input_shape):
    """Build and compile the Conv1D + bidirectional LSTM regressor.

    Args:
        input_shape: ``(n_steps, n_features)`` of a single input window.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, mean-squared-error
        loss) with a single-unit dense output.

    Raises:
        ValueError: if ``n_steps`` is too small for any time step to remain
            after the convolution and pooling layers.
    """
    n_steps = input_shape[0]
    # Conv1D with kernel_size=2 and the default 'valid' padding yields
    # n_steps - 1 steps; MaxPooling1D(pool_size=2) then floors that by half.
    pooled_steps = (n_steps - 1) // 2
    if pooled_steps < 1:
        raise ValueError(
            f"n_steps must be at least 3 for this architecture, got {n_steps}"
        )

    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=input_shape))
    model.add(MaxPooling1D(pool_size=2))
    # No Flatten/Reshape between pooling and the LSTM: the pooled output is
    # already (pooled_steps, 64). Reshaping to (n_steps // 2, 64) only matches
    # that size when n_steps is odd and fails with a shape error when it is even.
    model.add(Bidirectional(LSTM(50, activation='relu')))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

def process_data_in_batches(scaled_df, n_steps, step_ahead=1, epochs=100):
    """Train one model per cluster and forecast ``step_ahead`` rows ahead.

    For every value of the ``cluster`` column the rows are sorted by
    ``periodo``, turned into training windows with ``crear_secuencias``, and
    used to fit a fresh model from ``build_lstm_model``. The forecast is
    mapped back to the original scale with the module-level ``scalers['tn']``
    and clipped at 0.

    NOTE(review): rows are ordered by ``periodo`` only, so when a cluster
    holds several groups a window can mix rows from different groups;
    confirm this is intended.

    Args:
        scaled_df: DataFrame with ``cluster``, ``periodo`` and scaled ``tn`` columns.
        n_steps: number of rows per input window.
        step_ahead: how many rows past the end of the window the target lies.
        epochs: training epochs per cluster model (default 100).

    Returns:
        Tuple ``(models, predictions)``: ``models`` maps cluster label to the
        fitted model; ``predictions`` is a list of ``[cluster, value]`` pairs.
        Clusters with too few rows for a single training sample are skipped.

    Raises:
        KeyError: if ``scaled_df`` has no ``cluster`` column.
    """
    if 'cluster' not in scaled_df.columns:
        raise KeyError(
            "scaled_df must contain a 'cluster' column; merge the cluster labels into it first"
        )

    models = {}
    predictions = []

    for cluster, group_data in scaled_df.groupby('cluster'):
        group_data = group_data.sort_values(by='periodo')
        X, y = crear_secuencias(group_data, n_steps, step_ahead)

        if len(X) == 0 or len(y) == 0:
            continue

        model = build_lstm_model((X.shape[1], X.shape[2]))
        model.fit(X, y, epochs=epochs, verbose=0)
        models[cluster] = model

        # Forecast from the most recent n_steps rows, so the predicted value
        # lies step_ahead rows after the last observation (the same offset
        # used by the training pairs). Slicing [-(n_steps + step_ahead):-step_ahead]
        # would instead re-predict the last observed row, and is empty for
        # step_ahead == 0.
        X_pred = group_data[['tn']].values[-n_steps:].reshape((1, n_steps, 1))
        pred = model.predict(X_pred, verbose=0)

        # Back to the original tn scale; negative tonnage is clipped to 0.
        pred_original_scale = scalers['tn'].inverse_transform(pred.reshape(-1, 1)).flatten()
        pred_original_scale = np.clip(pred_original_scale, 0, None)
        predictions.append([cluster, pred_original_scale[0]])

    return models, predictions

# Train and forecast on the scaled, clustered data.
n_steps = 13
step_ahead = 2

# process_data_in_batches groups its input by 'cluster'. scaled_df was copied
# from grouped_df before clustering, so attach the labels here when the column
# is not present (the check keeps this cell safe to re-run).
if 'cluster' not in scaled_df.columns:
    scaled_df = scaled_df.merge(
        pivot_df['cluster'].reset_index(),
        on=['cat1', 'cat2', 'cat3', 'brand', 'customer_id'],
    )

models, predictions = process_data_in_batches(scaled_df, n_steps, step_ahead)

# One row per cluster that produced a forecast.
predictions_df = pd.DataFrame(predictions, columns=['cluster', 'prediction'])

# Spread each cluster-level prediction over its products using the December 2019 ratios.

# `clusters` is aligned with the rows of pivot_df, so this maps every
# (cat1, cat2, cat3, brand, customer_id) group to its cluster label once,
# instead of calling MultiIndex.get_loc for every (cluster, ratio) pair.
cluster_by_group = dict(zip(pivot_df.index, clusters))

# Bucket the (product_id, ratio) pairs by cluster in a single pass over ratio_dict.
ratios_by_cluster = {}
for (cat1, cat2, cat3, brand, customer_id, product_id), ratio in ratio_dict.items():
    group_cluster = cluster_by_group[(cat1, cat2, cat3, brand, customer_id)]
    ratios_by_cluster.setdefault(group_cluster, []).append((product_id, ratio))

# Same rows, in the same order, as a nested loop over predictions x ratios.
final_predictions = []
for cluster, prediction in zip(predictions_df['cluster'], predictions_df['prediction']):
    for product_id, ratio in ratios_by_cluster.get(cluster, []):
        final_predictions.append([product_id, float(prediction) * ratio])

final_predictions_df = pd.DataFrame(final_predictions, columns=['product_id', 'prediction'])

# Fallback: mean tn over the last 12 months for every product_id that received
# no model-based prediction.
# A strict '>' keeps exactly 12 monthly periods; '>=' would also include the
# period 12 months before the latest one (13 in total).
cutoff = df['periodo'].max() - pd.DateOffset(months=12)
last_12_months = df[df['periodo'] > cutoff]
average_tn_last_12_months = last_12_months.groupby('product_id')['tn'].mean().reset_index()

# product_ids present in the data but absent from final_predictions_df.
missing_product_ids = set(df['product_id']) - set(final_predictions_df['product_id'])

# Rename tn -> prediction so the fallback rows land in the same column as the
# model predictions when the two frames are concatenated.
average_predictions = (
    average_tn_last_12_months[average_tn_last_12_months['product_id'].isin(missing_product_ids)]
    .rename(columns={'tn': 'prediction'})
)

# Model predictions first, fallback averages after.
complete_predictions = pd.concat([final_predictions_df, average_predictions], ignore_index=True)

# Save the final predictions.
complete_predictions.to_csv("C:/Users/Usuario/desktop/vero2/predicciones_finales_DTWv2.csv", index=False)
