In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import DeepFM
import torch

# --- Cargar datos ---
df = pd.read_pickle('/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/train_data.pkl')

# --- Preprocesamiento ---
print("\n--- Preprocesando datos ---")
df['date'] = pd.to_datetime(df['date']).astype(int) / 10**9
df['timestamp_local'] = pd.to_datetime(df['timestamp_local']).astype(int) / 10**9
df['hour'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.hour
df['day_of_week'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.dayofweek

# Calcular columnas faltantes
df['session_length'] = df.groupby('session_id')['partnumber'].transform('count')
df['country_popularity'] = df.groupby('country')['partnumber'].transform('count') / df['session_length']
df['product_interaction_rate'] = df.groupby('partnumber')['session_id'].transform('nunique') / df['session_length']

# Escalado de características numéricas
dense_features = ['hour', 'day_of_week', 'session_length', 'country_popularity', 'product_interaction_rate']
df[dense_features] = MinMaxScaler().fit_transform(df[dense_features])

# Codificación de características categóricas
sparse_features = ['country', 'partnumber', 'device_type']
for feat in sparse_features:
    df[feat] = LabelEncoder().fit_transform(df[feat])

# Definir columnas de características
fixlen_feature_columns = [
    SparseFeat(feat, vocabulary_size=df[feat].nunique(), embedding_dim=8)
    for feat in sparse_features
] + [DenseFeat(feat, 1) for feat in dense_features]

feature_names = get_feature_names(fixlen_feature_columns)

# Preparar datos para DeepFM
train, test = train_test_split(df, test_size=0.2, random_state=42)
train_model_input = {name: train[name].values for name in feature_names}
test_model_input = {name: test[name].values for name in feature_names}
y_train = train['add_to_cart'].values
y_test = test['add_to_cart'].values

# --- Construir y entrenar el modelo ---
print("\n--- Construyendo modelo DeepFM ---")
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = DeepFM(
    linear_feature_columns=fixlen_feature_columns,
    dnn_feature_columns=fixlen_feature_columns,
    task='binary',
    device=device,
    l2_reg_embedding=1e-4,  # Regularización adicional
    l2_reg_linear=1e-3
)

# Configurar entrenamiento con menor tasa de aprendizaje
model.compile(
    optimizer=torch.optim.Adam(model.parameters(), lr=0.001),  # Tasa de aprendizaje ajustada
    loss="binary_crossentropy",
    metrics=["binary_crossentropy", "auc"]
)

# Entrenamiento
history = model.fit(
    train_model_input, y_train,
    batch_size=512, epochs=20, verbose=2, validation_split=0.2
)

# --- Evaluación ---
print("\n--- Evaluando el modelo ---")
loss, binary_crossentropy, auc = model.evaluate(test_model_input, y_test, batch_size=512)
print(f"Loss: {loss}, Binary Crossentropy: {binary_crossentropy}, AUC: {auc}")



2025-01-08 20:05:59.770055: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-08 20:05:59.925182: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736363159.978181    1283 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736363159.990486    1283 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-08 20:06:00.090345: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr


--- Preprocesando datos ---

--- Construyendo modelo DeepFM ---
cuda
Train on 29792924 samples, validate on 7448232 samples, 58190 steps per epoch
Epoch 1/20
503s - loss:  0.2145 - binary_crossentropy:  0.2121 - auc:  0.6828 - val_binary_crossentropy:  0.2105 - val_auc:  0.6904
Epoch 2/20
492s - loss:  0.2151 - binary_crossentropy:  0.2104 - auc:  0.6927 - val_binary_crossentropy:  0.2101 - val_auc:  0.6930
Epoch 3/20
489s - loss:  0.2162 - binary_crossentropy:  0.2099 - auc:  0.6953 - val_binary_crossentropy:  0.2098 - val_auc:  0.6938
Epoch 4/20
498s - loss:  0.2173 - binary_crossentropy:  0.2097 - auc:  0.6964 - val_binary_crossentropy:  0.2098 - val_auc:  0.6942
Epoch 5/20
492s - loss:  0.2181 - binary_crossentropy:  0.2096 - auc:  0.6969 - val_binary_crossentropy:  0.2097 - val_auc:  0.6945
Epoch 6/20
509s - loss:  0.2186 - binary_crossentropy:  0.2096 - auc:  0.6972 - val_binary_crossentropy:  0.2098 - val_auc:  0.6944
Epoch 7/20
510s - loss:  0.2191 - binary_crossentropy:  0.20

ValueError: not enough values to unpack (expected 3, got 2)

In [2]:
# Guardar el modelo DeepFM
model_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/models/deepfm_model.pth'
torch.save(model.state_dict(), model_path)
print(f"Modelo guardado en: {model_path}")


Modelo guardado en: /home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/models/deepfm_model.pth


In [None]:
# Cargar el modelo DeepFM
model.load_state_dict(torch.load(model_path))
model.to(device)  # Asegúrate de enviar el modelo al dispositivo adecuado (GPU o CPU)
model.eval()  # Configurar en modo de evaluación
print("Modelo cargado exitosamente.")


In [3]:
import json
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import torch

def generate_json_with_deepfm_strict(model, test_path, output_path):
    print("\n--- Cargando datos de prueba ---")
    test_df = pd.read_pickle(test_path)

    print("\n--- Preprocesando datos de prueba ---")
    # Preprocesar características temporales
    test_df['date'] = pd.to_datetime(test_df['date']).astype(int) / 10**9
    test_df['timestamp_local'] = pd.to_datetime(test_df['timestamp_local']).astype(int) / 10**9
    test_df['hour'] = pd.to_datetime(test_df['timestamp_local'], unit='s').dt.hour
    test_df['day_of_week'] = pd.to_datetime(test_df['timestamp_local'], unit='s').dt.dayofweek

    # Calcular características adicionales
    test_df['session_length'] = test_df.groupby('session_id')['partnumber'].transform('count')
    test_df['country_popularity'] = test_df.groupby('country')['partnumber'].transform('count') / test_df['session_length']
    test_df['product_interaction_rate'] = test_df.groupby('partnumber')['session_id'].transform('nunique') / test_df['session_length']

    # Definir características
    dense_features = ['hour', 'day_of_week', 'session_length', 'country_popularity', 'product_interaction_rate']
    sparse_features = ['country', 'partnumber', 'device_type']

    # Escalar características densas
    test_df[dense_features] = MinMaxScaler().fit_transform(test_df[dense_features])

    # Codificar características categóricas
    for feat in sparse_features:
        test_df[feat] = LabelEncoder().fit_transform(test_df[feat])

    feature_names = sparse_features + dense_features

    print("\n--- Preparando entradas del modelo ---")
    test_model_input = {name: test_df[name].values for name in feature_names}

    print("\n--- Generando predicciones para cada sesión ---")
    test_df['score'] = model.predict(test_model_input, batch_size=512)
    session_ids = test_df['session_id'].unique()
    predictions = {}

    for session_id in session_ids:
        session_data = test_df[test_df['session_id'] == session_id]

        if session_data.empty:
            predictions[str(session_id)] = []
            continue

        # Ordenar productos por puntuación y seleccionar los mejores 5
        recommended_products = (
            session_data.sort_values(by='score', ascending=False)['partnumber']
            .drop_duplicates()
            .tolist()
        )

        # Garantizar exactamente 5 valores únicos
        if len(recommended_products) < 5:
            popular_products = test_df['partnumber'].value_counts().index.tolist()
            for product in popular_products:
                if len(recommended_products) >= 5:
                    break
                if product not in recommended_products:
                    recommended_products.append(product)

        predictions[str(session_id)] = recommended_products[:5]

    print("\n--- Guardando predicciones en JSON ---")
    output = {"target": predictions}
    with open(output_path, 'w') as f:
        json.dump(output, f, indent=4)

    print(f"Archivo predictions_3.json generado exitosamente en {output_path}.")

# Ruta del modelo y datos de prueba
model_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/models/deepfm_model.pth'
test_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/test_data.pkl'
output_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/predictions/predictions_3_deepfm_v2.json'

# Cargar modelo y generar JSON
print("\n--- Cargando modelo DeepFM ---")
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.load_state_dict(torch.load(model_path))
model.to(device)
model.eval()

generate_json_with_deepfm_strict(model, test_path, output_path)



--- Cargando modelo DeepFM ---

--- Cargando datos de prueba ---

--- Preprocesando datos de prueba ---

--- Preparando entradas del modelo ---

--- Generando predicciones para cada sesión ---


  model.load_state_dict(torch.load(model_path))



--- Guardando predicciones en JSON ---
Archivo predictions_3.json generado exitosamente en /home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/predictions/predictions_3_deepfm_v2.json.
