# MDI - MDA - Feature importance

In [1]:
import pandas as pd
import json

test_data_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed_v2/test_optimized.parquet'
train_data_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed_v2/train_optimized.parquet'

# Cargar los datasets
test_df = pd.read_parquet(test_data_path)
train_df = pd.read_parquet(train_data_path)

## Tratamiento de outliers

In [6]:
import numpy as np

# 1. Identificar y tratar sesiones extremadamente largas
session_lengths_train = train_df.groupby('session_id').size()
session_lengths_test = test_df.groupby('session_id').size()

# Calcular percentiles para identificar umbrales
train_threshold = np.percentile(session_lengths_train, 99)
test_threshold = np.percentile(session_lengths_test, 99)

# Recortar sesiones largas (99 percentil)
train_df = train_df[train_df['session_id'].map(session_lengths_train) <= train_threshold]
test_df = test_df[test_df['session_id'].map(session_lengths_test) <= test_threshold]

# 2. Tratar valores extremos en pagetype
# Reemplazar valores extremos en pagetype por una categoría "Otros"
pagetype_threshold = 26  # Limitar a valores <= 26
train_df['pagetype'] = np.where(train_df['pagetype'] > pagetype_threshold, -1, train_df['pagetype'])
test_df['pagetype'] = np.where(test_df['pagetype'] > pagetype_threshold, -1, test_df['pagetype'])

# Verificar cambios
print("Distribución de pagetype después de recorte:")
print(train_df['pagetype'].value_counts())
print(test_df['pagetype'].value_counts())


Distribución de pagetype después de recorte:
pagetype
 24    38213622
 8       123994
 6       104779
 19       17934
 16       14732
 7         7759
 1         3411
 17        2018
-1         1077
 25         393
 20         280
 23         268
 10         225
 3          179
 5          151
 21         133
 12          61
 13          51
 26          42
 14          33
 9           28
 11          27
 22          12
 15          10
 2            5
 4            1
 18           1
Name: count, dtype: int64
pagetype
24    27683
8       838
6       535
16       73
7        67
19       65
17        6
1         5
25        2
26        1
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['pagetype'] = np.where(train_df['pagetype'] > pagetype_threshold, -1, train_df['pagetype'])


## Realizar análisis inicial de importancia de características - MDI - MDA

In [3]:
import lightgbm as lgb
print(lgb.__version__)

4.5.0.99


### Modelo base para calcular MDI - MDA - Feature importance

In [4]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Preparar los datos para el modelo
features = ['user_id', 'country', 'partnumber', 'device_type', 'pagetype', 'session_id']

# Convertir fechas a características numéricas
train_df['date_unix'] = train_df['date'].astype('int64') // 10**9
train_df['timestamp_unix'] = train_df['timestamp_local'].astype('int64') // 10**9
test_df['date_unix'] = test_df['date'].astype('int64') // 10**9
test_df['timestamp_unix'] = test_df['timestamp_local'].astype('int64') // 10**9

# Agregar las nuevas columnas al conjunto de características
features += ['date_unix', 'timestamp_unix']

X = train_df[features]
y = train_df['add_to_cart']

# División del dataset (train-test split)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Configuración de LightGBM para GPU
lgb_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'binary_logloss',
    'device': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    'n_estimators': 100,
    'random_state': 42
}

# Entrenar el modelo de LightGBM
lgb_model = lgb.LGBMClassifier(**lgb_params)
lgb_model.fit(X_train, y_train)

# Evaluar el modelo
y_pred = lgb_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy del modelo LightGBM: {accuracy:.2f}")
print("\nClassification Report:\n", classification_report(y_val, y_pred))

# Importancia de características (MDI)
mdi_importances = lgb_model.feature_importances_
print("\nImportancia de características (MDI):")
for feature, importance in zip(features, mdi_importances):
    print(f"MDI Importancia de {feature}: {importance:.4f}")

# Cálculo de MDA (Permutación)
from sklearn.inspection import permutation_importance

# Cálculo de importancia por permutación (MDA)
print("\nCalculando MDA...")
perm_importances = permutation_importance(
    lgb_model, X_val, y_val, scoring='accuracy', n_repeats=5, random_state=42
)

# Mostrar los resultados de MDA
print("\nImportancia de características (MDA):")
for feature, importance in zip(features, perm_importances.importances_mean):
    print(f"MDA Importancia de {feature}: {importance:.4f}")


[LightGBM] [Info] Number of positive: 1992816, number of negative: 28800164
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1058
[LightGBM] [Info] Number of data points in the train set: 30792980, number of used features: 8
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: pthread-13th Gen Intel(R) Core(TM) i5-13600KF, Vendor: GenuineIntel
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 5 dense feature groups (234.93 MB) transferred to GPU in 0.116246 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064717 -> initscore=-2.670832
[LightGBM] [Info] Start training from score -2.670832
Accuracy del modelo LightGBM: 0.94

Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97   7200763
           1  