In [3]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# ==============================
# 1. Load and prepare the data
# ==============================
# The frame is ordered by AccountId right after loading. Missing
# OrderClassification values are replaced by 0 and the column is cast to int.
df = (
    pd.read_csv("Data/DataProcessed.csv")
    .sort_values("AccountId")
    .assign(
        OrderClassification=lambda frame: frame["OrderClassification"].fillna(0).astype(int)
    )
)

In [4]:
# Convert the ProductLine and Size columns to the pandas "category" dtype.
for categorical_col in ("ProductLine", "Size"):
    df[categorical_col] = df[categorical_col].astype("category")

In [11]:
# Target: the OrderClassification column (the original note lists the
# values 0, 1 and 2).
y = df.loc[:, "OrderClassification"]

# Feature columns fed to the model, in the exact order they are passed.
#
# An earlier, now-disabled version of this list also used the *CurrentYear
# aggregates (TotalPriceCurrentYear, TotalQuantityCurrentYear,
# AverageTicketCurrentYear) and left Recency out; the *CurrentMonth
# aggregates (TotalPriceCurrentMonth, TotalQuantityCurrentMonth,
# AverageTicketCurrentMonth) were disabled there as well.
feature_cols = [
    # order line / product attributes
    'NetValue', 'Quantity', 'Recency', 'Weight', 'ProductLine', 'Size',
    # 3- and 6-month price and quantity totals
    'TotalPrice3Month', 'TotalPrice6Month',
    'TotalQuantity3Month', 'TotalQuantity6Month',
    # average ticket: overall, 3-month, 6-month
    'AverageTicket', 'AverageTicket3Month', 'AverageTicket6Month',
    # popularity, per-account totals and relative prices
    'ProductPopularity',
    'TotalPriceByAccount', 'TotalQuantityByAccount',
    'RelativePriceProduct', 'RelativePriceAccount',
]

X = df.loc[:, feature_cols]

In [12]:
# ==============================
# 2. Split by AccountId (no leakage between accounts)
# ==============================
# Accounts, not rows, are sampled: every row of a given account ends up on
# exactly one side of the split.
unique_accounts = df["AccountId"].unique()
train_ids, test_ids = train_test_split(unique_accounts, test_size=0.2, random_state=42)

# The group sizes passed to LightGBM below are consumed in row order, so the
# rows of one account must be contiguous. Sort each subset here (stable sort,
# so rows keep their relative order within an account) instead of relying on
# the ordering produced in an earlier cell.
train_df = df[df["AccountId"].isin(train_ids)].sort_values("AccountId", kind="stable")
test_df = df[df["AccountId"].isin(test_ids)].sort_values("AccountId", kind="stable")

X_train, y_train = train_df[feature_cols], train_df["OrderClassification"].astype(int)
X_test, y_test = test_df[feature_cols], test_df["OrderClassification"].astype(int)

# One group per account (each customer is a group of products).
# sort=False returns the groups in order of first appearance, i.e. in the
# same order as the rows of the frame.
train_groups = train_df.groupby("AccountId", sort=False).size().to_list()
test_groups = test_df.groupby("AccountId", sort=False).size().to_list()

# Fail early, with a readable message, if some rows do not belong to any
# group (groupby drops rows whose AccountId is missing).
if sum(train_groups) != len(train_df) or sum(test_groups) != len(test_df):
    raise ValueError(
        "Group sizes do not add up to the number of rows; "
        "check for missing AccountId values."
    )

train_data = lgb.Dataset(X_train, label=y_train, group=train_groups)
test_data = lgb.Dataset(X_test, label=y_test, group=test_groups, reference=train_data)

In [13]:
# ==============================
# 3. Tuned hyperparameters
# ==============================
# Options that are currently switched off:
#   min_data_in_leaf = 20       (minimum rows per leaf, to limit overfitting)
#   early_stopping_rounds = 2
params = dict(
    objective="lambdarank",
    metric="ndcg",
    ndcg_eval_at=[5, 10],  # evaluate at top-5 and top-10
    learning_rate=0.01,
    num_leaves=1024,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
)

In [None]:
# ==============================
# 4. Training
# ==============================
# Per-iteration metrics of both evaluation sets are stored here by the
# record_evaluation callback (keyed by the names given in valid_names), so
# the train/valid NDCG curves can be inspected after training.
evals_result = {}

# No early stopping is configured: all 2000 boosting rounds are run.
# The callbacks only report and record the evaluation metrics.
model = lgb.train(
    params,
    train_data,
    num_boost_round=2000,
    valid_sets=[train_data, test_data],
    valid_names=["train", "valid"],
    callbacks=[
        lgb.log_evaluation(period=100),       # print metrics every 100 rounds
        lgb.record_evaluation(evals_result),  # keep the full metric history
    ],
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004297 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3243
[LightGBM] [Info] Number of data points in the train set: 75741, number of used features: 18


In [None]:
# ==============================
# 5. Prediction and ranking for one specific customer
# ==============================
account_id = '001U400000QqZScIAN'

# Select the customer's rows once and reuse them for both the feature matrix
# and the product codes.
# NOTE(review): rows are matched on the "Id" column here, while the
# train/test split groups rows by "AccountId" - confirm both columns hold the
# same account key.
account_rows = df[df['Id'] == account_id]
if account_rows.empty:
    # Fail with an explicit message instead of passing an empty frame on.
    raise ValueError(f"No rows found for account {account_id!r}")

X_new = account_rows[feature_cols]
product_ids = account_rows['ProductCode'].to_list()

scores = model.predict(X_new)

# Highest predicted score first.
ranking = sorted(zip(product_ids, scores), key=lambda x: x[1], reverse=True)

print(f"\nRanking de produtos recomendados para Account {account_id}:")
for prod, score in ranking:
    print(f"Produto {prod} - score {score:.4f}")

In [None]:
# =========================================================
# 6. Full evaluation on the test set
# =========================================================
from sklearn.metrics import ndcg_score

# Score every row of the test set.
test_predictions = model.predict(X_test)

# Attach the scores with assign(): this rebinds test_df to a new frame instead
# of writing a column into a slice of df (which triggers
# SettingWithCopyWarning), and the cell can be re-run safely.
test_df = test_df.assign(predictions=test_predictions)

# Per-customer NDCG values.
ndcg_scores_k5 = []
ndcg_scores_k10 = []

# One pass over the customers via groupby instead of re-filtering the whole
# frame for every id. sort=False keeps the order of first appearance.
# NOTE(review): customers are identified by the "Id" column here, while the
# train/test split groups rows by "AccountId" - confirm both columns hold the
# same account key.
# NOTE(review): sklearn's ndcg_score appears to use the raw relevance values
# as gains, which may differ from the gain LightGBM's built-in "ndcg" metric
# applies - confirm before comparing these numbers with the training log.
for _, client_data in test_df.groupby('Id', sort=False):
    # True relevance grades and predicted scores, shaped (1, n_items).
    true_relevance = client_data['OrderClassification'].to_numpy().reshape(1, -1)
    predicted_scores = client_data['predictions'].to_numpy().reshape(1, -1)

    # A ranking needs at least two items; recent scikit-learn versions raise
    # ValueError when ndcg_score receives a single document.
    if true_relevance.shape[1] < 2:
        continue

    # Skip customers with only irrelevant items (NDCG is undefined).
    if np.sum(true_relevance) > 0:
        ndcg_scores_k5.append(ndcg_score(true_relevance, predicted_scores, k=5))
        ndcg_scores_k10.append(ndcg_score(true_relevance, predicted_scores, k=10))

# Final averages; NaN (without a NumPy warning) when no customer qualified.
mean_ndcg_k5 = np.mean(ndcg_scores_k5) if ndcg_scores_k5 else np.nan
mean_ndcg_k10 = np.mean(ndcg_scores_k10) if ndcg_scores_k10 else np.nan

print("\n========================================")
print("Avaliação Final no Conjunto de Teste")
print(f"NDCG médio @ 5: {mean_ndcg_k5:.4f}")
print(f"NDCG médio @ 10: {mean_ndcg_k10:.4f}")
print("========================================")

In [None]:
import matplotlib.pyplot as plt
import shap

# ==============================
# 1. Feature importance
# ==============================
# Top 15 features by gain; the title is set on the Axes returned by
# plot_importance.
importance_ax = lgb.plot_importance(model, importance_type="gain", max_num_features=15)
importance_ax.set_title("Importância das Features (LightGBM)")
plt.show()

# ==============================
# 2. SHAP explanation
# ==============================
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Summary bar chart: shows the features that most influence the ranking.
shap.summary_plot(shap_values, X_test, plot_type="bar")