In [42]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from lightgbm import LGBMRanker # Import LGBMRanker

In [43]:
train=pd.read_parquet('../Data/small_train_engineered.parquet')
val=pd.read_parquet('../Data/small_val_engineered.parquet')

In [46]:
# --- YOUR HACKATHON METRIC FUNCTION ---
def map_at_k(y_true, y_pred_scores, group_ids, k=7):
    """
    Calculates the Mean Average Precision at k.
    """
    df = pd.DataFrame({'group': group_ids, 'y_true': y_true, 'score': y_pred_scores})
    
    average_precisions = []
    total_relevant_per_group = df[df['y_true'] == 1].groupby('group')['y_true'].count()

    for group_id, group_df in df.groupby('group'):
        total_relevant = total_relevant_per_group.get(group_id, 0)
        if total_relevant == 0:
            continue

        group_df = group_df.sort_values('score', ascending=False).head(k)
        
        hits = 0
        precision_sum = 0.0
        
        for i, row in enumerate(group_df.itertuples(index=False)):
            rank = i + 1
            if row.y_true == 1:
                hits += 1
                precision_at_k = hits / rank
                precision_sum += precision_at_k
        
        ap = precision_sum / total_relevant
        average_precisions.append(ap)

    return np.mean(average_precisions) if average_precisions else 0.0


In [55]:

# --- WRAPPER FUNCTION FOR LIGHTGBM ---
def lgb_map_at_k_factory(group_ids, k=7):
    """
    This factory creates the metric function LightGBM needs.
    """
    def lgb_map_at_k(y_true, y_pred):
        # Call your main metric function with all required parts
        score = map_at_k(
            y_true=y_true,
            y_pred_scores=y_pred,
            group_ids=group_ids,
            k=k
        )
        # The return format is (metric_name, value, is_higher_better)
        return 'map@k', score, True
        
    return lgb_map_at_k


In [56]:
# --- DATA PREPARATION FOR RANKING ---
TARGET = 'y'
FEATURES = [col for col in train.columns if col not in [TARGET, 'id2', 'id3', 'id4','id5']]


In [57]:

# STEP 1: SORT THE DATA (CRITICAL!)
# The data must be sorted by the grouping key.
train = train.sort_values('id2').reset_index(drop=True)
val = val.sort_values('id2').reset_index(drop=True)


In [58]:

# STEP 2: CREATE THE GROUP ARRAYS
# This tells LightGBM how many items are in each user's list.
train_groups = train.groupby('id2').size().to_numpy()
val_groups = val.groupby('id2').size().to_numpy()


  train_groups = train.groupby('id2').size().to_numpy()
  val_groups = val.groupby('id2').size().to_numpy()


In [59]:

# Now, create your X and y sets from the sorted data
X_train = train[FEATURES]
y_train = train[TARGET].astype(int) # Ensure labels are integers

X_val = val[FEATURES]
y_val = val[TARGET].astype(int)


In [60]:


# --- MODEL TRAINING ---
# The metric factory is the same, but we will use the sorted 'val' user IDs
eval_metric_function = lgb_map_at_k_factory(group_ids=val['id2'], k=7)


In [61]:
model_ranker = LGBMRanker(
    objective='lambdarank',
    n_estimators=2000,
    learning_rate=0.05,
    n_jobs=-1,
    random_state=42
)

In [62]:
model_ranker.fit(
    X_train, y_train,
    group=train_groups,
    eval_set=[(X_val, y_val)],
    eval_group=[val_groups],
    eval_metric=eval_metric_function,
    callbacks=[lgb.early_stopping(50, verbose=True)]
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023296 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 39911
[LightGBM] [Info] Number of data points in the train set: 124378, number of used features: 233


  total_relevant_per_group = df[df['y_true'] == 1].groupby('group')['y_true'].count()
  for group_id, group_df in df.groupby('group'):


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[63]	valid_0's ndcg@1: 0.980524	valid_0's ndcg@2: 0.981389	valid_0's ndcg@3: 0.982054	valid_0's ndcg@4: 0.98281	valid_0's ndcg@5: 0.98319	valid_0's map@k: 0.487044


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,2000
,subsample_for_bin,200000
,objective,'lambdarank'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [63]:
print("\nRanking model training complete!")
print(f"Best MAP@k on validation set: {model_ranker.best_score_['valid_0']['map@k']:.4f}")


Ranking model training complete!
Best MAP@k on validation set: 0.4870
