In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd


In [2]:
train=pd.read_parquet('../Data/small_train_engineered.parquet')
val=pd.read_parquet('../Data/small_val_engineered.parquet')

In [3]:
train.dtypes

id2                                 category
id3                                 category
id4                           datetime64[ns]
id5                           datetime64[ns]
y                                       int8
                                   ...      
num_offer_categories                   int64
sub_category                        category
num_sub_categories                     int64
previous_offer_category             category
previous_suboffer_category          category
Length: 240, dtype: object

In [4]:

# --- YOUR HACKATHON METRIC FUNCTION ---
def map_at_k(y_true, y_pred_scores, group_ids, k=7):
    """
    Calculates the Mean Average Precision at k.
    """
    df = pd.DataFrame({'group': group_ids, 'y_true': y_true, 'score': y_pred_scores})
    
    average_precisions = []
    total_relevant_per_group = df[df['y_true'] == 1].groupby('group')['y_true'].count()

    for group_id, group_df in df.groupby('group'):
        total_relevant = total_relevant_per_group.get(group_id, 0)
        if total_relevant == 0:
            continue

        group_df = group_df.sort_values('score', ascending=False).head(k)
        
        hits = 0
        precision_sum = 0.0
        
        for i, row in enumerate(group_df.itertuples(index=False)):
            rank = i + 1
            if row.y_true == 1:
                hits += 1
                precision_at_k = hits / rank
                precision_sum += precision_at_k
        
        ap = precision_sum / total_relevant
        average_precisions.append(ap)

    return np.mean(average_precisions) if average_precisions else 0.0


In [5]:

# --- WRAPPER FUNCTION FOR LIGHTGBM ---
def lgb_map_at_k_factory(group_ids, k=7):
    """
    This factory creates the metric function LightGBM needs.
    """
    def lgb_map_at_k(y_true, y_pred):
        # Call your main metric function with all required parts
        score = map_at_k(
            y_true=y_true,
            y_pred_scores=y_pred,
            group_ids=group_ids,
            k=k
        )
        # The return format is (metric_name, value, is_higher_better)
        return 'map@k', score, True
        
    return lgb_map_at_k


In [6]:

# --- DATA PREPARATION (Assuming it's already done) ---
TARGET = 'y'
FEATURES = [col for col in train.columns if col not in [TARGET, 'id2', 'id3','id4','id5']]

X_train = train[FEATURES]
y_train = train[TARGET]
X_val = val[FEATURES]
y_val = val[TARGET]

In [7]:

# --- MODEL TRAINING ---
# Create the specific metric function for our validation set using the factory
eval_metric_function = lgb_map_at_k_factory(group_ids=val['id2'], k=7)

model = lgb.LGBMClassifier(
    objective='binary',
    n_estimators=2000,
    learning_rate=0.05,
    n_jobs=-1,
    random_state=42,
    is_unbalance=True
)


In [8]:

# Train the model using your exact metric for evaluation
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric=eval_metric_function,
    callbacks=[lgb.early_stopping(50, verbose=True)]
)

print("\nTraining complete!")
print(f"Best MAP@k on validation set: {model.best_score_['valid_0']['map@k']:.4f}")

[LightGBM] [Info] Number of positive: 8907, number of negative: 115471
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021862 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 39911
[LightGBM] [Info] Number of data points in the train set: 124378, number of used features: 233
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.071612 -> initscore=-2.562182
[LightGBM] [Info] Start training from score -2.562182


  total_relevant_per_group = df[df['y_true'] == 1].groupby('group')['y_true'].count()
  for group_id, group_df in df.groupby('group'):


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[14]	valid_0's binary_logloss: 0.128495	valid_0's map@k: 0.574902

Training complete!
Best MAP@k on validation set: 0.5749


# Trying with the new Dataset

In [10]:
train2=pd.read_parquet('../Data/small_train_feature_engineered_2.parquet',engine='pyarrow')
val2=pd.read_parquet('../Data/small_val_feature_engineered_2.parquet',engine='pyarrow')

In [11]:
# --- DATA PREPARATION (Assuming it's already done) ---
TARGET2 = 'y'
FEATURES2 = [col for col in train2.columns if col not in [TARGET2, 'id2', 'id3','id4','id5']]

X_train_2 = train2[FEATURES2]
y_train_2 = train2[TARGET2]
X_val_2 = val2[FEATURES2]
y_val_2 = val2[TARGET2]

In [12]:

# --- MODEL TRAINING ---
# Create the specific metric function for our validation set using the factory
eval_metric_function_2 = lgb_map_at_k_factory(group_ids=val2['id2'], k=7)

model2 = lgb.LGBMClassifier(
    objective='binary',
    n_estimators=2000,
    learning_rate=0.05,
    n_jobs=-1,
    random_state=42,
    is_unbalance=True
)


In [14]:

# Train the model using your exact metric for evaluation
model2.fit(
    X_train_2, y_train_2,
    eval_set=[(X_val_2, y_val_2)],
    eval_metric=eval_metric_function,
    callbacks=[lgb.early_stopping(50, verbose=True)]
)

print("\nTraining complete!")
print(f"Best MAP@k on validation set: {model2.best_score_['valid_0']['map@k']:.4f}")

[LightGBM] [Info] Number of positive: 8907, number of negative: 115471
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022468 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 40949
[LightGBM] [Info] Number of data points in the train set: 124378, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.071612 -> initscore=-2.562182
[LightGBM] [Info] Start training from score -2.562182


  total_relevant_per_group = df[df['y_true'] == 1].groupby('group')['y_true'].count()
  for group_id, group_df in df.groupby('group'):


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[267]	valid_0's binary_logloss: 0.0746879	valid_0's map@k: 0.626657

Training complete!
Best MAP@k on validation set: 0.6267
