# 2-step ranker

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
import gc


In [None]:
def make_map7_eval(id2_arr, id3_arr, y_arr):
    """
    Build a LightGBM-style custom evaluation closure that reports MAP@7.

    The returned callable has the ``feval(preds, dataset)`` shape and returns
    ``("MAP@7", value, True)``; the trailing ``True`` marks the metric as
    higher-is-better.

    Parameters
    ----------
    id2_arr : array-like
        Customer identifier for every row; rows are grouped on this value.
    id3_arr : array-like
        Offer identifier for every row; these are the items being ranked.
    y_arr : array-like
        Label for every row; rows whose label equals 1 are the relevant offers.

    Notes
    -----
    Labels come from ``y_arr``; the ``dataset`` argument of the closure is
    never read. The three arrays must therefore be in the same row order as
    the predictions handed to the closure.
    """
    # Freeze the inputs as plain arrays once so rows are matched by position;
    # pandas would otherwise align Series inputs on their index labels.
    customers = np.asarray(id2_arr)
    offers = np.asarray(id3_arr)
    labels = np.asarray(y_arr)

    def apk(actual, predicted, k=7):
        """Average precision at *k* of one ranked list against *actual*."""
        if not actual:
            return 0.0
        relevant = set(actual)
        seen = set()
        hits = 0.0
        score = 0.0
        for rank, item in enumerate(predicted[:k], start=1):
            # A repeated item is only credited the first time it appears.
            if item in relevant and item not in seen:
                hits += 1.0
                score += hits / rank
            seen.add(item)
        return score / min(len(actual), k)

    def map7_eval(preds, dataset):
        preds = np.asarray(preds)
        if len(preds) != len(labels):
            raise ValueError(
                f"map7_eval got {len(preds)} predictions but was built for "
                f"{len(labels)} rows"
            )

        df = pd.DataFrame({
            'customer': customers,
            'offer': offers,
            'y': labels,
            'p': preds
        })

        # One stable sort for the whole frame: groupby keeps the within-group
        # row order, and tied scores are ranked reproducibly.
        ranked = df.sort_values("p", ascending=False, kind="mergesort")

        scores = []
        for _, grp in ranked.groupby("customer"):
            actual = grp.loc[grp["y"] == 1, "offer"].tolist()
            predicted = grp["offer"].tolist()
            scores.append(apk(actual, predicted, 7))

        # No groups at all (empty input): report 0.0 instead of NaN.
        value = float(np.mean(scores)) if scores else 0.0
        return "MAP@7", value, True

    return map7_eval


In [None]:
# Locations of the train / test parquet files on the local machine
train_data_fp = r"C:\Users\siddu\Desktop\Decision Science Track\Revision\train_df_date.parquet"
test_data_fp = r"C:\Users\siddu\Desktop\Decision Science Track\Revision\test_df_date.parquet"

# Read both files into dataframes
train, test = (pd.read_parquet(fp) for fp in (train_data_fp, test_data_fp))

# Report (rows, columns) for train, then test
print(train.shape, test.shape, sep="\n")

In [None]:
#train[['id4','id12','id13']].head()

In [None]:
def _to_unix_seconds(values):
    """Return *values* as whole seconds since 1970-01-01 UTC.

    Numeric input is returned untouched, so re-running the cell does not
    re-interpret already-converted seconds as nanosecond timestamps.
    Unparseable entries become NaN (the result is then float64) instead of
    being cast to the int64 minimum that stands for NaT.
    """
    if pd.api.types.is_numeric_dtype(values):
        return values
    # utc=True treats naive timestamps as UTC and converts aware ones to UTC,
    # so the epoch offset does not depend on how the column was stored.
    stamps = pd.to_datetime(values, errors='coerce', utc=True).dt.tz_localize(None)
    # Subtracting the epoch avoids assuming nanosecond resolution.
    seconds = np.floor((stamps - pd.Timestamp('1970-01-01')).dt.total_seconds())
    return seconds if seconds.isna().any() else seconds.astype('int64')


for frame in (train, test):
    for col in ['id4', 'id12', 'id13']:
        if col in frame.columns:
            frame[col] = _to_unix_seconds(frame[col])


In [None]:
# Identifier columns, two excluded f-columns and the target are kept out of the model inputs
DROP_COLS = [
    'id1', 'id2', 'id3', 'id4', 'id5', 'id6', 'id7', 'id8',
    'id12', 'id13', 'id9', 'id10', 'id11',
    'f378', 'f374',
    'y',
]

all_cols = list(train.columns)

# Everything in train that is not explicitly dropped is a feature
FEATURES = [name for name in all_cols if name not in DROP_COLS]


In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupKFold

# --- 1. DEFINE YOUR FEATURES LIST ---
# Feature list = every train column that is not listed in DROP_COLS
FEATURES = [name for name in all_cols if name not in DROP_COLS]

# --- 2. CONVERT DATETIME COLUMNS ---
# Turn each date column into whole seconds since 1970-01-01 UTC.
datetime_cols = ['id4', 'id5', 'id12', 'id13']
unix_epoch = pd.Timestamp('1970-01-01')
for frame in (train, test):
    for col in datetime_cols:
        # Skip columns that are missing or already numeric: parsing integer
        # seconds again would read them as nanoseconds and collapse them to ~0.
        if col not in frame.columns or pd.api.types.is_numeric_dtype(frame[col]):
            continue
        # utc=True normalises naive and tz-aware input to the same UTC instant.
        stamps = pd.to_datetime(frame[col], errors='coerce', utc=True).dt.tz_localize(None)
        # Subtracting the epoch avoids assuming nanosecond resolution.
        seconds = np.floor((stamps - unix_epoch).dt.total_seconds())
        # Unparseable dates stay NaN (float column) rather than becoming the
        # int64 minimum that a direct integer cast of NaT produces.
        frame[col] = seconds if seconds.isna().any() else seconds.astype('int64')

# --- 3. CONVERT CATEGORICAL COLUMNS ---
# Cast these columns to pandas 'category' dtype in both frames so that
# LightGBM handles them as categorical rather than numeric inputs.
categorical_cols = [
    'f42', 'f48', 'f50', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57',
    'f354', 'id10', 'id8'
]
for frame in (train, test):
    for name in categorical_cols:
        # Columns absent from a frame are silently skipped
        if name in frame.columns:
            frame[name] = frame[name].astype('category')

# --- 4. PRE-TRAINING CHECK ---
# Every feature has to be either numeric or pandas-categorical before training.
numeric_cols = train[FEATURES].select_dtypes(include=np.number).columns
category_cols_in_features = [
    name for name in FEATURES
    if name in train.columns and train[name].dtype == 'category'
]

# Anything in FEATURES that is in neither list is reported as unhandled
processed_cols = [*numeric_cols, *category_cols_in_features]
unhandled_cols = [name for name in FEATURES if name not in processed_cols]

if not unhandled_cols:
    print("All feature columns are numeric or category. Proceeding to training.")

    # --- 5. STAGE-1 TRAINING LOOP (only runs when the dtype check passed) ---
    N_SPLITS = 5
    gkf = GroupKFold(n_splits=N_SPLITS)

    # Out-of-fold scores for every train row, plus one booster per fold
    oof_step1 = np.zeros(len(train))
    models_step1 = []

    # Keep only the features that are actually present in train
    final_features = [name for name in FEATURES if name in train.columns]
    print(f"Training on {len(final_features)} features.")

    # Folds are grouped on 'id2', so one id2 value never spans train and validation
    customer_folds = gkf.split(train, groups=train['id2'])
    for fold, (tr_idx, val_idx) in enumerate(customer_folds):
        print("Fold:", fold)

        fold_train = train.iloc[tr_idx]
        fold_valid = train.iloc[val_idx]

        # Columns with 'category' dtype are passed through as-is
        train_set = lgb.Dataset(fold_train[final_features], fold_train['y'])
        valid_set = lgb.Dataset(
            fold_valid[final_features], fold_valid['y'], reference=train_set
        )

        # Only the seed changes from fold to fold
        params = dict(
            objective='binary',
            learning_rate=0.04,
            num_leaves=63,
            min_data_in_leaf=30,
            feature_fraction=0.8,
            bagging_fraction=0.8,
            bagging_freq=5,
            seed=fold + 11,
            verbosity=-1,
        )

        booster = lgb.train(
            params,
            train_set,
            num_boost_round=2000,
            valid_sets=[valid_set],
            callbacks=[
                lgb.early_stopping(100),
                lgb.log_evaluation(50)
            ]
        )

        oof_step1[val_idx] = booster.predict(
            fold_valid[final_features], num_iteration=booster.best_iteration
        )
        models_step1.append(booster)

    # Train receives out-of-fold scores; test receives the mean over all fold models
    train['retrieval_pred'] = oof_step1
    test_fold_preds = [
        fold_model.predict(test[final_features]) for fold_model in models_step1
    ]
    test['retrieval_pred'] = np.mean(test_fold_preds, axis=0)

    print("\n--- Training Complete ---")
    print("OOF predictions added as 'retrieval_pred' to train.")
    print("Test predictions added as 'retrieval_pred' to test.")
else:
    print("Error: The following columns in FEATURES are still not numeric or category:")
    print(unhandled_cols)
    print("Please convert them or remove them from the FEATURES list.")

In [None]:
# Number of stage-1 candidates kept per customer for the reranker
K = 30


def _top_k_by_retrieval(frame, k):
    """Return the k rows with the highest 'retrieval_pred' for each 'id2'.

    Rows come back ordered by 'id2' ascending and score descending, with a
    fresh 0..n-1 index.
    """
    ordered = frame.sort_values(['id2', 'retrieval_pred'], ascending=[True, False])
    return ordered.groupby('id2').head(k).reset_index(drop=True)


cand_train = _top_k_by_retrieval(train, K)
cand_test = _top_k_by_retrieval(test, K)


In [None]:
# Reranker inputs: the stage-1 feature list plus the stage-1 score itself,
# restricted to columns that exist in both candidate frames.
shared_cols = set(cand_train.columns) & set(cand_test.columns)
features_rerank = [
    name for name in final_features + ['retrieval_pred']
    if name in shared_cols
]

print(f"Reranking using {len(features_rerank)} features.")

# Stage 2: LambdaRank reranker trained on the per-customer candidate lists.
models_rerank = []
oof_rerank = np.zeros(len(cand_train))

gkf = GroupKFold(n_splits=5)

# Hyper-parameters shared by every fold; the per-fold seed is added in the loop.
rank_params_base = {
    'objective': 'lambdarank',
    'metric': 'map',
    'eval_at': [7],  # report MAP@7 on the validation set
    'learning_rate': 0.03,
    'num_leaves': 63,
    'min_data_in_leaf': 20,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbosity': -1,
}

for fold, (tr_idx, val_idx) in enumerate(gkf.split(cand_train, groups=cand_train['id2'])):
    print("Rerank fold:", fold)

    tr = cand_train.iloc[tr_idx].reset_index(drop=True)
    va = cand_train.iloc[val_idx].reset_index(drop=True)

    # Query sizes must be listed in the same order as the rows. sort=False
    # yields groups in order of first appearance, so the sizes stay aligned
    # with the data even if the frame is not ordered by sorted 'id2'.
    # NOTE(review): this still assumes each customer's rows are contiguous.
    tr_groups = tr.groupby('id2', sort=False).size().to_numpy()
    va_groups = va.groupby('id2', sort=False).size().to_numpy()

    dtr = lgb.Dataset(tr[features_rerank], tr['y'], group=tr_groups)
    dva = lgb.Dataset(va[features_rerank], va['y'], reference=dtr, group=va_groups)

    params_r = {**rank_params_base, 'seed': fold + 42}

    m = lgb.train(
        params_r,
        dtr,
        valid_sets=[dva],
        num_boost_round=2000,
        callbacks=[
            lgb.early_stopping(100),
            lgb.log_evaluation(50)
        ]
    )

    # va keeps the row order of val_idx, so scores map back positionally
    oof_rerank[val_idx] = m.predict(va[features_rerank], num_iteration=m.best_iteration)
    models_rerank.append(m)

# Train candidates get out-of-fold scores; test candidates get the fold average,
# each model scored at its own early-stopped iteration like the OOF call above.
cand_train['rerank_pred'] = oof_rerank
cand_test['rerank_pred'] = np.mean(
    [
        m.predict(cand_test[features_rerank], num_iteration=m.best_iteration)
        for m in models_rerank
    ],
    axis=0
)

print("\n--- Rerank Training Complete ---")