# Importing

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from tqdm import tqdm

In [2]:
# 1. Load data
train = pd.read_parquet("data/train_data.parquet")
events = pd.read_parquet("data/add_event.parquet")
trans = pd.read_parquet("data/add_trans.parquet")
offers = pd.read_parquet("data/offer_metadata.parquet")
test = pd.read_parquet("data/test_data.parquet")
dict_df = pd.read_csv("data/data_dictionary.csv")

In [3]:
train.info(verbose=True,show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 770164 entries, 0 to 770163
Data columns (total 372 columns):
 #    Column  Non-Null Count   Dtype 
---   ------  --------------   ----- 
 0    id1     770164 non-null  object
 1    id2     770164 non-null  object
 2    id3     770164 non-null  object
 3    id4     770164 non-null  object
 4    id5     770164 non-null  object
 5    y       770164 non-null  object
 6    f1      278506 non-null  object
 7    f2      322972 non-null  object
 8    f3      108562 non-null  object
 9    f4      68869 non-null   object
 10   f5      538354 non-null  object
 11   f6      620055 non-null  object
 12   f7      402726 non-null  object
 13   f8      535649 non-null  object
 14   f9      485906 non-null  object
 15   f10     516499 non-null  object
 16   f11     413602 non-null  object
 17   f12     527792 non-null  object
 18   f13     696 non-null     object
 19   f14     696 non-null     object
 20   f15     696 non-null     object
 21   f16     

# Preprocess

## Deleting unnecessary cols

In [4]:
# 4. Drop features with >95% missing
missing_pct = train.isna().mean()
drop_cols = missing_pct[missing_pct > 0.95].index
train.drop(columns=drop_cols, inplace=True)
test.drop(columns=[c for c in drop_cols if c in test.columns], inplace=True)
train.shape

(770164, 340)

In [5]:
cols_to_drop = []
for col in tqdm(train.columns,desc="Dropping cols: " ):
    unique_vals = train[col].dropna().unique()
    nunique = train[col].nunique(dropna=True)
    if nunique == 1 or set(unique_vals).issubset({0.0}):
        cols_to_drop.append(col)

train.drop(columns=cols_to_drop, inplace=True)
test.drop(columns=cols_to_drop, inplace=True)

print(f"Dropped {len(cols_to_drop)} columns")
print(f"train shape: {train.shape}")

Dropping cols: 100%|██████████| 340/340 [00:28<00:00, 12.09it/s]


Dropped 47 columns
train shape: (770164, 293)


In [6]:
# 2. Standardize missing markers
for df in [train, test, events, trans, offers]:
    df.replace({-9999.0: pd.NA, None: pd.NA}, inplace=True)

## Converting datatypes 

In [7]:
# 1) Lists from the data dictionary
cat_cols = dict_df.loc[dict_df['Type'] == 'Categorical','masked_column'].tolist()
num_cols = dict_df.loc[dict_df['Type'] == 'Numerical','masked_column'].tolist()
ohe_cols = dict_df.loc[dict_df['Type'] == 'One hot encoded','masked_column'].tolist()

# 2) Keep only columns that really exist in *train*
cat_cols = [c for c in cat_cols if c in train.columns]
num_cols = [c for c in num_cols if c in train.columns]
ohe_cols = [c for c in ohe_cols if c in train.columns]
print(f"cat_cols : {len(cat_cols)} num_cols : {len(num_cols)} ohe_cols : {len(ohe_cols)}")

cat_cols : 12 num_cols : 229 ohe_cols : 48


In [8]:
num_cols.remove('id5')
num_cols.remove('id4')
cat_cols.remove('id3')

In [9]:
for df in [train, test]:
    df['id4'] = pd.to_datetime(df['id4'])
    df['id5'] = pd.to_datetime(df['id5'], errors='coerce').dt.date
offers['id12'] = pd.to_datetime(offers['id12'])
offers['id13'] = pd.to_datetime(offers['id13'])
events['id4'] = pd.to_datetime(events['id4'])
events['id7'] = pd.to_datetime(events['id7'])
trans['f370'] = pd.to_datetime(trans['f370'])

In [10]:
for col in ohe_cols:
    print(f"{col}'s unique values : {train[col].unique()}")

f227's unique values : ['0.0' '1.0' <NA>]
f228's unique values : ['0.0' '1.0' <NA>]
f230's unique values : ['0.0' '1.0' <NA>]
f231's unique values : ['1.0' '0.0' <NA>]
f232's unique values : ['0.0' '1.0' <NA>]
f233's unique values : ['0.0' '1.0' <NA>]
f234's unique values : ['0.0' '1.0' <NA>]
f235's unique values : ['0.0' '1.0' <NA>]
f237's unique values : ['0.0' '1.0' <NA>]
f239's unique values : ['0.0' '1.0' <NA>]
f241's unique values : ['0.0' '1.0' <NA>]
f242's unique values : ['0.0' '1.0' <NA>]
f244's unique values : ['0.0' '1.0' <NA>]
f247's unique values : ['0.0' '1.0' <NA>]
f250's unique values : ['0.0' '1.0' <NA>]
f251's unique values : ['0.0' <NA> '1.0']
f252's unique values : ['0.0' '1.0' <NA>]
f253's unique values : ['0.0' '1.0' <NA>]
f254's unique values : ['0.0' '1.0' <NA>]
f255's unique values : ['0.0' '1.0' <NA>]
f256's unique values : ['0.0' '1.0' <NA>]
f257's unique values : ['0.0' '1.0' <NA>]
f261's unique values : ['0.0' '1.0' <NA>]
f263's unique values : ['0.0' '1.0

In [11]:
for col in tqdm(cat_cols, desc="Casting categorical columns"):
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')

for col in tqdm(num_cols, desc="Casting numerical columns"):
    train[col] = pd.to_numeric(train[col], errors='coerce').astype('float32')
    test[col] = pd.to_numeric(test[col], errors='coerce').astype('float32')

for col in tqdm(ohe_cols, desc="Cleaning and casting one-hot encoded columns"):
    train[col] = pd.to_numeric(train[col], errors='coerce').astype('Int8')
    test[col] = pd.to_numeric(test[col], errors='coerce').astype('Int8')

train.info()

Casting categorical columns: 100%|██████████| 11/11 [00:00<00:00, 12.08it/s]
Casting numerical columns: 100%|██████████| 227/227 [01:07<00:00,  3.34it/s]
Cleaning and casting one-hot encoded columns: 100%|██████████| 48/48 [00:19<00:00,  2.49it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 770164 entries, 0 to 770163
Columns: 293 entries, id1 to f366
dtypes: Int8(48), category(11), datetime64[ns](1), float32(227), object(6)
memory usage: 786.6+ MB





## Feature Engineer

In [12]:
def feature_engineer(df, events, offers):
    ctr = events.assign(clicked=events['id7'].notna().astype(int))
    df = df.copy()

    # unify keys
    df['id3'] = df['id3'].astype(str)
    ctr['id3'] = ctr['id3'].astype(str)
    offers_loc = offers.copy()
    offers_loc['id3'] = offers_loc['id3'].astype(str)

    # 1) Existing CTR Features (Offer-level only)
    o_imp = ctr.groupby('id3').size().rename('imps')
    o_click = ctr[ctr.clicked == 1].groupby('id3').size().rename('clicks')
    offer_stats = (
        pd.concat([o_imp, o_click], axis=1)
          .fillna(0)
          .assign(ctr=lambda x: x.clicks / x.imps)
          .reset_index()
    )
    df = df.merge(offer_stats[['id3', 'ctr']], on='id3', how='left')

    stats = offer_stats.copy()
    a, b = 1, 1
    stats['ctr_smooth'] = (stats.clicks + a) / (stats.imps + a + b)
    df = df.merge(stats[['id3', 'ctr_smooth']], on='id3', how='left')

    # 2) Offer Metadata
    df = df.merge(
        offers_loc[['id3', 'f375', 'f376', 'id12', 'id13', 'id11', 'id10','id8']],
        on='id3', how='left'
    )

    # 3) Popularity in last 30 days (offer-level only)
    last_date = df['id4'].max()
    cutoff_pop = last_date - pd.Timedelta(days=30)
    recent_imps = events[events['id4'] >= cutoff_pop].astype({'id3': str})
    total_recent = len(recent_imps)
    pop30 = recent_imps.groupby('id3').size().rename('offer_imps_30d')
    df['offer_popularity_30d'] = (
        df['id3']
          .map(pop30.div(total_recent))
          .fillna(0)
    )

    # 4) Temporal & Sequence Features
    df['hour'] = df['id4'].dt.hour
    df['dow'] = df['id4'].dt.dayofweek
    df = df.sort_values(['id2', 'id4'])
    df['prev_time'] = df.groupby('id2')['id4'].shift()
    df['secs_prev'] = (
        df['id4'] - df['prev_time']
    ).dt.total_seconds().fillna(-1)
    df['days_to_exp'] = (
        df['id13'] - df['id4']
    ).dt.days.clip(lower=0)

    # 5) Frequency & Brand Indicator
    df['ind_match'] = 0
    df['brand_freq'] = df['id11'].map(
        df['id11'].value_counts(normalize=True)
    )
        # --- 6) Event-driven offer trends ---

    # Recent click rate (7-day window)
    cutoff_7d = last_date - pd.Timedelta(days=7)
    recent_events = events[events['id4'] >= cutoff_7d].copy()
    recent_events['id3'] = recent_events['id3'].astype(str)
    recent_events['clicked'] = recent_events['id7'].notna().astype(int)

    recent_clicks = recent_events.groupby('id3')['clicked'].agg(['sum', 'count']).rename(
        columns={'sum': 'recent_clicks', 'count': 'recent_impressions'}
    )
    recent_clicks['offer_recent_click_rate'] = recent_clicks['recent_clicks'] / recent_clicks['recent_impressions']
    df = df.merge(recent_clicks[['offer_recent_click_rate']], on='id3', how='left')

    # Median hour of clicks per offer
    events['click_hour'] = events['id7'].dt.hour
    click_events = events[events['id7'].notna()].copy()

    # Total impressions & clicks per offer
    click_count = click_events['id3'].value_counts().rename('offer_click_count')
    imp_count = events['id3'].value_counts().rename('offer_impression_count')
    df = df.merge(imp_count, left_on='id3', right_index=True, how='left')
    df = df.merge(click_count, left_on='id3', right_index=True, how='left')

    # CTR by offer + day of week
    events['dow'] = events['id4'].dt.dayofweek
    offer_dow_ctr = (
        events.assign(clicked=events['id7'].notna().astype(int))
              .groupby(['id3', 'dow'])['clicked']
              .agg(['sum', 'count'])
              .reset_index()
    )
    df = df.merge(
        offer_dow_ctr[['id3', 'dow']],
        on=['id3', 'dow'], how='left'
    )
    return df

In [13]:
train = feature_engineer(train, events, offers)
test = feature_engineer(test, events, offers)

In [14]:
cols_to_drop = ['id11']
int_cols = ['id2', 'id3', 'y']
datetime_cols = ['id5']
float_cols = ['f218']

for df in [train, test]:
    df.drop(columns=[col for col in cols_to_drop if col in df.columns], inplace=True)

for col in int_cols:
    if col in train.columns:
        train[col] = pd.to_numeric(train[col], errors='coerce').astype('Int32')
    if col in test.columns:
        test[col] = pd.to_numeric(test[col], errors='coerce').astype('Int32')

for col in datetime_cols:
    if col in train.columns:
        train[col] = pd.to_datetime(train[col], errors='coerce')
    if col in test.columns:
        test[col] = pd.to_datetime(test[col], errors='coerce')

for col in float_cols:
    if col in train.columns:
        train[col] = pd.to_numeric(train[col], errors='coerce').astype('float32')
    if col in test.columns:
        test[col] = pd.to_numeric(test[col], errors='coerce').astype('float32')


In [15]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 770164 entries, 0 to 770163
Columns: 312 entries, id1 to offer_click_count
dtypes: Int32(3), Int8(48), category(11), datetime64[ns](5), float32(228), float64(10), int32(2), int64(2), object(3)
memory usage: 882.9+ MB


## PreProcessing

### PREPREprocess

In [16]:
freq = train['id8'].value_counts(normalize=True)
train['id8_freq'] = train['id8'].map(freq)
test ['id8_freq'] = test ['id8'].map(freq).fillna(0)  # unseen codes → 0

# 3) (Optionally) drop the raw id8 if you don’t plan to one‐hot it
train.drop(columns=['id8'], inplace=True)
test.drop(columns=['id8'], inplace=True)

In [17]:
# 1. Define features
num_features = [
    'ctr', 'ctr_smooth', 'secs_prev', 'days_to_exp',
    'brand_freq', 'offer_popularity_30d',
    'offer_recent_click_rate',
    'offer_impression_count', 'offer_click_count', 'id8_freq'
]
cat_ord = ['id10', 'f42', 'f48', 'f53', 'f349']
cat_ohe = ['f375', 'f376', 'hour', 'dow',
            'f50', 'f52', 'f54', 'f55', 'f56', 'f57', 'f354']
num_features += num_cols

### Test 

In [18]:
X = train.copy()
X_test = test.copy()

# --- Handle categorical columns ---
for col in cat_ohe + cat_ord:
    X[col] = X[col].astype(str)
    X_test[col] = X_test[col].astype(str)

# --- Impute and scale numerical features ---
for col in num_features:
    median = X[col].median()
    X[col] = X[col].fillna(median)
    X_test[col] = X_test[col].fillna(median)

    mean = X[col].mean()
    std = X[col].std()
    if std == 0: std = 1  # avoid division by zero

    X[col] = (X[col] - mean) / std
    X_test[col] = (X_test[col] - mean) / std

# --- One-hot encode categorical features ---
X_ohe = pd.get_dummies(X[cat_ohe], dummy_na=False)
X_test_ohe = pd.get_dummies(X_test[cat_ohe], dummy_na=False)

# Align columns to ensure consistency between train and test
X_ohe, X_test_ohe = X_ohe.align(X_test_ohe, join='outer', axis=1, fill_value=0)

# --- Ordinal encode categorical features ---
from sklearn.preprocessing import OrdinalEncoder

ord_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)
X_ord = pd.DataFrame(
    ord_encoder.fit_transform(X[cat_ord]),
    columns=cat_ord,
    index=X.index
)
X_test_ord = pd.DataFrame(
    ord_encoder.transform(X_test[cat_ord]),
    columns=cat_ord,
    index=X_test.index
)

# --- Already one-hot-encoded columns: Impute NaNs only ---
for col in ohe_cols:
    X[col] = X[col].fillna(0)
    X_test[col] = X_test[col].fillna(0)

# --- Final concatenation ---
X = pd.concat([X[num_features], X_ohe, X_ord, X[ohe_cols]], axis=1)
X_test = pd.concat([X_test[num_features], X_test_ohe, X_test_ord, X_test[ohe_cols]], axis=1)


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


### Test end

In [None]:
# 3. Pipelines
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
ohe_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=np.nan)),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # return array
])
ord_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=np.nan)),
    ('ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan))
])
alreadyohe_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=np.nan))
])

# 4. Preprocessor
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('ohe', ohe_pipeline, cat_ohe),
    ('ord', ord_pipeline, cat_ord),
    ('pass', alreadyohe_pipeline, ohe_cols)
], remainder='drop')

# 5. Collect all used features (no duplicates)
all_features = []
for col in num_features + cat_ohe + cat_ord + ohe_cols:
    if col not in all_features:
        all_features.append(col)

In [None]:
# 6. Fit-transform and transform
with tqdm(total=2, desc="Fitting and transforming datasets") as pbar:
    X = preprocessor.fit_transform(train[all_features])
    pbar.update(1)
    X_test = preprocessor.transform(test[all_features])
    pbar.update(1)
y = train['y'].values

Fitting and transforming datasets: 100%|██████████| 2/2 [00:49<00:00, 24.64s/it]


## Continue

In [19]:
y = train['y']

# Model

## Prep 1

In [20]:
# 8. Split and group
split_cut = train['id4'].quantile(0.8)
mask_val = train['id4'] > split_cut
mask_val_np = mask_val.to_numpy()  # Convert to NumPy array for sparse matrix indexing

X_trn, y_trn = X[~mask_val_np], y[~mask_val_np]
X_val, y_val = X[mask_val_np], y[mask_val_np]

id2_trn = train.loc[~mask_val, 'id2'].astype(str)
id2_val = train.loc[mask_val, 'id2'].astype(str)
# 1. Sort everything by group ID
trn_sort_idx = id2_trn.argsort()
X_trn = X_trn.iloc[trn_sort_idx]
y_trn = y_trn.iloc[trn_sort_idx]
id2_trn = id2_trn.iloc[trn_sort_idx]

val_sort_idx = id2_val.argsort()
X_val = X_val.iloc[val_sort_idx]
y_val = y_val.iloc[val_sort_idx]
id2_val = id2_val.iloc[val_sort_idx]

# 2. Now recompute group sizes on sorted id2
group_trn = id2_trn.groupby(id2_trn, sort=False).size().values
group_val = id2_val.groupby(id2_val, sort=False).size().values

## Prep 2

In [28]:
# Group by id2 and assign whole groups randomly to train/val
unique_ids = train['id2'].unique()
np.random.seed(42)
val_ids = set(np.random.choice(unique_ids, size=int(0.2 * len(unique_ids)), replace=False))
mask_val = train['id2'].isin(val_ids)
mask_val_np = mask_val.to_numpy()

X_trn, y_trn = X[~mask_val_np], y[~mask_val_np]
X_val, y_val = X[mask_val_np], y[mask_val_np]

id2_trn = train.loc[~mask_val, 'id2'].astype(str)
id2_val = train.loc[mask_val, 'id2'].astype(str)

# Filter out id2s with all-zero labels
valid_ids_trn = id2_trn[y_trn > 0].unique()
valid_ids_val = id2_val[y_val > 0].unique()

mask_trn = id2_trn.isin(valid_ids_trn).to_numpy()
mask_val = id2_val.isin(valid_ids_val).to_numpy()

X_trn, y_trn = X_trn[mask_trn], y_trn[mask_trn]
X_val, y_val = X_val[mask_val], y_val[mask_val]
id2_trn = id2_trn.iloc[mask_trn]
id2_val = id2_val.iloc[mask_val]

trn_sort_idx = id2_trn.argsort()
X_trn = X_trn[trn_sort_idx]
y_trn = y_trn[trn_sort_idx]
id2_trn = id2_trn.iloc[trn_sort_idx]

val_sort_idx = id2_val.argsort()
X_val = X_val[val_sort_idx]
y_val = y_val[val_sort_idx]
id2_val = id2_val.iloc[val_sort_idx]

group_trn = id2_trn.groupby(id2_trn, sort=False).size().values
group_val = id2_val.groupby(id2_val, sort=False).size().values


## Training

In [27]:
import lightgbm as lgb
from lightgbm import Dataset, early_stopping, log_evaluation
train_lgb = lgb.train 
# 1. Construct LightGBM Datasets
train_set = Dataset(X_trn, label=y_trn, group=group_trn,categorical_feature=cat_ord)
val_set   = Dataset(X_val, label=y_val, group=group_val,categorical_feature=cat_ord)

# 2. Improved Parameters for MAP@7 ranking
params = {
    'objective': 'lambdarank',
    'metric': ['map'],
    'eval_at': [7],
    'ndcg_eval_at': [7],
    'max_position': 7,

    'learning_rate': 0.01,
    'num_leaves': 63,
    'min_data_in_leaf': 50,
    'min_gain_to_split': 0.05,

    'feature_fraction': 0.7,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,

    'boosting_type': 'gbdt',
    'verbosity': 1,
    'device_type' : 'gpu',
    'gpu_platform_id': 1,  # <-- set your platform id here
    'gpu_device_id':   0 ,
}

# 3. Train with callbacks for early stopping & logging
model = train_lgb(
    params,
    train_set,
    num_boost_round=2000,                   # allow enough rounds
    valid_sets=[train_set, val_set],
    valid_names=['train', 'valid'],
    callbacks=[
        early_stopping(stopping_rounds=100),
        log_evaluation(period=50)
    ]
)

[LightGBM] [Info] Total groups: 38065, total data: 616131
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 48735
[LightGBM] [Info] Number of data points in the train set: 616131, number of used features: 373
[LightGBM] [Info] Using requested OpenCL platform 1 device 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3070 Ti Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 184 dense feature groups (108.12 MB) transferred to GPU in 0.100019 secs. 1 sparse feature groups
[LightGBM] [Info] Total groups: 11010, total data: 154033
Training until validation scores don't improve for 100 rounds
[50]	train's map@7: 0.960761	valid's map@7: 0.960687
[100]	train's map@7: 0.962433	valid's map@7: 0.960914
[150]	train's map@7: 0.963923	valid's map@7: 0.961188
[200]	train's map@7: 0.965347	valid's map@7: 0.96149


In [29]:
from lightgbm import early_stopping, log_evaluation

# 9. LightGBM ranking (MAP@7) with callbacks for early stopping & logging
train_set = lgb.Dataset(X_trn, label=y_trn, group=group_trn)
val_set   = lgb.Dataset(X_val, label=y_val, group=group_val)

params = {
    'objective':       'lambdarank',
    'metric':          'map',
    'eval_at':         [7],
    'learning_rate':   0.01,
    'num_leaves':      32,
    'feature_fraction':0.8,
    'bagging_fraction':0.8,
    # 'lambda_l1':       1.0,
    'bagging_freq':     5,
    'verbosity':       1,
    'device_type' : 'gpu',
    'gpu_platform_id': 1,  # <-- set your platform id here
    'gpu_device_id':   0 ,
}

model = train_lgb(
    params,
    train_set,
    num_boost_round=1000,
    valid_sets=[train_set, val_set],
    valid_names=['train', 'valid'],
    callbacks=[
        early_stopping(stopping_rounds=50),
        log_evaluation(period=100)
    ]
)

[LightGBM] [Info] Total groups: 38065, total data: 616131
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 48732
[LightGBM] [Info] Number of data points in the train set: 616131, number of used features: 373
[LightGBM] [Info] Using requested OpenCL platform 1 device 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3070 Ti Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 184 dense feature groups (108.12 MB) transferred to GPU in 0.101346 secs. 1 sparse feature groups
[LightGBM] [Info] Total groups: 11010, total data: 154033
Training until validation scores don't improve for 50 rounds
[100]	train's map@7: 0.958791	valid's map@7: 0.960113
[200]	train's map@7: 0.960329	valid's map@7: 0.960665
[300]	train's map@7: 0.961998	valid's map@7: 0.961035
[400]	train's map@7: 0.963595	valid's map@7: 0.961384

In [30]:
def map_at_k(ids, y_true, y_pred, k=7):
    dfm = pd.DataFrame({
        'id2': ids,
        'y': y_true,
        'p': y_pred
    })
    # ensure y is numeric
    dfm['y'] = dfm['y'].astype(int)

    aps = []
    for _, g in dfm.groupby('id2'):
        top = g.sort_values('p', ascending=False).head(k)
        rel = top['y'].values
        if rel.sum() == 0:
            continue
        hits = 0
        score = 0.0
        for i, r in enumerate(rel, 1):
            if r == 1:
                hits += 1
                score += hits / i
        aps.append(score / min(rel.sum(), k))
    return np.mean(aps)

# Then:
val_preds = model.predict(X_val, num_iteration=model.best_iteration)
validation_map7 = map_at_k(id2_val, y_val, val_preds)
print("Validation MAP@7:", validation_map7)

Validation MAP@7: 0.7587825740532433


# Ensemble Model

In [None]:
# Assuming `prev_lgb_model` is your previously trained standalone LightGBM model
import pandas as pd

# Get feature importances
importances = model.feature_importance(importance_type='gain')
feature_names = model.feature_name()

# Create DataFrame of feature importances
feat_imp_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

# Extract top 50 features
top50_features = feat_imp_df['feature'].head(50).tolist()


In [32]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostRanker
import lightgbm as lgb
from lightgbm import Dataset, early_stopping, log_evaluation

In [23]:
cat_ohe_used = [col for col in cat_ohe if col in X_trn.columns]
ohe_cols_used = [col for col in ohe_cols if col in X_trn.columns]
cat_ord_used = [col for col in cat_ord if col in X_trn.columns]
num_features_used = [col for col in num_features if col in X_trn.columns]

In [24]:

# One-Hot Pipeline for XGBoost
xgb_pipeline = Pipeline([
    ('model', XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        tree_method='gpu_hist',
        predictor='gpu_predictor',
        use_label_encoder=False,
        random_state=42
    ))
])

# CatBoost uses raw categorical features
cat_pipeline = Pipeline([
    ('model', CatBoostClassifier(
        cat_features=cat_ord,
        iterations=500,
        learning_rate=0.05,
        depth=6,
        verbose=0,
        task_type='GPU',
        random_state=42,
        loss_function='Logloss'
    ))
])

# Logistic Regression on scaled numeric features
logreg_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),  # handles NaNs
    ('model', LogisticRegression(
        penalty='l2',
        solver='saga',
        class_weight='balanced',
        max_iter=1000,
        tol=1e-3,
        random_state=42
    ))
])


In [25]:
for col in cat_ord_used:
    X_trn[col] = X_trn[col].astype(str)
    X_val[col] = X_val[col].astype(str)
    X_test[col] = X_test[col].astype(str)

In [None]:
from lightgbm import LGBMRanker

# Train base models
base_models = [
    ("XGBoost", xgb_pipeline, X_trn[cat_ohe_used+ohe_cols_used], y_trn),
    ("CatBoost", cat_pipeline, X_trn[cat_ord_used], y_trn),
    ("LogReg", logreg_pipeline, X_trn[num_features_used], y_trn)
]
for name, pipe, X, y_ in tqdm(base_models, desc="Fitting base models"):
    y_ = pd.to_numeric(y_) 
    print(f"Fitting {name}...")
    pipe.fit(X, y_)

# Generate predictions from base models using correct slices
Z_trn = np.column_stack([
    xgb_pipeline.predict_proba(X_trn[cat_ohe_used+ohe_cols_used])[:, 1],
    cat_pipeline.predict_proba(X_trn[cat_ord_used])[:, 1],
    logreg_pipeline.predict_proba(X_trn[num_features_used])[:, 1],
])

Z_val = np.column_stack([
    xgb_pipeline.predict_proba(X_val[cat_ohe_used+ohe_cols_used])[:, 1],
    cat_pipeline.predict_proba(X_val[cat_ord_used])[:, 1],
    logreg_pipeline.predict_proba(X_val[num_features_used])[:, 1],
])

Fitting base models:   0%|          | 0/3 [00:00<?, ?it/s]

Fitting XGBoost...



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Fitting base models:  33%|███▎      | 1/3 [00:01<00:03,  1.58s/it]

Fitting CatBoost...


Fitting base models:  67%|██████▋   | 2/3 [00:20<00:11, 11.92s/it]

Fitting LogReg...


Fitting base models: 100%|██████████| 3/3 [14:37<00:00, 292.47s/it]

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


In [36]:
group_trn = id2_trn.groupby(id2_trn, sort=False).size().values
group_val = id2_val.groupby(id2_val, sort=False).size().values

train_set = lgb.Dataset(Z_trn, label=y_trn, group=group_trn)
val_set = lgb.Dataset(Z_val, label=y_val, group=group_val)
print("Training meta model")
params = {
    'objective': 'lambdarank',
    'metric': 'map',
    'eval_at': [7],
    'learning_rate': 0.01,
    'num_leaves': 32,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbosity': 1,
    'device_type': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0
}

model = lgb.train(
    params,
    train_set,
    num_boost_round=1000,
    valid_sets=[train_set, val_set],
    valid_names=['train', 'valid'],
    callbacks=[
        early_stopping(stopping_rounds=50),
        log_evaluation(period=100)
    ]
)

Training meta model
[LightGBM] [Info] Total groups: 38065, total data: 616131
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 527
[LightGBM] [Info] Number of data points in the train set: 616131, number of used features: 3
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3070 Ti Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 3 dense feature groups (2.35 MB) transferred to GPU in 0.005233 secs. 0 sparse feature groups
[LightGBM] [Info] Total groups: 11010, total data: 154033
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 3 dense feature groups (1.88 MB) transferred to GPU in 0.002776 secs. 0 sparse feature groups
Training until validation scores don't improve for 50 rounds
[LightGBM] [Info] Size of histogram bin en

In [None]:
# Extract top 50 features for stacking
X_meta_train = X_trn[top50_features].copy()
X_meta_val   = X_val[top50_features].copy()

In [None]:
# Generate predictions from base models using correct slices
Z_trn = np.column_stack([
    xgb_pipeline.predict_proba(X_trn[cat_ohe_used+ohe_cols_used])[:, 1],
    cat_pipeline.predict_proba(X_trn[cat_ord_used])[:, 1],
    logreg_pipeline.predict_proba(X_trn[num_features_used])[:, 1],
])

Z_val = np.column_stack([
    xgb_pipeline.predict_proba(X_val[cat_ohe_used+ohe_cols_used])[:, 1],
    cat_pipeline.predict_proba(X_val[cat_ord_used])[:, 1],
    logreg_pipeline.predict_proba(X_val[num_features_used])[:, 1],
])

group_column = 'id2'
group_trn = X_trn[group_column].value_counts(sort=False).sort_index().values
group_val = X_val[group_column].value_counts(sort=False).sort_index().values

X_meta_train = np.hstack([X_meta_train.values, Z_trn])
X_meta_val = np.hstack([X_meta_val.values, Z_val])

train_set = lgb.Dataset(X_meta_train, label=y_trn, group=group_trn)
val_set = lgb.Dataset(X_meta_val, label=y_val, group=group_val)
print("Training meta model")
params = {
    'objective': 'lambdarank',
    'metric': 'map',
    'eval_at': [7],
    'learning_rate': 0.01,
    'num_leaves': 16,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbosity': 1,
    'device_type': 'gpu',
    'gpu_platform_id': 1,
    'gpu_device_id': 0
}

meta_model = lgb.train(
    params,
    train_set,
    num_boost_round=1000,
    valid_sets=[train_set, val_set],
    valid_names=['train', 'valid'],
    callbacks=[
        early_stopping(stopping_rounds=50),
        log_evaluation(period=100)
    ]
)

In [40]:
def map_at_k(ids, y_true, y_pred, k=7):
    dfm = pd.DataFrame({
        'id2': ids,
        'y': y_true,
        'p': y_pred
    })
    # ensure y is numeric
    dfm['y'] = dfm['y'].astype(int)

    aps = []
    for _, g in dfm.groupby('id2'):
        top = g.sort_values('p', ascending=False).head(k)
        rel = top['y'].values
        if rel.sum() == 0:
            continue
        hits = 0
        score = 0.0
        for i, r in enumerate(rel, 1):
            if r == 1:
                hits += 1
                score += hits / i
        aps.append(score / min(rel.sum(), k))
    return np.mean(aps)

In [45]:
train_preds = model.predict(Z_trn, num_iteration=model.best_iteration)
train_map7 = map_at_k(id2_trn, y_trn, train_preds)
print("Train MAP@7 (meta-model):", train_map7)

Train MAP@7 (meta-model): 0.6759015868142365


In [30]:
from scipy.special import expit
import pandas as pd

# 1) Get raw scores
raw_preds = model.predict(X_test, num_iteration=model.best_iteration)

# 2) Convert to (0,1) via sigmoid
prob_preds = expit(raw_preds)

# 3) Sanity‑check in memory
print("raw_preds[:5]       :", raw_preds[:5])
print("prob_preds[:5]      :", prob_preds[:5])
print("raw_preds min/max   :", raw_preds.min(), raw_preds.max())
print("prob_preds min/max  :", prob_preds.min(), prob_preds.max())

sub = pd.DataFrame({
    'id1':  test['id1'],
    'id2':  test['id2'],
    'id3':  test['id3'],
    'id5':  test['id5'],
    'pred': prob_preds      # <<-- make sure this is prob_preds
})

# 5) Save to CSV
csv_path = 'r2_submission_file_sigmoid_new.csv'
sub.to_csv(csv_path, index=False)
print(f"Saved sigmoid submission to {csv_path!r}")

# 6) Reload and verify on‑disk values
df_check = pd.read_csv(csv_path)
df_check.head()
print("Reloaded CSV pred min/max:", df_check['pred'].min(), df_check['pred'].max())
print(df_check.head(5))

raw_preds[:5]       : [-0.64043362 -3.93230879 -2.64171247  0.05214257 -3.69812285]
prob_preds[:5]      : [0.34514853 0.01922166 0.06650165 0.51303269 0.02417126]
raw_preds min/max   : -4.754149325862651 3.587385916268742
prob_preds min/max  : 0.008542271764755725 0.9730744754414176
Saved sigmoid submission to 'r2_submission_file_sigmoid_new.csv'
Reloaded CSV pred min/max: 0.0085422717647557 0.9730744754414176
                                             id1      id2      id3  \
0     1000061_9914_16-23_2023-11-05 09:11:35.557  1000061     9914   
1    1000061_23690_16-23_2023-11-05 09:11:36.193  1000061    23690   
2   1000061_522188_16-23_2023-11-05 09:11:37.242  1000061   522188   
3  1000061_5420674_16-23_2023-11-05 09:28:04.153  1000061  5420674   
4    1000061_27945_16-23_2023-11-05 09:28:04.157  1000061    27945   

          id5      pred  
0  2023-11-05  0.345149  
1  2023-11-05  0.019222  
2  2023-11-05  0.066502  
3  2023-11-05  0.513033  
4  2023-11-05  0.024171  


In [None]:
# 1. Manual feature name construction
num_names = num_features

# Nested pipeline access to OneHotEncoder
ohe_pipeline = preprocessor.named_transformers_['ohe']
ohe_encoder = ohe_pipeline.named_steps['onehot']
ohe_cols = ohe_encoder.get_feature_names_out(cat_ohe)

ord_cols = cat_ord
all_feat_names = np.concatenate([num_names, ohe_cols, ord_cols])

# 2. Get importances from LightGBM
importances = model.feature_importance(importance_type='gain')

# 3. Truncate feat_names to match importances length
feat_names = all_feat_names[:len(importances)]

# 4. Build DataFrame
feat_imp = pd.DataFrame({
    'feature': feat_names,
    'importance': importances
})

# 5. Sort and display top 50
top50 = feat_imp.sort_values('importance', ascending=False).tail(50)

print("Top 50 features by importance:")
for i, (f, imp) in enumerate(zip(top50['feature'], top50['importance']), 1):
    print(f"{i:2d}. {f:30s} {imp:.1f}")