In [21]:
import pandas as pd
import numpy as np
import catboost
from catboost import CatBoostClassifier

In [13]:
train=pd.read_parquet('../Data/small_train_engineered.parquet')
val=pd.read_parquet('../Data/small_val_engineered.parquet')

In [14]:
# --- YOUR HACKATHON METRIC FUNCTION ---
def map_at_k_hackathon(y_true, y_pred_scores, group_ids, k=7):
    """
    Calculates the Mean Average Precision at k.
    """
    df = pd.DataFrame({'group': group_ids, 'y_true': y_true, 'score': y_pred_scores})
    
    average_precisions = []
    total_relevant_per_group = df[df['y_true'] == 1].groupby('group')['y_true'].count()

    for group_id, group_df in df.groupby('group'):
        total_relevant = total_relevant_per_group.get(group_id, 0)
        if total_relevant == 0:
            continue

        group_df = group_df.sort_values('score', ascending=False).head(k)
        
        hits = 0
        precision_sum = 0.0
        
        for i, row in enumerate(group_df.itertuples(index=False)):
            rank = i + 1
            if row.y_true == 1:
                hits += 1
                precision_at_k = hits / rank
                precision_sum += precision_at_k
        
        ap = precision_sum / total_relevant
        average_precisions.append(ap)

    return np.mean(average_precisions) if average_precisions else 0.0


In [15]:
cat_features_names = train.select_dtypes(include=['category']).columns.tolist()
for df in [train, val]:
    for col in cat_features_names:
        if df[col].dtype.name == 'category':
            df[col] = df[col].cat.add_categories('NA')
    df[cat_features_names] = df[cat_features_names].fillna('NA')
print("Successfully handled null values.")

Successfully handled null values.


In [16]:
cat_features_names.remove('id2')
cat_features_names.remove('id3')

In [17]:
TARGET = 'y'
FEATURES = [col for col in train.columns if col not in [TARGET, 'id2', 'id3','id4','id5']]
X_train = train[FEATURES]
y_train = train[TARGET]
X_val = val[FEATURES]
y_val = val[TARGET]
cat_features_indices = [X_train.columns.get_loc(col) for col in cat_features_names]

In [22]:
model = CatBoostClassifier(
    iterations=1500,
    learning_rate=0.05,
    verbose=100,
    random_seed=42,
    cat_features=cat_features_indices
)

In [23]:
print("\nStarting CatBoost model training...")
model.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    early_stopping_rounds=50,
    use_best_model=True
)



Starting CatBoost model training...
0:	learn: 0.6084671	test: 0.6039701	best: 0.6039701 (0)	total: 186ms	remaining: 4m 38s
100:	learn: 0.0911175	test: 0.1073385	best: 0.1071801 (98)	total: 11s	remaining: 2m 32s
200:	learn: 0.0777198	test: 0.1054707	best: 0.1053802 (195)	total: 20.4s	remaining: 2m 11s
300:	learn: 0.0711087	test: 0.1050789	best: 0.1045662 (269)	total: 29.5s	remaining: 1m 57s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.1045661923
bestIteration = 269

Shrink model to first 270 iterations.


<catboost.core.CatBoostClassifier at 0x1776519a0>

In [24]:
val_probabilities = model.predict_proba(X_val)[:, 1]

# Calculate your final MAP@7 score using your function
final_map_score = map_at_k_hackathon(
    y_true=y_val,
    y_pred_scores=val_probabilities,
    group_ids=val['id2'],
    k=7
)

print("\n--- Final Hackathon Score ---")
print(f"MAP@7 on the validation set: {final_map_score:.4f}")

  total_relevant_per_group = df[df['y_true'] == 1].groupby('group')['y_true'].count()
  for group_id, group_df in df.groupby('group'):



--- Final Hackathon Score ---
MAP@7 on the validation set: 0.4892
