In [6]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.1.1-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.1-py3-none-macosx_12_0_arm64.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m24.7 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.1.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [11]:
!pip install LGBMClassifier

[31mERROR: Could not find a version that satisfies the requirement LGBMClassifier (from versions: none)[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[31mERROR: No matching distribution found for LGBMClassifier[0m[31m
[0m

In [10]:
# %%
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.inspection import permutation_importance
from xgboost import XGBClassifier

# -----------------------------
# 1️⃣ Load data
# -----------------------------
df = pd.read_csv('Quentin/dsba-m-1-challenge-purchase-prediction/train_dataset_M1_with_id.csv')

# Drop useless columns
df.drop(columns=['id', 'Session_ID'], inplace=True, errors='ignore')

# -----------------------------
# 2️⃣ Feature engineering
# -----------------------------
# Binary discount flag
df['Has_Discount'] = (df['Discount'] > 0).astype(int)

# Effective price
df['Effective_Price'] = df['Price'] * (1 - df['Discount']/100)

# Discount amount (duplicate of Discount, but can keep for model)
df['Discount_Amount'] = df['Discount']

# Price category (Low / Medium / Premium)
df['Price_Category'] = pd.cut(df['Price'], bins=[0, 250, 500, np.inf], labels=['Low', 'Medium', 'Premium'])

# Optional: simple binned features to reduce column explosion
df['Engagement_bin'] = pd.qcut(df['Engagement_Score'], q=3, labels=False)
df['Price_bin'] = pd.qcut(df['Effective_Price'], q=3, labels=False)

# -----------------------------
# 3️⃣ Select features
# -----------------------------
target = 'Purchase'

# Top numeric features based on previous importance + engineered
numeric_cols = [
    'Engagement_Score', 'Reviews_Read', 'Price', 'Discount', 'Category',
    'Email_Interaction', 'Socioeconomic_Status_Score', 'Effective_Price',
    'Discount_Amount', 'Has_Discount', 'Engagement_bin', 'Price_bin'
]

# Reduce OHE explosion: keep only important categorical features
categorical_cols = ['Time_of_Day', 'Device_Type', 'Payment_Method', 'Referral_Source', 'Price_Category', 'Campaign_Period']

features = numeric_cols + categorical_cols
X = df[features]
y = df[target]

# -----------------------------
# 4️⃣ Preprocessing
# -----------------------------
# Numeric KNN imputer
knn = KNNImputer(n_neighbors=5, weights='distance')
X[numeric_cols] = knn.fit_transform(X[numeric_cols])

# Categorical: fill missing and label encode to reduce OHE columns
label_encoders = {}
for col in categorical_cols:
    X[col] = X[col].fillna(X[col].mode()[0])
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

# Scale numeric features
scaler = MinMaxScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

print("Final feature shape:", X.shape)

# -----------------------------
# 5️⃣ XGBoost + hyperparameter tuning
# -----------------------------
xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    use_label_encoder=False,
    n_jobs=-1,
    random_state=42
)

param_dist = {
    'n_estimators': [300, 400],
    'max_depth': [3, 4],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'min_child_weight': [1, 5]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    xgb,
    param_distributions=param_dist,
    n_iter=30,
    scoring='roc_auc',
    cv=cv,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

search.fit(X, y)
best_model = search.best_estimator_
print("Best ROC-AUC:", search.best_score_)
print("Best params:", search.best_params_)

# -----------------------------
# 6️⃣ Evaluate with cross-validation
# -----------------------------
accuracy = cross_val_score(best_model, X, y, cv=cv, scoring='accuracy').mean()
f1 = cross_val_score(best_model, X, y, cv=cv, scoring='f1').mean()
roc_auc_val = cross_val_score(best_model, X, y, cv=cv, scoring='roc_auc').mean()

print(f"CV Accuracy: {accuracy:.4f}")
print(f"CV F1-score: {f1:.4f}")
print(f"CV ROC-AUC: {roc_auc_val:.4f}")

# -----------------------------
# 7️⃣ Feature importance (permutation)
# -----------------------------
perm_imp = permutation_importance(best_model, X, y, n_repeats=10, random_state=42, n_jobs=-1)
feat_imp = pd.DataFrame({'feature': X.columns, 'importance': perm_imp.importances_mean}).sort_values('importance', ascending=False)
print(feat_imp.head(20))


Final feature shape: (13735, 18)
Fitting 5 folds for each of 30 candidates, totalling 150 fits


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numeric_cols] = knn.fit_transform(X[numeric_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].fillna(X[col].mode()[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using 

Best ROC-AUC: 0.7335456506213619
Best params: {'subsample': 0.7, 'n_estimators': 400, 'min_child_weight': 1, 'max_depth': 4, 'learning_rate': 0.01, 'colsample_bytree': 1.0}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


CV Accuracy: 0.6968
CV F1-score: 0.4808
CV ROC-AUC: 0.7335
                       feature  importance
0             Engagement_Score    0.046407
5            Email_Interaction    0.025380
1                 Reviews_Read    0.022184
17             Campaign_Period    0.017139
2                        Price    0.013600
7              Effective_Price    0.012654
4                     Category    0.010164
13                 Device_Type    0.005555
6   Socioeconomic_Status_Score    0.003393
3                     Discount    0.001165
10              Engagement_bin    0.001129
15             Referral_Source    0.000619
11                   Price_bin    0.000248
9                 Has_Discount    0.000233
14              Payment_Method    0.000175
12                 Time_of_Day    0.000058
8              Discount_Amount    0.000000
16              Price_Category   -0.000306


In [13]:
# %%
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.inspection import permutation_importance
from xgboost import XGBClassifier

# -----------------------------
# 1️⃣ Load data
# -----------------------------
df = pd.read_csv('Quentin/dsba-m-1-challenge-purchase-prediction/train_dataset_M1_with_id.csv')

# Drop useless columns
df.drop(columns=['id', 'Session_ID'], inplace=True, errors='ignore')

# -----------------------------
# 2️⃣ Feature engineering
# -----------------------------
# Binary discount flag
df['Has_Discount'] = (df['Discount'] > 0).astype(int)

# Effective price
df['Effective_Price'] = df['Price'] * (1 - df['Discount']/100)

# Discount amount (duplicate of Discount, but can keep for model)
df['Discount_Amount'] = df['Discount']

# Price category (Low / Medium / Premium)
df['Price_Category'] = pd.cut(df['Price'], bins=[0, 250, 500, np.inf], labels=['Low', 'Medium', 'Premium'])

# Optional: simple binned features to reduce column explosion
df['Engagement_bin'] = pd.qcut(df['Engagement_Score'], q=3, labels=False)
df['Price_bin'] = pd.qcut(df['Effective_Price'], q=3, labels=False)

# -----------------------------
# 3️⃣ Select features
# -----------------------------
target = 'Purchase'

# Top numeric features based on previous importance + engineered
numeric_cols = [
    'Engagement_Score', 'Reviews_Read', 'Price', 'Discount', 'Category',
    'Email_Interaction', 'Socioeconomic_Status_Score', 'Effective_Price',
    'Discount_Amount', 'Has_Discount', 'Engagement_bin', 'Price_bin'
]

# Reduce OHE explosion: keep only important categorical features
categorical_cols = ['Time_of_Day', 'Device_Type', 'Payment_Method', 'Referral_Source', 'Price_Category', 'Campaign_Period']

features = numeric_cols + categorical_cols
X = df[features]
y = df[target]

# -----------------------------
# 4️⃣ Preprocessing
# -----------------------------
# Numeric KNN imputer
knn = KNNImputer(n_neighbors=5, weights='distance')
X[numeric_cols] = knn.fit_transform(X[numeric_cols])

# Categorical: fill missing and label encode to reduce OHE columns
label_encoders = {}
for col in categorical_cols:
    X[col] = X[col].fillna(X[col].mode()[0])
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

# Scale numeric features
scaler = MinMaxScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

print("Final feature shape:", X.shape)

# -----------------------------
# 5️⃣ XGBoost + hyperparameter tuning
# -----------------------------
xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    use_label_encoder=False,
    n_jobs=-1,
    random_state=42
)

param_dist = {
    'n_estimators': [300, 400],
    'max_depth': [3, 4],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'min_child_weight': [1, 5]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    xgb,
    param_distributions=param_dist,
    n_iter=30,
    scoring='roc_auc',
    cv=cv,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

search.fit(X, y)
best_model = search.best_estimator_
print("Best ROC-AUC:", search.best_score_)
print("Best params:", search.best_params_)

# -----------------------------
# 6️⃣ Evaluate with cross-validation
# -----------------------------
accuracy = cross_val_score(best_model, X, y, cv=cv, scoring='accuracy').mean()
f1 = cross_val_score(best_model, X, y, cv=cv, scoring='f1').mean()
roc_auc_val = cross_val_score(best_model, X, y, cv=cv, scoring='roc_auc').mean()

print(f"CV Accuracy: {accuracy:.4f}")
print(f"CV F1-score: {f1:.4f}")
print(f"CV ROC-AUC: {roc_auc_val:.4f}")

# -----------------------------
# 7️⃣ Feature importance (permutation)
# -----------------------------
perm_imp = permutation_importance(best_model, X, y, n_repeats=10, random_state=42, n_jobs=-1)
feat_imp = pd.DataFrame({'feature': X.columns, 'importance': perm_imp.importances_mean}).sort_values('importance', ascending=False)
print(feat_imp.head(20))


Final feature shape: (13735, 18)
Fitting 5 folds for each of 30 candidates, totalling 150 fits


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numeric_cols] = knn.fit_transform(X[numeric_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].fillna(X[col].mode()[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using 

Best ROC-AUC: 0.7335456506213619
Best params: {'subsample': 0.7, 'n_estimators': 400, 'min_child_weight': 1, 'max_depth': 4, 'learning_rate': 0.01, 'colsample_bytree': 1.0}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


CV Accuracy: 0.6968
CV F1-score: 0.4808
CV ROC-AUC: 0.7335
                       feature  importance
0             Engagement_Score    0.046407
5            Email_Interaction    0.025380
1                 Reviews_Read    0.022184
17             Campaign_Period    0.017139
2                        Price    0.013600
7              Effective_Price    0.012654
4                     Category    0.010164
13                 Device_Type    0.005555
6   Socioeconomic_Status_Score    0.003393
3                     Discount    0.001165
10              Engagement_bin    0.001129
15             Referral_Source    0.000619
11                   Price_bin    0.000248
9                 Has_Discount    0.000233
14              Payment_Method    0.000175
12                 Time_of_Day    0.000058
8              Discount_Amount    0.000000
16              Price_Category   -0.000306
