## This notebook is to test a random forest model

In [29]:
# %%
# Basic imports and path config
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')

# Filepath you provided (train)
TRAIN_PATH = "Quentin/dsba-m-1-challenge-purchase-prediction/train_dataset_M1_with_id.csv"
# (We will not load test for model selection — only for final prediction later if allowed)
# TEST_PATH = "/Users/quentinvillet/oracles_of_paris/kaggle_competition/Quentin/dsba-m-1-challenge-purchase-prediction/test_dataset_M1_with_id.csv"

RND = 42
print("Ready. Train path:", TRAIN_PATH)


Ready. Train path: Quentin/dsba-m-1-challenge-purchase-prediction/train_dataset_M1_with_id.csv


In [30]:
# %%
df = pd.read_csv(TRAIN_PATH)
print("Original rows:", len(df))
# enforce only days <= 70 (competition rule)
df = df[df['Day'] <= 70].reset_index(drop=True)
print("After filtering to days ≤70 rows:", len(df))

# drop columns you already removed (repeat safe drop if needed)
drop_cols = ['id', 'Session_ID', 'PM_from_combo', 'RS_from_combo', 'Items_In_Cart_raw']
for c in drop_cols:
    if c in df.columns:
        df.drop(columns=c, inplace=True)

# quick look
print(df.shape)
display(df.head())


Original rows: 13735
After filtering to days ≤70 rows: 13735
(13735, 20)


Unnamed: 0,Age,Gender,Reviews_Read,Price,Discount,Category,Items_In_Cart,Time_of_Day,Email_Interaction,Device_Type,Payment_Method,Referral_Source,Socioeconomic_Status_Score,Engagement_Score,AB_Bucket,Price_Sine,PM_RS_Combo,Day,Campaign_Period,Purchase
0,,1.0,3.0,592.975,22.0,1.0,6.0,afterno0n,0.0,Mobile,Credit,,7.26,1.85652,3.0,0.999047,Credit:Social_media,59,False,0
1,25.0,1.0,1.0,511.279,12.0,0.0,3.0,morning,1.0,Tablet,Cash,Social_media,8.3,1.868138,5.0,-0.129689,Cash:Social_media,29,True,1
2,22.0,0.0,3.0,218.36,2.0,1.0,4.0,evening,1.0,Mobile,Bank,Social_media,6.61,1.223445,0.0,-0.421646,Bank:Social_media,16,False,0
3,24.0,0.0,3.0,313.781,1.0,3.0,0.0,evening,1.0,Mobile,pay pal,Social_media,10.51,0.359684,1.0,-0.988239,,53,False,0
4,35.0,1.0,1.0,495.088,13.0,0.0,2.0,evening,0.0,Mobile,Cash,Social_media,8.33,3.84858,2.0,0.695737,Cash:Social_media,10,False,0


In [31]:
# %%
def clean_strings(df):
    df = df.copy()
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].astype(str).str.lower().str.strip()
        df[col] = df[col].replace({'nan': np.nan, 'none': np.nan, '': np.nan})
    # Time_of_Day variants -> morning / afternoon / evening
    if 'Time_of_Day' in df.columns:
        tod = df['Time_of_Day'].astype(str).str.lower()
        tod = tod.str.replace(r'[^a-z]', '', regex=True)
        tod = tod.replace({
            r'^morn.*': 'morning',
            r'^aft.*': 'afternoon',
            r'^even.*': 'evening',
            r'^nig.*': 'evening',
            r'^mid.*': 'afternoon'
        }, regex=True)
        df['Time_of_Day'] = tod
    # Device_Type heuristics
    if 'Device_Type' in df.columns:
        dev = df['Device_Type'].astype(str).str.replace(r'[^a-z]', '', regex=True)
        dev = dev.replace({
            r'^phone.*': 'mobile',
            r'^mob.*': 'mobile',
            r'^lap.*': 'desktop',
            r'^desk.*': 'desktop',
            r'^tab.*': 'tablet'
        }, regex=True)
        df['Device_Type'] = dev
    # Payment method normalization
    if 'Payment_Method' in df.columns:
        pm = df['Payment_Method'].astype(str)
        pm = pm.replace({
            r'^cred.*': 'credit',
            r'^debit.*': 'debit',
            r'^pay[\s_]?pal$': 'paypal',
            r'^cash$': 'cash',
            r'^bank.*': 'bank'
        }, regex=True)
        df['Payment_Method'] = pm
    # Referral source
    if 'Referral_Source' in df.columns:
        rs = df['Referral_Source'].astype(str)
        rs = rs.replace({
            r'social.*': 'social_media',
            r'search.*': 'search_engine',
            r'^ads?$': 'ads',
            r'^email$': 'email',
            r'^direct$': 'direct'
        }, regex=True)
        df['Referral_Source'] = rs
    return df

df = clean_strings(df)
print("Unique Time_of_Day values:", df['Time_of_Day'].dropna().unique())


Unique Time_of_Day values: ['afternoon' 'morning' 'evening' 'nan' 'mrning']


In [32]:
# %%
def fe_price_items(df):
    df = df.copy()
    # ensure numeric
    df['Price'] = pd.to_numeric(df['Price'], errors='coerce').fillna(0)
    df['Discount'] = pd.to_numeric(df['Discount'], errors='coerce').fillna(0)
    # derived
    df['Effective_Price'] = df['Price'] * (1 - df['Discount']/100.0)
    df['Discount_Amount'] = df['Price'] * df['Discount'] / 100.0
    df['Has_Discount'] = (df['Discount'] > 0).astype(int)
    df['High_Discount'] = (df['Discount'] > 30).astype(int)
    # price category bins (tweak thresholds if needed)
    df['Price_Category'] = pd.cut(df['Price'], bins=[-1, 50, 200, 500, 100000], labels=['Low','Medium','High','Premium'])
    # Items in cart: safe buckets
    df['Items_In_Cart'] = pd.to_numeric(df['Items_In_Cart'], errors='coerce').fillna(0)
    df['Has_Items'] = (df['Items_In_Cart'] > 0).astype(int)
    df['Items_In_Cart_bucket'] = pd.cut(df['Items_In_Cart'], bins=[-1,0,1,2,4,10,np.inf], labels=['0','1','2','3-4','5-10','10+'])
    return df

df = fe_price_items(df)
display(df[['Price','Discount','Effective_Price','Has_Discount','High_Discount','Price_Category','Items_In_Cart','Items_In_Cart_bucket']].head())


Unnamed: 0,Price,Discount,Effective_Price,Has_Discount,High_Discount,Price_Category,Items_In_Cart,Items_In_Cart_bucket
0,592.975,22.0,462.5205,1,0,Premium,6.0,5-10
1,511.279,12.0,449.92552,1,0,Premium,3.0,3-4
2,218.36,2.0,213.9928,1,0,High,4.0,3-4
3,313.781,1.0,310.64319,1,0,High,0.0,0
4,495.088,13.0,430.72656,1,0,High,2.0,2


In [33]:
# %%
if 'PM_RS_Combo' in df.columns:
    mask = df['PM_RS_Combo'].notna() & ((df['Payment_Method'].isna()) | (df['Referral_Source'].isna()))
    if mask.any():
        split = df.loc[mask, 'PM_RS_Combo'].str.split(':', expand=True)
        df.loc[mask, 'Payment_Method'] = df.loc[mask, 'Payment_Method'].fillna(split[0])
        df.loc[mask, 'Referral_Source'] = df.loc[mask, 'Referral_Source'].fillna(split[1])
    # drop original combo (avoid duplication)
    df.drop(columns=['PM_RS_Combo'], inplace=True, errors='ignore')
    print("Parsed PM_RS_Combo where applicable.")
else:
    print("No PM_RS_Combo column found; skipped.")


Parsed PM_RS_Combo where applicable.


In [34]:
# %%
# Final recommended feature list (safe)
final_features = [
    'Age', 'Gender', 'Category', 'Price', 'Discount', 'Reviews_Read',
    'Items_In_Cart', 'Items_In_Cart_bucket', 'Has_Items',
    'Time_of_Day', 'Email_Interaction', 'Device_Type', 'Referral_Source',
    'Socioeconomic_Status_Score', 'Engagement_Score',
    'Has_Discount', 'High_Discount', 'Price_Category'
]

# Drop any features not in df (safe)
final_features = [f for f in final_features if f in df.columns]

# Optionally drop features we don't want
to_drop = [c for c in df.columns if c not in final_features + ['Purchase','Day']]
print("Dropping these columns (to reduce noise):", to_drop[:50])
df = df.drop(columns=to_drop)
print("Remaining columns:", df.columns.tolist())


Dropping these columns (to reduce noise): ['Payment_Method', 'AB_Bucket', 'Price_Sine', 'Campaign_Period', 'Effective_Price', 'Discount_Amount']
Remaining columns: ['Age', 'Gender', 'Reviews_Read', 'Price', 'Discount', 'Category', 'Items_In_Cart', 'Time_of_Day', 'Email_Interaction', 'Device_Type', 'Referral_Source', 'Socioeconomic_Status_Score', 'Engagement_Score', 'Day', 'Purchase', 'Has_Discount', 'High_Discount', 'Price_Category', 'Has_Items', 'Items_In_Cart_bucket']


In [35]:
# %%
print("Counts by Items_In_Cart (first 20):")
print(df['Items_In_Cart'].value_counts().sort_index().head(20))

print("\nPurchase rate by Items_In_Cart (first 20):")
display(df.groupby('Items_In_Cart')['Purchase'].agg(['mean','count']).sort_index().head(20))


Counts by Items_In_Cart (first 20):
0.0     1501
1.0     2312
2.0     2410
3.0     2046
4.0     1546
5.0     1243
6.0      837
7.0      578
8.0      430
9.0      273
10.0     191
11.0     120
12.0      83
13.0      60
14.0      33
15.0      33
16.0      12
17.0      10
18.0       7
19.0       4
Name: Items_In_Cart, dtype: int64

Purchase rate by Items_In_Cart (first 20):


Unnamed: 0_level_0,mean,count
Items_In_Cart,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.769487,1501
1.0,0.695934,2312
2.0,0.508299,2410
3.0,0.347507,2046
4.0,0.161061,1546
5.0,0.065969,1243
6.0,0.0227,837
7.0,0.00692,578
8.0,0.002326,430
9.0,0.003663,273


In [36]:
cols_to_drop = ['Items_In_Cart', 'Discount', 'High_Discount', 'Price_Category']
df.drop(columns=[c for c in cols_to_drop if c in df.columns], inplace=True)


In [37]:
final_features = [
    'Age',
    'Gender',
    'Reviews_Read',
    'Price',
    'Category',
    'Time_of_Day',
    'Email_Interaction',
    'Device_Type',
    'Referral_Source',
    'Socioeconomic_Status_Score',
    'Engagement_Score',
    'Has_Discount',
    'Has_Items',
    'Items_In_Cart_bucket'
]


In [None]:
# %%
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import pandas as pd
import numpy as np

# FINAL FEATURE LIST
final_features = [
    'Age',
    'Gender',
    'Reviews_Read',
    'Price',
    'Category',
    'Time_of_Day',
    'Email_Interaction',
    'Device_Type',
    'Referral_Source',
    'Socioeconomic_Status_Score',
    'Engagement_Score',
    'Has_Discount',
    'Has_Items',
    'Items_In_Cart_bucket'
]

# CATEGORICAL / NUMERIC SPLIT
cat_cols = [c for c in final_features if df[c].dtype == 'object' or str(df[c].dtype).startswith('category')]
num_cols = [c for c in final_features if c not in cat_cols]

print("Categorical columns:", cat_cols)
print("Numeric columns:", num_cols)


# ---- PREPROCESSOR CLASS ----

class Preprocessor:
    def __init__(self, cat_cols, num_cols, n_neighbors=5):
        self.cat_cols = cat_cols
        self.num_cols = num_cols
        self.n_neighbors = n_neighbors

        self.cat_imputer = None
        self.ohe = None
        self.knn = None
        self.scaler = None
        self.ohe_cols = None

    def fit(self, X):
        # Impute categoricals
        self.cat_imputer = SimpleImputer(strategy='most_frequent')
        X_cat = self.cat_imputer.fit_transform(X[self.cat_cols])

        # One-hot encode
        self.ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        arr = self.ohe.fit_transform(X_cat)
        self.ohe_cols = list(self.ohe.get_feature_names_out(self.cat_cols))

        # Prepare for KNN-imputation
        X_num = X[self.num_cols].copy()
        X_knn_fit = pd.concat(
            [pd.DataFrame(arr, index=X.index, columns=self.ohe_cols),
             X_num],
            axis=1
        )

        # Fit KNN-imputer
        self.knn = KNNImputer(n_neighbors=self.n_neighbors, weights='distance')
        self.knn.fit(X_knn_fit)

        # Fit scaler
        X_knn_imp = pd.DataFrame(
            self.knn.transform(X_knn_fit),
            columns=X_knn_fit.columns,
            index=X.index
        )

        self.scaler = MinMaxScaler()
        self.scaler.fit(X_knn_imp)

        return self

    def transform(self, X):
        X = X.copy()

        # Process categoricals
        X_cat = self.cat_imputer.transform(X[self.cat_cols])
        arr = self.ohe.transform(X_cat)
        df_ohe = pd.DataFrame(arr, index=X.index, columns=self.ohe_cols)

        # Process numericals
        X_num = X[self.num_cols].copy()

        # Combine for final imputing
        X_knn = pd.concat([df_ohe, X_num], axis=1)

        # Order columns
        X_knn = X_knn[self.ohe_cols + self.num_cols]

        # Apply KNN imputation
        X_imp = pd.DataFrame(
            self.knn.transform(X_knn),
            columns=X_knn.columns,
            index=X.index
        )

        # Scale
        X_scaled = pd.DataFrame(
            self.scaler.transform(X_imp),
            columns=X_imp.columns,
            index=X.index
        )

        return X_scaled


Categorical columns: ['Time_of_Day', 'Device_Type', 'Referral_Source', 'Items_In_Cart_bucket']
Numeric columns: ['Age', 'Gender', 'Reviews_Read', 'Price', 'Category', 'Email_Interaction', 'Socioeconomic_Status_Score', 'Engagement_Score', 'Has_Discount', 'Has_Items']


In [39]:
train_df = df[df['Day'] <= 70].copy()
valid_df = df[df['Day'] > 70].copy()

X_train = train_df[final_features].copy()
y_train = train_df['Purchase'].copy()

X_valid = valid_df[final_features].copy()
y_valid = valid_df['Purchase'].copy()


In [40]:
PP = Preprocessor(cat_cols, num_cols)
PP.fit(X_train)

X_train_prep = PP.transform(X_train)
X_valid_prep = PP.transform(X_valid)


ValueError: Found array with 0 sample(s) (shape=(0, 4)) while a minimum of 1 is required by SimpleImputer.

In [41]:
print(df['Day'].max())
print(df['Day'].min())


70
1


In [42]:
# %%
# Imports
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Load the train dataset
df = pd.read_csv('Quentin/dsba-m-1-challenge-purchase-prediction/train_dataset_M1_with_id.csv')

# %%
# Columns to drop
drop_cols = ['id', 'Session_ID', 'PM_from_combo', 'RS_from_combo', 'Items_In_Cart']  # dropped Items_In_Cart
for c in drop_cols:
    if c in df.columns:
        df.drop(columns=c, inplace=True)

# Final features to use (everything except target and day)
target = 'Purchase'
final_features = [c for c in df.columns if c != target and c != 'Day']
print("Feature candidates:", final_features)

# %%
# Define categorical and numeric columns
cat_cols = [c for c in final_features if df[c].dtype == 'object' or str(df[c].dtype).startswith('category')]
num_cols = [c for c in final_features if c not in cat_cols]

print("Categorical cols:", cat_cols)
print("Numeric cols:", num_cols)

# %%
# Preprocessor class
class Preprocessor:
    def __init__(self, cat_cols, num_cols, n_neighbors=5):
        self.cat_cols = cat_cols
        self.num_cols = num_cols
        self.n_neighbors = n_neighbors
        self.cat_imputer = None
        self.ohe = None
        self.knn = None
        self.scaler = None
        self.ohe_cols = None

    def fit(self, X):
        # Categorical imputation
        self.cat_imputer = SimpleImputer(strategy='most_frequent')
        X_cat = X[self.cat_cols].copy()
        X_cat = self.cat_imputer.fit_transform(X_cat)

        # One-hot encoding
        self.ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        arr = self.ohe.fit_transform(X_cat)
        self.ohe_cols = list(self.ohe.get_feature_names_out(self.cat_cols))

        # Numeric imputation using KNN (with OHE included)
        X_num = X[self.num_cols].copy()
        X_knn_fit = pd.concat([pd.DataFrame(arr, index=X.index, columns=self.ohe_cols), X_num], axis=1)
        self.knn = KNNImputer(n_neighbors=self.n_neighbors, weights='distance')
        self.knn.fit(X_knn_fit)

        # Scaler
        X_knn_imp = pd.DataFrame(self.knn.transform(X_knn_fit), columns=X_knn_fit.columns, index=X.index)
        self.scaler = MinMaxScaler()
        self.scaler.fit(X_knn_imp)
        return self

    def transform(self, X):
        X = X.copy()
        X_cat = X[self.cat_cols].copy()
        X_cat = self.cat_imputer.transform(X_cat)
        arr = self.ohe.transform(X_cat)
        df_ohe = pd.DataFrame(arr, index=X.index, columns=self.ohe_cols)

        X_num = X[self.num_cols].copy()
        X_knn = pd.concat([df_ohe, X_num], axis=1)

        # Ensure all OHE columns exist
        for c in [c for c in self.ohe_cols if c not in X_knn.columns]:
            X_knn[c] = 0.0

        X_knn = X_knn[self.ohe_cols + self.num_cols]
        X_imp = pd.DataFrame(self.knn.transform(X_knn), columns=X_knn.columns, index=X.index)
        X_scaled = pd.DataFrame(self.scaler.transform(X_imp), columns=X_imp.columns, index=X.index)
        return X_scaled

# %%
# Separate features and target
X = df[final_features]
y = df[target]

# Fit preprocessor
PP = Preprocessor(cat_cols, num_cols)
PP.fit(X)

# Transform features
X_prep = PP.transform(X)

print("Preprocessing done. Shape:", X_prep.shape)

# %%
# Random Forest
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=3,
    random_state=42,
    n_jobs=-1
)

# %%
# Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
acc_scores = cross_val_score(rf, X_prep, y, cv=cv, scoring='accuracy')
f1_scores = cross_val_score(rf, X_prep, y, cv=cv, scoring='f1')
roc_scores = cross_val_score(rf, X_prep, y, cv=cv, scoring='roc_auc')

print(f"Random Forest 5-fold CV Accuracy: {acc_scores.mean():.4f} ± {acc_scores.std():.4f}")
print(f"Random Forest 5-fold CV F1-score: {f1_scores.mean():.4f} ± {f1_scores.std():.4f}")
print(f"Random Forest 5-fold CV ROC-AUC: {roc_scores.mean():.4f} ± {roc_scores.std():.4f}")

# %%
# Fit final model on all data
rf.fit(X_prep, y)
print("Random Forest trained on full dataset.")

# %%
# Feature importances
importances = pd.DataFrame({
    'feature': X_prep.columns,
    'importance': rf.feature_importances_
}).sort_values(by='importance', ascending=False)

print(importances.head(20))


Feature candidates: ['Age', 'Gender', 'Reviews_Read', 'Price', 'Discount', 'Category', 'Time_of_Day', 'Email_Interaction', 'Device_Type', 'Payment_Method', 'Referral_Source', 'Socioeconomic_Status_Score', 'Engagement_Score', 'AB_Bucket', 'Price_Sine', 'PM_RS_Combo', 'Campaign_Period']
Categorical cols: ['Time_of_Day', 'Device_Type', 'Payment_Method', 'Referral_Source', 'PM_RS_Combo', 'Campaign_Period']
Numeric cols: ['Age', 'Gender', 'Reviews_Read', 'Price', 'Discount', 'Category', 'Email_Interaction', 'Socioeconomic_Status_Score', 'Engagement_Score', 'AB_Bucket', 'Price_Sine']
Preprocessing done. Shape: (13735, 173)
Random Forest 5-fold CV Accuracy: 0.6823 ± 0.0048
Random Forest 5-fold CV F1-score: 0.3648 ± 0.0136
Random Forest 5-fold CV ROC-AUC: 0.7230 ± 0.0103
Random Forest trained on full dataset.
                           feature  importance
165                          Price    0.138072
170               Engagement_Score    0.123536
172                     Price_Sine    0.075595