In [2]:
import numpy as np

def clean_categorical(df):
    cat_cols = df.select_dtypes(include="object").columns

    for col in cat_cols:
        df[col] = df[col].astype(str).str.lower().str.strip()
        df[col] = df[col].replace("nan", np.nan)
        col_lower = col.lower()

        if col_lower == "time_of_day":
            df[col] = df[col].replace({
                r'^m[0o].*rning$': 'morning',
                r'^aftern?[0o].*n$': 'afternoon',
                r'^even.*g$': 'evening',
                }, regex=True)

        elif col_lower == 'payment_method':
            df[col] = df[col].replace({
                r'^cred.*$': 'credit',
                r'^cash$': 'cash',
                r'^pay[\s_]?pal$': 'paypal',
                r'^bank.*$': 'bank',
            }, regex=True)

        # Normalize Referral_Source
        elif col_lower == 'referral_source':
            df[col] = df[col].replace({
                r'^s[0o].*cial.*media$': 'social_media',
                r'^search.*engine$': 'search_engine',
                r'^ads$': 'ads',
                r'^email$': 'email',
                r'^direct$': 'direct',
            }, regex=True)
    return df


In [4]:
import pandas as pd
from sqlalchemy.engine import row

df = pd.read_csv('/Users/quentinvillet/oracles_of_paris/kaggle_competition/kaggle/train_dataset_M1_with_id.csv')

num_duplicates= df.duplicated().sum()

print(f"Number of fully duplicate rows: {num_duplicates}")

#Duplicate ids
num_duplicate_ids = df.duplicated(subset=["id"]).sum()

print(f"Number of fully duplicate rows: {num_duplicate_ids}")

#None rows are 90% similar
# from thefuzz import fuzz
#from tqdm import tqdm
#
#rows_as_text = df.head(10000).astype(str).agg("".join, axis=1)
#
#threshold = 90
#similar_pairs = []
#
#for i in tqdm(range(len(rows_as_text)), desc="Comapring rows"):
#    for j in range(i+1, len(rows_as_text)):
#        sim = fuzz.ratio(rows_as_text[i], rows_as_text[j])
#        if sim >= threshold:
#            similar_pairs.append((i,j,sim))
#
#print(f"{len(similar_pairs)} pairs of rows are ‚â•{threshold}% similar")


Number of fully duplicate rows: 0
Number of fully duplicate rows: 0


In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

def knn_impute_numeric(df, n_neighbors=5):
    df = df.copy()
    num_cols = df.select_dtypes(include=[np.number]).columns.drop("id")
    non_num_cols=df.select_dtypes(exclude=[np.number]).columns
    df_num = df[num_cols]

    imputer = KNNImputer(n_neighbors = n_neighbors, weights="distance")
    df_num_imputed = pd.DataFrame(imputer.fit_transform(df_num), columns=num_cols, index=df.index)
    df_imputed = pd.concat([df["id"],df_num_imputed, df[non_num_cols]], axis=1)

    return df_imputed


In [None]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

def minMax(df):

    num_cols=df.select_dtypes(include=[np.number]).columns.drop("id")
    scaler = MinMaxScaler()
    df[num_cols] = scaler.fit_transform(df[num_cols])
    return df


In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

def cat_encoding(df):

    #Encode categorical variable to allow knn imputation.
    cat_columns = df.select_dtypes(exclude=["number"]).columns
    cat_columns = cat_columns.drop("Session_ID")
    cat_imputer = SimpleImputer(strategy="most_frequent")
    df[cat_columns] = cat_imputer.fit_transform(df[cat_columns])
    encoder = OneHotEncoder(sparse_output=False)
    one_hot_encoded = encoder.fit_transform(df[cat_columns])
    one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(cat_columns), index=df.index)
    df_sklearn_encoded = pd.concat([df.drop(columns = cat_columns), one_hot_df], axis=1)

    return df_sklearn_encoded


In [11]:
import numpy as np
import pandas as pd


df = pd.read_csv('/Users/quentinvillet/oracles_of_paris/kaggle_competition/kaggle/train_dataset_M1_with_id.csv')

#Clean the data (I only clean the 3 columns maybe need to expand to the rest)
df = clean_categorical(df)

#Verify categorical was well cleaned
for col in ["Time_of_Day", "Payment_Method", "Referral_Source"]:
    print(f"Unique values for {col}: {df[col].dropna().unique()}")

#Encode categorical variable using one hot encoding
df = cat_encoding(df)

#Impute numerical, Need to add categorical once Encoded
df_imputed = knn_impute_numeric(df,n_neighbors=5)

#Make sure numerical was well split

print("Purchase value counts:")
print(df_imputed['Purchase'].value_counts())
print("\nPurchase proportions:")
print(df_imputed['Purchase'].value_counts(normalize=True))

num_cols = df.select_dtypes(include=[np.number]).columns.drop("id").tolist()

print("\nüìä Summary statistics for numeric columns after KNN imputation:")
print(df_imputed[num_cols].describe().T[['mean', 'std', 'min', 'max']])

missing = df_imputed[num_cols].isna().sum()
print("\n  Missing values remaining per numeric column:")
print(missing[missing > 0] if missing.sum() > 0 else "‚úÖ No missing values left!")

try:
    num_cols_orig = df.select_dtypes(include=[np.number]).columns
    comparison = pd.DataFrame({
        'mean_before': df[num_cols_orig].mean(),
        'mean_after': df_imputed[num_cols_orig].mean(),
        'std_before': df[num_cols_orig].std(),
        'std_after': df_imputed[num_cols_orig].std()
    })
    print("\nüìà Mean and Std before vs after imputation:")
    print(comparison)
except Exception as e:
    print("\n(‚ö†Ô∏è Skipping before/after comparison ‚Äì original df not available or mismatched columns.)")


#Save the csv
df_imputed.to_csv("df_imputed.csv")

#Return the df after minmax normalisation
clean_df = minMax(df_imputed)
print(clean_df.head())


Unique values for Time_of_Day: ['afternoon' 'morning' 'evening']
Unique values for Payment_Method: ['credit' 'cash' 'bank' 'paypal']
Unique values for Referral_Source: ['social_media' 'direct' 'search_engine' 'ads' 'email']
Purchase value counts:
0.0    8679
1.0    5056
Name: Purchase, dtype: int64

Purchase proportions:
0.0    0.631889
1.0    0.368111
Name: Purchase, dtype: float64

üìä Summary statistics for numeric columns after KNN imputation:
                                        mean         std     min           max
Age                                31.904318   12.157652  18.000     65.000000
Gender                              0.493861    0.496050   0.000      1.000000
Reviews_Read                        2.983246    1.708912   0.000     11.000000
Price                             560.345645  748.412434   5.127  14988.319963
Discount                           25.132523   14.818309   0.000     90.000000
Category                            1.998967    1.410946   0.000      4.0

In [12]:
clean_df

Unnamed: 0,id,Age,Gender,Reviews_Read,Price,Discount,Category,Items_In_Cart,Email_Interaction,Socioeconomic_Status_Score,...,PM_RS_Combo_credit:search_engine,PM_RS_Combo_credit:social_media,PM_RS_Combo_paypal:ads,PM_RS_Combo_paypal:direct,PM_RS_Combo_paypal:email,PM_RS_Combo_paypal:search_engine,PM_RS_Combo_paypal:social_media,Campaign_Period_false,Campaign_Period_true,Session_ID
0,1,0.397645,1.0,0.272727,0.039234,0.244444,0.25,0.214286,0.0,0.324978,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,s0000003
1,2,0.148936,1.0,0.090909,0.033781,0.133333,0.00,0.107143,1.0,0.371531,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,s0000005
2,3,0.085106,0.0,0.272727,0.014231,0.022222,0.25,0.142857,1.0,0.295882,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,s0000007
3,4,0.127660,0.0,0.272727,0.020600,0.011111,0.75,0.000000,1.0,0.470457,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,s0000008
4,5,0.361702,1.0,0.090909,0.032701,0.144444,0.00,0.071429,0.0,0.372874,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,s0000009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13730,13731,0.063830,1.0,0.272727,0.006686,0.288889,0.25,0.000000,1.0,0.274843,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,s0019994
13731,13732,0.063830,1.0,0.090909,0.013399,0.111111,0.75,0.071429,0.0,0.263653,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,s0019995
13732,13733,0.191489,0.0,0.272727,0.050601,0.466667,0.50,0.071429,0.0,0.175470,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,s0019996
13733,13734,0.191489,1.0,0.181818,0.018109,0.255556,0.25,0.178571,0.0,0.252910,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,s0019997


In [14]:
import numpy as np
import pandas as pd

X = clean_df.drop(columns=["Purchase", "id", "Session_ID"], errors='ignore').values.astype("float32")
y = clean_df["Purchase"].values.astype("float32")


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train).astype("float32")
X_test = scaler.transform(X_test).astype("float32")


In [16]:
import tensorflow as tf
from tensorflow.keras import layers, models

model = models.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.Dense(10, activation='relu'),
    layers.Dense(10, activation='relu'),
    layers.Dense(10, activation='relu'),
    layers.Dense(10, activation='relu'),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 10)                510       
                                                                 
 dense_1 (Dense)             (None, 10)                110       
                                                                 
 dense_2 (Dense)             (None, 10)                110       
                                                                 
 dense_3 (Dense)             (None, 10)                110       
                                                                 
 dense_4 (Dense)             (None, 10)                110       
                                                                 
 dense_5 (Dense)             (None, 1)                 11        
                                                                 
Total params: 961
Trainable params: 961
Non-trainable pa

In [17]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=30,
    batch_size=64,
    verbose=1
)


Epoch 1/30


2025-11-24 21:15:29.464504: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [18]:
from sklearn.metrics import f1_score, classification_report

# Predict probabilities
y_pred_prob = model.predict(X_test)

# Convert to 0/1
y_pred = (y_pred_prob > 0.5).astype("int32")

# F1 Score
print("F1 Score:", f1_score(y_test, y_pred))

# Optional: full report
print(classification_report(y_test, y_pred))


F1 Score: 0.7628384687208216
              precision    recall  f1-score   support

         0.0       0.88      0.82      0.85      1736
         1.0       0.72      0.81      0.76      1011

    accuracy                           0.82      2747
   macro avg       0.80      0.81      0.81      2747
weighted avg       0.82      0.82      0.82      2747



In [19]:
df_test = pd.read_csv("/Users/quentinvillet/oracles_of_paris/kaggle_competition/kaggle/test_dataset_M1_with_id.csv")

print(df_test.shape)


(6265, 21)


In [21]:
import numpy as np
import pandas as pd

#Clean the data (I only clean the 3 columns maybe need to expand to the rest)
df_test = clean_categorical(df_test)

#Verify categorical was well cleaned
for col in ["Time_of_Day", "Payment_Method", "Referral_Source"]:
    print(f"Unique values for {col}: {df_test[col].dropna().unique()}")

#Encode categorical variable using one hot encoding
df_test = cat_encoding(df_test)

#Impute numerical, Need to add categorical once Encoded
df_imputed = knn_impute_numeric(df_test,n_neighbors=5)

#Make sure numerical was well split

print("Purchase value counts:")
print(df_imputed['Purchase'].value_counts())
print("\nPurchase proportions:")
print(df_imputed['Purchase'].value_counts(normalize=True))

num_cols = df_test.select_dtypes(include=[np.number]).columns.drop("id").tolist()

print("\nüìä Summary statistics for numeric columns after KNN imputation:")
print(df_imputed[num_cols].describe().T[['mean', 'std', 'min', 'max']])

missing = df_imputed[num_cols].isna().sum()
print("\n  Missing values remaining per numeric column:")
print(missing[missing > 0] if missing.sum() > 0 else "‚úÖ No missing values left!")

try:
    num_cols_orig = df_test.select_dtypes(include=[np.number]).columns
    comparison = pd.DataFrame({
        'mean_before': df_test[num_cols_orig].mean(),
        'mean_after': df_imputed[num_cols_orig].mean(),
        'std_before': df_test[num_cols_orig].std(),
        'std_after': df_imputed[num_cols_orig].std()
    })
    print("\nüìà Mean and Std before vs after imputation:")
    print(comparison)
except Exception as e:
    print("\n(‚ö†Ô∏è Skipping before/after comparison ‚Äì original df not available or mismatched columns.)")


#Save the csv
df_imputed.to_csv("df_imputed.csv")

#Return the df after minmax normalisation
df_test = minMax(df_imputed)
print(df_test.head())


Unique values for Time_of_Day: ['morning' 'evening' 'afternoon']
Unique values for Payment_Method: ['paypal' 'cash' 'bank' 'credit']
Unique values for Referral_Source: ['direct' 'search_engine' 'ads' 'social_media' 'email']
Purchase value counts:


KeyError: 'Purchase'

In [22]:
import pandas as pd
import numpy as np
import pickle
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# -----------------------------------------
# LOAD TRAINING DATA
# -----------------------------------------
df = pd.read_csv("/Users/quentinvillet/oracles_of_paris/kaggle_competition/kaggle/train_dataset_M1_with_id.csv")

# -----------------------------------------
# CLEANING FUNCTION (same as your pipeline)
# -----------------------------------------
def clean_categorical(df):
    df = df.copy()
    cat_cols = df.select_dtypes(include="object").columns

    for col in cat_cols:
        df[col] = df[col].astype(str).str.lower().str.strip()
        df[col] = df[col].replace("nan", np.nan)
        col_lower = col.lower()

        if col_lower == "time_of_day":
            df[col] = df[col].replace({
                r'^m[0o].*rning$': 'morning',
                r'^aftern?[0o].*n$': 'afternoon',
                r'^even.*g$': 'evening',
            }, regex=True)

        elif col_lower == "payment_method":
            df[col] = df[col].replace({
                r'^cred.*$': 'credit',
                r'^cash$': 'cash',
                r'^pay[\s_]?pal$': 'paypal',
                r'^bank.*$': 'bank',
            }, regex=True)

        elif col_lower == "referral_source":
            df[col] = df[col].replace({
                r'^s[0o].*cial.*media$': 'social_media',
                r'^search.*engine$': 'search_engine',
                r'^ads$': 'ads',
                r'^email$': 'email',
                r'^direct$': 'direct',
            }, regex=True)

    return df

df = clean_categorical(df)

# -----------------------------------------
# SPLIT NUMERIC / CATEGORICAL
# -----------------------------------------
y = df["Purchase"]
df = df.drop(columns=["Purchase"])

cat_cols = df.select_dtypes(exclude=[np.number]).columns
num_cols = df.select_dtypes(include=[np.number]).columns.drop("id")

# -----------------------------------------
# FIT CATEGORICAL IMPUTER + ENCODER
# -----------------------------------------
cat_imputer = SimpleImputer(strategy="most_frequent")
df_cat = cat_imputer.fit_transform(df[cat_cols])

encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
df_cat_encoded = encoder.fit_transform(df_cat)

# -----------------------------------------
# FIT NUMERIC IMPUTER
# -----------------------------------------
imputer = KNNImputer(n_neighbors=5, weights="distance")
df_num_imputed = imputer.fit_transform(df[num_cols])

# -----------------------------------------
# COMBINE PROCESSED COLUMNS
# -----------------------------------------
df_processed = np.concatenate([df_num_imputed, df_cat_encoded], axis=1)

# -----------------------------------------
# FIT SCALER
# -----------------------------------------
scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(df_processed)

# -----------------------------------------
# SAVE TRANSFORMERS
# -----------------------------------------
pickle.dump(cat_imputer, open("cat_imputer.pkl", "wb"))
pickle.dump(encoder, open("encoder.pkl", "wb"))
pickle.dump(imputer, open("num_imputer.pkl", "wb"))
pickle.dump(scaler, open("scaler.pkl", "wb"))

# -----------------------------------------
# Save X and y for training NN
# -----------------------------------------
np.save("X_train.npy", df_scaled.astype("float32"))
np.save("y_train.npy", y.values.astype("float32"))

print("Training pipeline complete. Saved:")
print("- cat_imputer.pkl")
print("- encoder.pkl")
print("- num_imputer.pkl")
print("- scaler.pkl")
print("- X_train.npy, y_train.npy")


Training pipeline complete. Saved:
- cat_imputer.pkl
- encoder.pkl
- num_imputer.pkl
- scaler.pkl
- X_train.npy, y_train.npy


In [23]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models

# Load processed data
X = np.load("X_train.npy")
y = np.load("y_train.npy")

# Train/test split
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Build NN
model = models.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.Dense(64, activation="relu"),
    layers.Dense(32, activation="relu"),
    layers.Dense(16, activation="relu"),
    layers.Dense(8, activation="relu"),
    layers.Dense(1, activation="sigmoid")
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=30,
    batch_size=64
)

model.save("nn_model.keras")
print("Neural network trained and saved!")


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Neural network trained and saved!


In [24]:
import pandas as pd
import numpy as np
import pickle

# Load test data
df_test = pd.read_csv("/Users/quentinvillet/oracles_of_paris/kaggle_competition/kaggle/test_dataset_M1_with_id.csv")
ids = df_test["id"]

# CLEAN
df_test = clean_categorical(df_test)

# Load transformers
cat_imputer = pickle.load(open("cat_imputer.pkl", "rb"))
encoder = pickle.load(open("encoder.pkl", "rb"))
num_imputer = pickle.load(open("num_imputer.pkl", "rb"))
scaler = pickle.load(open("scaler.pkl", "rb"))

# Categorical
cat_cols_test = df_test.select_dtypes(exclude=[np.number]).columns
num_cols_test = df_test.select_dtypes(include=[np.number]).columns.drop("id")

df_cat = cat_imputer.transform(df_test[cat_cols_test])
df_cat_encoded = encoder.transform(df_cat)

# Numeric
df_num = num_imputer.transform(df_test[num_cols_test])

# Combine
df_test_processed = np.concatenate([df_num, df_cat_encoded], axis=1)

# Scale
df_test_scaled = scaler.transform(df_test_processed).astype("float32")


In [25]:
from tensorflow.keras.models import load_model

model = load_model("nn_model.keras")

preds = (model.predict(df_test_scaled) > 0.5).astype("int32").flatten()

submission = pd.DataFrame({
    "id": ids,
    "Purchase": preds
})

submission.to_csv("submission.csv", index=False)
print("Saved submission.csv!")


Saved submission.csv!


In [26]:
import os

print(os.listdir())


['scaler.pkl', 'Quentin', '.DS_Store', 'kaggle', 'encoder.pkl', 'requirements.txt', 'simple_model_testing.ipynb', 'submission.csv', 'time_series.ipynb', '__init__.py', 'model_test.ipynb', 'README.md', 'y_train.npy', 'df_imputed.csv', 'cat_imputer.pkl', 'model_test3.ipynb', '.gitignore', 'neural_net.ipynb', 'model_test2.ipynb', 'num_imputer.pkl', 'nn_model.keras', 'X_train.npy', '.git', 'data', 'notebooks']
