In [19]:
import numpy as np

def clean_categorical(df):
    cat_cols = df.select_dtypes(include="object").columns

    for col in cat_cols:
        df[col] = df[col].astype(str).str.lower().str.strip()
        df[col] = df[col].replace("nan", np.nan)
        col_lower = col.lower()

        if col_lower == "time_of_day":
            df[col] = df[col].replace({
                r'^m[0o].*rning$': 'morning',
                r'^aftern?[0o].*n$': 'afternoon',
                r'^even.*g$': 'evening',
                }, regex=True)

        elif col_lower == 'payment_method':
            df[col] = df[col].replace({
                r'^cred.*$': 'credit',
                r'^cash$': 'cash',
                r'^pay[\s_]?pal$': 'paypal',
                r'^bank.*$': 'bank',
            }, regex=True)

        # Normalize Referral_Source
        elif col_lower == 'referral_source':
            df[col] = df[col].replace({
                r'^s[0o].*cial.*media$': 'social_media',
                r'^search.*engine$': 'search_engine',
                r'^ads$': 'ads',
                r'^email$': 'email',
                r'^direct$': 'direct',
            }, regex=True)
    return df


In [20]:
import pandas as pd
from sqlalchemy.engine import row

df = pd.read_csv('/Users/quentinvillet/oracles_of_paris/kaggle_competition/kaggle/train_dataset_M1_with_id.csv')

num_duplicates= df.duplicated().sum()

print(f"Number of fully duplicate rows: {num_duplicates}")

#Duplicate ids
num_duplicate_ids = df.duplicated(subset=["id"]).sum()

print(f"Number of fully duplicate rows: {num_duplicate_ids}")

#None rows are 90% similar
# from thefuzz import fuzz
#from tqdm import tqdm
#
#rows_as_text = df.head(10000).astype(str).agg("".join, axis=1)
#
#threshold = 90
#similar_pairs = []
#
#for i in tqdm(range(len(rows_as_text)), desc="Comapring rows"):
#    for j in range(i+1, len(rows_as_text)):
#        sim = fuzz.ratio(rows_as_text[i], rows_as_text[j])
#        if sim >= threshold:
#            similar_pairs.append((i,j,sim))
#
#print(f"{len(similar_pairs)} pairs of rows are ‚â•{threshold}% similar")


Number of fully duplicate rows: 0
Number of fully duplicate rows: 0


In [21]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

def knn_impute_numeric(df, n_neighbors=5):
    df = df.copy()
    num_cols = df.select_dtypes(include=[np.number]).columns.drop("id")
    non_num_cols=df.select_dtypes(exclude=[np.number]).columns
    df_num = df[num_cols]

    imputer = KNNImputer(n_neighbors = n_neighbors, weights="distance")
    df_num_imputed = pd.DataFrame(imputer.fit_transform(df_num), columns=num_cols, index=df.index)
    df_imputed = pd.concat([df["id"],df_num_imputed, df[non_num_cols]], axis=1)

    return df_imputed


In [22]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

def minMax(df):

    num_cols=df.select_dtypes(include=[np.number]).columns.drop("id")
    scaler = MinMaxScaler()
    df[num_cols] = scaler.fit_transform(df[num_cols])
    return df


In [23]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

def cat_encoding(df):

    #Encode categorical variable to allow knn imputation.
    cat_columns = df.select_dtypes(exclude=["number"]).columns
    cat_columns = cat_columns.drop("Session_ID")
    cat_imputer = SimpleImputer(strategy="most_frequent")
    df[cat_columns] = cat_imputer.fit_transform(df[cat_columns])
    encoder = OneHotEncoder(sparse_output=False)
    one_hot_encoded = encoder.fit_transform(df[cat_columns])
    one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(cat_columns), index=df.index)
    df_sklearn_encoded = pd.concat([df.drop(columns = cat_columns), one_hot_df], axis=1)

    return df_sklearn_encoded


In [24]:
import numpy as np
import pandas as pd

from knn_imputation import knn_impute_numeric
from cleaning import clean_categorical
from one_hot import cat_encoding
from minmax import minMax


df = pd.read_csv('/Users/quentinvillet/oracles_of_paris/kaggle_competition/kaggle/train_dataset_M1_with_id.csv')

#Clean the data (I only clean the 3 columns maybe need to expand to the rest)
df = clean_categorical(df)

#Verify categorical was well cleaned
for col in ["Time_of_Day", "Payment_Method", "Referral_Source"]:
    print(f"Unique values for {col}: {df[col].dropna().unique()}")

#Encode categorical variable using one hot encoding
df = cat_encoding(df)

#Impute numerical, Need to add categorical once Encoded
df_imputed = knn_impute_numeric(df,n_neighbors=5)

#Make sure numerical was well split

print("Purchase value counts:")
print(df_imputed['Purchase'].value_counts())
print("\nPurchase proportions:")
print(df_imputed['Purchase'].value_counts(normalize=True))

num_cols = df.select_dtypes(include=[np.number]).columns.drop("id").tolist()

print("\nüìä Summary statistics for numeric columns after KNN imputation:")
print(df_imputed[num_cols].describe().T[['mean', 'std', 'min', 'max']])

missing = df_imputed[num_cols].isna().sum()
print("\n  Missing values remaining per numeric column:")
print(missing[missing > 0] if missing.sum() > 0 else "‚úÖ No missing values left!")

try:
    num_cols_orig = df.select_dtypes(include=[np.number]).columns
    comparison = pd.DataFrame({
        'mean_before': df[num_cols_orig].mean(),
        'mean_after': df_imputed[num_cols_orig].mean(),
        'std_before': df[num_cols_orig].std(),
        'std_after': df_imputed[num_cols_orig].std()
    })
    print("\nüìà Mean and Std before vs after imputation:")
    print(comparison)
except Exception as e:
    print("\n(‚ö†Ô∏è Skipping before/after comparison ‚Äì original df not available or mismatched columns.)")


#Save the csv
df_imputed.to_csv("df_imputed.csv")

#Return the df after minmax normalisation
clean_df = minMax(df_imputed)
print(clean_df.head())


Unique values for Time_of_Day: ['afternoon' 'morning' 'evening']
Unique values for Payment_Method: ['credit' 'cash' 'bank' 'paypal']
Unique values for Referral_Source: ['social_media' 'direct' 'search_engine' 'ads' 'email']
Purchase value counts:
0.0    8679
1.0    5056
Name: Purchase, dtype: int64

Purchase proportions:
0.0    0.631889
1.0    0.368111
Name: Purchase, dtype: float64

üìä Summary statistics for numeric columns after KNN imputation:
                                        mean         std     min           max
Age                                31.904318   12.157652  18.000     65.000000
Gender                              0.493861    0.496050   0.000      1.000000
Reviews_Read                        2.983246    1.708912   0.000     11.000000
Price                             560.345645  748.412434   5.127  14988.319963
Discount                           25.132523   14.818309   0.000     90.000000
Category                            1.998967    1.410946   0.000      4.0

In [25]:
df

Unnamed: 0,id,Age,Gender,Reviews_Read,Price,Discount,Category,Items_In_Cart,Email_Interaction,Socioeconomic_Status_Score,...,PM_RS_Combo_credit:email,PM_RS_Combo_credit:search_engine,PM_RS_Combo_credit:social_media,PM_RS_Combo_paypal:ads,PM_RS_Combo_paypal:direct,PM_RS_Combo_paypal:email,PM_RS_Combo_paypal:search_engine,PM_RS_Combo_paypal:social_media,Campaign_Period_false,Campaign_Period_true
0,1,,1.0,3.0,592.975,22.0,1.0,6.0,0.0,7.26,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2,25.0,1.0,1.0,511.279,12.0,0.0,3.0,1.0,8.30,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,22.0,0.0,3.0,218.360,2.0,1.0,4.0,1.0,6.61,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,4,24.0,0.0,3.0,313.781,1.0,3.0,0.0,1.0,10.51,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,5,35.0,1.0,1.0,495.088,13.0,0.0,2.0,0.0,8.33,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13730,13731,21.0,1.0,3.0,105.298,26.0,1.0,0.0,1.0,6.14,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13731,13732,21.0,1.0,1.0,205.893,10.0,3.0,2.0,0.0,5.89,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
13732,13733,27.0,0.0,3.0,763.285,42.0,2.0,2.0,0.0,3.92,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13733,13734,27.0,1.0,2.0,276.455,23.0,1.0,5.0,0.0,5.65,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [27]:
import numpy as np

print("Any NaN in X?", np.isnan(X).any())
print("Any inf in X?", np.isinf(X).any())
print("Any NaN in y?", np.isnan(y).any())
print("Any inf in y?", np.isinf(y).any())


Any NaN in X? True
Any inf in X? False
Any NaN in y? False
Any inf in y? False


In [28]:
import pandas as pd
import numpy as np

# Replace NaN in pandas before converting to numpy
df_fixed = df.fillna(df.median(numeric_only=True))

X = df_fixed.drop(columns=["Purchase", "id", "Session_ID"], errors='ignore').values.astype("float32")
y = df_fixed["Purchase"].values.astype("float32")

# Double-check
print("Any NaN in X?", np.isnan(X).any())
print("Any NaN in y?", np.isnan(y).any())


Any NaN in X? False
Any NaN in y? False


In [29]:
import numpy as np
import pandas as pd

df_fixed = df.fillna(df.median(numeric_only=True))

X = df_fixed.drop(columns=["Purchase", "id", "Session_ID"], errors='ignore').values.astype("float32")
y = df_fixed["Purchase"].values.astype("float32")


In [30]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train).astype("float32")
X_test = scaler.transform(X_test).astype("float32")


In [31]:
import tensorflow as tf
from tensorflow.keras import layers, models

model = models.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.Dense(10, activation='relu'),
    layers.Dense(10, activation='relu'),
    layers.Dense(10, activation='relu'),
    layers.Dense(10, activation='relu'),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 10)                510       
                                                                 
 dense_7 (Dense)             (None, 10)                110       
                                                                 
 dense_8 (Dense)             (None, 10)                110       
                                                                 
 dense_9 (Dense)             (None, 10)                110       
                                                                 
 dense_10 (Dense)            (None, 10)                110       
                                                                 
 dense_11 (Dense)            (None, 1)                 11        
                                                                 
Total params: 961
Trainable params: 961
Non-trainable 

In [32]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=30,
    batch_size=64,
    verbose=1
)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [33]:
from sklearn.metrics import f1_score, classification_report

# Predict probabilities
y_pred_prob = model.predict(X_test)

# Convert to 0/1
y_pred = (y_pred_prob > 0.5).astype("int32")

# F1 Score
print("F1 Score:", f1_score(y_test, y_pred))

# Optional: full report
print(classification_report(y_test, y_pred))


F1 Score: 0.7783837382250868
              precision    recall  f1-score   support

         0.0       0.87      0.87      0.87      1736
         1.0       0.78      0.78      0.78      1011

    accuracy                           0.84      2747
   macro avg       0.83      0.82      0.82      2747
weighted avg       0.84      0.84      0.84      2747



In [None]:
df_test = pd.read_csv("/Users/quentinvillet/oracles_of_paris/kaggle_competition/kaggle/test_dataset_M1_with_id.csv")

print(df_test.shape)


(6265, 21)


In [36]:
def clean_categorical(df_test):
    cat_cols = df_test.select_dtypes(include="object").columns

    for col in cat_cols:
        df_test[col] = df_test[col].astype(str).str.lower().str.strip()
        df_test[col] = df_test[col].replace("nan", np.nan)
        col_lower = col.lower()

        if col_lower == "time_of_day":
            df_test[col] = df_test[col].replace({
                r'^m[0o].*rning$': 'morning',
                r'^aftern?[0o].*n$': 'afternoon',
                r'^even.*g$': 'evening',
                }, regex=True)

        elif col_lower == 'payment_method':
            df_test[col] = df_test[col].replace({
                r'^cred.*$': 'credit',
                r'^cash$': 'cash',
                r'^pay[\s_]?pal$': 'paypal',
                r'^bank.*$': 'bank',
            }, regex=True)

        # Normalize Referral_Source
        elif col_lower == 'referral_source':
            df_test[col] = df_test[col].replace({
                r'^s[0o].*cial.*media$': 'social_media',
                r'^search.*engine$': 'search_engine',
                r'^ads$': 'ads',
                r'^email$': 'email',
                r'^direct$': 'direct',
            }, regex=True)
    return df_test

In [37]:
num_duplicates= df_test.duplicated().sum()

print(f"Number of fully duplicate rows: {num_duplicates}")

#Duplicate ids
num_duplicate_ids = df_test.duplicated(subset=["id"]).sum()

print(f"Number of fully duplicate rows: {num_duplicate_ids}")

Number of fully duplicate rows: 0
Number of fully duplicate rows: 0


In [38]:
def knn_impute_numeric(df_test, n_neighbors=5):
    df_test = df_test.copy()
    num_cols = df_test.select_dtypes(include=[np.number]).columns.drop("id")
    non_num_cols=df_test.select_dtypes(exclude=[np.number]).columns
    df_num = df_test[num_cols]

    imputer = KNNImputer(n_neighbors = n_neighbors, weights="distance")
    df_num_imputed = pd.DataFrame(imputer.fit_transform(df_num), columns=num_cols, index=df.index)
    df_imputed = pd.concat([df["id"],df_num_imputed, df[non_num_cols]], axis=1)

    return df_imputed


In [40]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

def minMax(df_test):

    num_cols=df_test.select_dtypes(include=[np.number]).columns.drop("id")
    scaler = MinMaxScaler()
    df_test[num_cols] = scaler.fit_transform(df_test[num_cols])
    return df_test


In [41]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

def cat_encoding(df_test):

    #Encode categorical variable to allow knn imputation.
    cat_columns = df_test.select_dtypes(exclude=["number"]).columns
    cat_columns = cat_columns.drop("Session_ID")
    cat_imputer = SimpleImputer(strategy="most_frequent")
    df_test[cat_columns] = cat_imputer.fit_transform(df_test[cat_columns])
    encoder = OneHotEncoder(sparse_output=False)
    one_hot_encoded = encoder.fit_transform(df_test[cat_columns])
    one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(cat_columns), index=df_test.index)
    df_sklearn_encoded = pd.concat([df.drop(columns = cat_columns), one_hot_df], axis=1)

    return df_sklearn_encoded


In [45]:
import numpy as np
import pandas as pd

from knn_imputation import knn_impute_numeric
from cleaning import clean_categorical
from one_hot import cat_encoding
from minmax import minMax


df_test = pd.read_csv('/Users/quentinvillet/oracles_of_paris/kaggle_competition/kaggle/test_dataset_M1_with_id.csv')

#Clean the data (I only clean the 3 columns maybe need to expand to the rest)
df_test = clean_categorical(df_test)

#Verify categorical was well cleaned
for col in ["Time_of_Day", "Payment_Method", "Referral_Source"]:
    print(f"Unique values for {col}: {df_test[col].dropna().unique()}")

#Encode categorical variable using one hot encoding
df_test = cat_encoding(df_test)

#Impute numerical, Need to add categorical once Encoded
df_imputed = knn_impute_numeric(df_test,n_neighbors=5)

#Make sure numerical was well split

print("Purchase value counts:")
print(df_imputed['Purchase'].value_counts())
print("\nPurchase proportions:")
print(df_imputed['Purchase'].value_counts(normalize=True))

num_cols = df_test.select_dtypes(include=[np.number]).columns.drop("id").tolist()

print("\nüìä Summary statistics for numeric columns after KNN imputation:")
print(df_imputed[num_cols].describe().T[['mean', 'std', 'min', 'max']])

missing = df_imputed[num_cols].isna().sum()
print("\n  Missing values remaining per numeric column:")
print(missing[missing > 0] if missing.sum() > 0 else "‚úÖ No missing values left!")

try:
    num_cols_orig = df.select_dtypes(include=[np.number]).columns
    comparison = pd.DataFrame({
        'mean_before': df_test[num_cols_orig].mean(),
        'mean_after': df_imputed[num_cols_orig].mean(),
        'std_before': df_test[num_cols_orig].std(),
        'std_after': df_imputed[num_cols_orig].std()
    })
    print("\nüìà Mean and Std before vs after imputation:")
    print(comparison)
except Exception as e:
    print("\n(‚ö†Ô∏è Skipping before/after comparison ‚Äì original df not available or mismatched columns.)")


#Save the csv
df_imputed.to_csv("df_imputed.csv")

#Return the df after minmax normalisation
clean_df = minMax(df_imputed)
print(clean_df.head())


Unique values for Time_of_Day: ['morning' 'evening' 'afternoon']
Unique values for Payment_Method: ['paypal' 'cash' 'bank' 'credit']
Unique values for Referral_Source: ['direct' 'search_engine' 'ads' 'social_media' 'email']
Purchase value counts:


KeyError: 'Purchase'