In [2]:
import pandas as pd
# Read the test workbook (test_data.xlsx) into a DataFrame.
file_path = "/Users/saranya/Documents/Projects/customer_conversion_project/test_data.xlsx"
test_df = pd.read_excel(file_path)

In [3]:
# Read the training workbook (train_data.xlsx) into a DataFrame.
# file_path is rebound here, so it no longer points at the test workbook.
file_path = "/Users/saranya/Documents/Projects/customer_conversion_project/train_data.xlsx"
train_df = pd.read_excel(file_path)

In [4]:
# Show the dtype of every column in the training frame.
train_df.dtypes

year                     int64
month                    int64
day                      int64
order                    int64
country                  int64
session_id               int64
page1_main_category      int64
page2_clothing_model    object
colour                   int64
location                 int64
model_photography        int64
price                    int64
price_2                  int64
page                     int64
dtype: object

In [5]:
# Report the shape of both frames, then preview the first training rows.
train_shape, test_shape = train_df.shape, test_df.shape
print("Train shape: ", train_shape)
print("Test shape: ", test_shape)
train_df.head()

Train shape:  (132379, 14)
Test shape:  (33095, 14)


Unnamed: 0,year,month,day,order,country,session_id,page1_main_category,page2_clothing_model,colour,location,model_photography,price,price_2,page
0,2008,6,22,21,29,15648,3,C20,13,1,2,48,1,2
1,2008,5,19,6,29,10018,2,B26,13,3,1,57,1,2
2,2008,7,15,2,29,19388,3,C13,9,5,1,48,1,1
3,2008,5,2,2,29,7181,2,B11,2,4,1,43,2,1
4,2008,6,9,16,29,13493,2,B31,9,5,1,57,1,2


In [6]:
# Count how often each page2_clothing_model value occurs in the training frame.
train_df['page2_clothing_model'].value_counts()

page2_clothing_model
B4     2824
A2     2389
A11    2247
P1     2134
B10    2011
       ... 
P31      64
P66      42
P79       2
P22       2
P54       1
Name: count, Length: 216, dtype: int64

In [7]:
#Encoding
from sklearn.preprocessing import LabelEncoder

# Label-encode every object-dtype column of train_df in place, keeping the
# fitted encoder for each such column in `encoders`, keyed by column name.
object_cols = [name for name in train_df.columns if train_df[name].dtype == object]
encoders = {}
for col in object_cols:
    label_encoder = LabelEncoder()
    label_encoder.fit(train_df[col])
    train_df[col] = label_encoder.transform(train_df[col])
    encoders[col] = label_encoder


In [8]:
# Display the mapping of column name -> fitted LabelEncoder built above.
encoders

{'page2_clothing_model': LabelEncoder()}

In [9]:
# save encoder in pickle file
import pickle

# Serialise the whole `encoders` dict to label_encoder.pkl in the current
# working directory, overwriting any existing file of that name.
with open("label_encoder.pkl", "wb") as f:
    f.write(pickle.dumps(encoders))

print("label encoders saved successfully")


label encoders saved successfully


In [10]:
# Print the column dtypes again, after the label-encoding step has run.
print(train_df.dtypes)

year                    int64
month                   int64
day                     int64
order                   int64
country                 int64
session_id              int64
page1_main_category     int64
page2_clothing_model    int64
colour                  int64
location                int64
model_photography       int64
price                   int64
price_2                 int64
page                    int64
dtype: object


In [11]:
# Preview the first rows of the training frame after label encoding.
train_df.head()

Unnamed: 0,year,month,day,order,country,session_id,page1_main_category,page2_clothing_model,colour,location,model_photography,price,price_2,page
0,2008,6,22,21,29,15648,3,88,13,1,2,48,1,2
1,2008,5,19,6,29,10018,2,60,13,3,1,57,1,2
2,2008,7,15,2,29,19388,3,80,9,5,1,48,1,1
3,2008,5,2,2,29,7181,2,45,2,4,1,43,2,1
4,2008,6,9,16,29,13493,2,66,9,5,1,57,1,2


In [12]:
#create target in train_df
# purchase_flag is 1 where order >= 4 and 0 otherwise. It is computed as a
# vectorised comparison rather than a per-row Python lambda; the result is
# the same int64 0/1 column.
# NOTE(review): the flag is a pure function of `order`, so any model that is
# also given `order` as a feature can reproduce the target exactly — confirm
# that this is intended.
train_df['purchase_flag'] = (train_df['order'] >= 4).astype('int64')

#Check distribution
print(train_df['purchase_flag'].value_counts())


purchase_flag
1    85422
0    46957
Name: count, dtype: int64


In [13]:
# regression target
# revenue = price * price_2; the two source columns are then removed from
# train_df in place.
# The guard makes the cell safe to re-run: once price/price_2 have been
# dropped, executing it again is a no-op instead of raising KeyError.
# NOTE(review): the rows displayed earlier only show price_2 values of 1 and
# 2 — confirm it is a multiplier/quantity and not a categorical flag before
# relying on this product.
price_cols = ["price", "price_2"]
if set(price_cols).issubset(train_df.columns):
    train_df['revenue'] = train_df['price'] * train_df['price_2']
    train_df.drop(price_cols, axis=1, inplace=True)

In [14]:
# Preview the training frame with the purchase_flag and revenue columns added.
train_df.head()

Unnamed: 0,year,month,day,order,country,session_id,page1_main_category,page2_clothing_model,colour,location,model_photography,page,purchase_flag,revenue
0,2008,6,22,21,29,15648,3,88,13,1,2,2,1,48
1,2008,5,19,6,29,10018,2,60,13,3,1,2,1,57
2,2008,7,15,2,29,19388,3,80,9,5,1,1,0,48
3,2008,5,2,2,29,7181,2,45,2,4,1,1,0,86
4,2008,6,9,16,29,13493,2,66,9,5,1,2,1,57


In [15]:
from imblearn.over_sampling import SMOTENC ,SMOTE

#Define feature(x) and target(y)
# NOTE(review): x still contains `order`, the column purchase_flag was derived
# from, so the target is recoverable from the features — confirm this is
# intended before trusting downstream scores.
x=train_df.drop('purchase_flag',axis=1)
y=train_df['purchase_flag']

# Columns that hold category codes rather than quantities. `location` and
# `model_photography` are listed here as well because the modelling cell
# treats them as nominal (see nominal_data); leaving them out would let
# SMOTENC interpolate them as if they were continuous.
categorical_col=[
    'country',
    'colour',
    'page1_main_category',
    'page2_clothing_model',
    'location',
    'model_photography',
]

#SMOTENC takes the categorical features as positional column indices of x
categorical_indices=[x.columns.get_loc(col) for col in categorical_col]

#Apply SMOTENC (fixed random_state so the resampling is reproducible)
smote=SMOTENC(categorical_features=categorical_indices,random_state=42)
x_resampled,y_resampled=smote.fit_resample(x,y)

#convert to dataframe and re-attach the resampled target

df_resampled=pd.DataFrame(x_resampled,columns=x.columns)
df_resampled['purchase_flag']=y_resampled

#verify new class distribution 
df_resampled['purchase_flag'].value_counts()

purchase_flag
1    85422
0    85422
Name: count, dtype: int64

In [16]:
import warnings
import numpy as num
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder,StandardScaler

# Installs a blanket "ignore" filter: no category or module is given, so it
# applies to every warning raised from here on, including deprecation and
# convergence warnings from the libraries imported above.
warnings.filterwarnings("ignore")

In [None]:
# model building 

from sklearn.metrics import accuracy_score,roc_auc_score,roc_curve
import matplotlib.pyplot as plt
# Names used below that no earlier cell brings into scope: numpy is only
# imported under the alias `num`, and FunctionTransformer is not imported.
import numpy as np
from sklearn.preprocessing import FunctionTransformer

#Define column groups

# Columns holding category codes; they are passed through OrdinalEncoder.
nominal_data=['country','page1_main_category','page2_clothing_model','colour','location','model_photography']

# Numeric columns that get log1p followed by standardisation.
# NOTE(review): `order` is the column purchase_flag was derived from — confirm
# it should be a model input.
log_scale_cols=['order','revenue','page']

#Define transformation pipelines

# log1p first, then StandardScaler on the log-transformed values.
log_scale_pipeline=Pipeline([
    ('log',FunctionTransformer(np.log1p,validate=True)),
    ('scaler',StandardScaler()),
])


#combine Transformation 

# Every transformer entry is a (name, transformer, columns) triple. Columns
# named in neither list are passed through unchanged (remainder='passthrough').
preprocessor=ColumnTransformer(
    transformers=[
        ('label',OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1),nominal_data),
        ('log_scale',log_scale_pipeline,log_scale_cols),
    ],
    remainder='passthrough',
)

#define full pipeline with classifier