In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,f1_score,precision_score,recall_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Location of the raw churn dataset. Kept in one named constant so the path can
# be changed in a single place when the file lives somewhere else.
DATA_PATH = '/content/6_ecom_v2.csv'

# Load the complete dataset; a later cell splits it into training and hold-out parts.
df = pd.read_csv(DATA_PATH)

In [None]:
# Display five randomly chosen rows as a quick look at the loaded data.
df.sample(5)

Unnamed: 0,Churn,Tenure,CityTier,WarehouseToHome,PreferredPaymentMode,NumberOfDeviceRegistered,PreferedOrderCat,SatisfactionScore,MaritalStatus,NumberOfAddress,Complain,OrderAmountHikeFromlastYear,OrderCount,DaySinceLastOrder,CashbackAmount
4837,0,6,1,8.0,COD,4,Laptop & Accessory,5,Married,6,No,15,2,3.0,165.87
661,0,0,1,14.0,Debit Card,5,Phone,3,Divorced,2,No,19,1,2.0,146.21
2998,0,7,1,14.0,Credit Card,4,Fashion,3,Divorced,2,No,12,2,8.0,241.91
546,0,9,1,16.0,Debit Card,3,Phone,4,Married,2,No,14,1,0.0,120.08
772,0,30,3,9.0,Debit Card,4,Laptop & Accessory,2,Single,6,No,17,1,5.0,151.71


In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5058 entries, 0 to 5057
Data columns (total 15 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Churn                        5058 non-null   int64  
 1   Tenure                       5058 non-null   int64  
 2   CityTier                     5058 non-null   int64  
 3   WarehouseToHome              5058 non-null   float64
 4   PreferredPaymentMode         5058 non-null   object 
 5   NumberOfDeviceRegistered     5058 non-null   int64  
 6   PreferedOrderCat             5058 non-null   object 
 7   SatisfactionScore            5058 non-null   int64  
 8   MaritalStatus                5058 non-null   object 
 9   NumberOfAddress              5058 non-null   int64  
 10  Complain                     5058 non-null   object 
 11  OrderAmountHikeFromlastYear  5058 non-null   int64  
 12  OrderCount                   5058 non-null   int64  
 13  DaySinceLastOrder 

In [None]:
# Hold out 10% of the rows as a test set that is never used for model selection.
# The split is stratified on the target so that both parts keep the same churn
# rate; with an imbalanced target this keeps hold-out recall comparable with the
# cross-validated recall. (train_test_split is already imported in the imports cell.)
df_train, df_test = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
    stratify=df['Churn'],
)

# NOTE: `df` is rebound to the training part, so every later cell that uses `df`
# sees training rows only. Re-running this cell without reloading the CSV first
# would split the already-reduced training set a second time.
df = df_train

In [None]:
# Write the hold-out rows to test_data.csv, omitting the DataFrame index.
df_test.to_csv('test_data.csv',index=False)

In [None]:
# Numeric feature columns; the preprocessing step standardises these.
num = [
    'Tenure',
    'CityTier',
    'WarehouseToHome',
    'NumberOfDeviceRegistered',
    'SatisfactionScore',
    'NumberOfAddress',
    'OrderAmountHikeFromlastYear',
    'OrderCount',
    'DaySinceLastOrder',
    'CashbackAmount',
]

In [None]:
# Categorical feature columns (object dtype in df.info()); the preprocessing
# step encodes these into numbers.
cat = [
    'PreferredPaymentMode',
    'PreferedOrderCat',
    'MaritalStatus',
    'Complain',
]

## Ordinal Encoding

In [None]:
# Preprocessing variant 1: standardise the numeric columns and map each
# categorical column to integer codes. Columns listed in neither `num` nor
# `cat` are passed through unchanged.
pre_processor = ColumnTransformer(
    [
        ('num', StandardScaler(), num),
        ('cat', OrdinalEncoder(), cat),
    ],
    remainder='passthrough',
)

In [None]:
# Baseline pipeline: ordinal-encoding preprocessor followed by logistic regression.
pipeline = Pipeline(
    steps=[
        ('preprocessor', pre_processor),
        ('model', LogisticRegression()),
    ]
)

In [None]:
# 5-fold cross-validated recall of the baseline pipeline on the training frame;
# the cell displays the mean over the folds.
scores = cross_val_score(
    pipeline,
    df.drop(columns='Churn'),
    df['Churn'],
    scoring='recall',
    cv=5,
)
scores.mean()

np.float64(0.4039595499216636)

In [None]:
def evaluate_model(model_name, model, transformer=None, n_splits=10, random_state=42):
  """Cross-validate one classifier and report its mean recall.

  The classifier is placed behind a preprocessing step in a pipeline and scored
  with shuffled k-fold cross-validation on the notebook-level frame ``df``
  (features: every column except ``Churn``; target: ``Churn``).

  Parameters
  ----------
  model_name : str
      Label stored as the first element of the result.
  model : estimator
      Unfitted classifier used as the final pipeline step.
  transformer : transformer, optional
      Preprocessing step placed in front of ``model``. When omitted, the
      notebook-level ``pre_processor`` (ordinal encoding) is used, which is the
      original behaviour of this function.
  n_splits : int, default 10
      Number of cross-validation folds.
  random_state : int, default 42
      Seed for shuffling the rows before they are split into folds.

  Returns
  -------
  list
      ``[model_name, mean_recall]`` where ``mean_recall`` is the average of the
      per-fold recall scores.
  """
  # Resolve the default lazily so the global is looked up at call time.
  if transformer is None:
    transformer = pre_processor

  candidate = Pipeline([
        ('preprocessor', transformer),
        ('model', model)
  ])
  folds = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
  fold_recalls = cross_val_score(
      candidate,
      df.drop('Churn', axis=1),
      df['Churn'],
      cv=folds,
      scoring='recall',
  )

  return [model_name, fold_recalls.mean()]

In [None]:
# Candidate classifiers, keyed by the label shown in the results table.
# Estimators that involve randomness are seeded so that re-running the notebook
# reproduces the same recall ranking.
models = {
    'Logistic Regression': LogisticRegression(),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(),
    'Naive Bayes': GaussianNB(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
}

In [None]:
# Evaluate every candidate with the ordinal-encoding preprocessor; each entry is
# a [name, mean recall] pair. The variable name (including its spelling) is kept
# because the next cell reads it.
model_ouput = [
    evaluate_model(label, estimator)
    for label, estimator in models.items()
]

In [None]:
# Tabulate the ordinal-encoding results, best recall first.
model_df = (
    pd.DataFrame(model_ouput, columns=['Model', 'Recall'])
    .sort_values('Recall', ascending=False)
)

In [None]:
# Display the recall table for the ordinal-encoding run.
model_df

Unnamed: 0,Model,Recall
7,XGBoost,0.821282
2,Decision Tree,0.799913
3,Random Forest,0.760825
6,Gradient Boosting,0.583902
5,Naive Bayes,0.512211
1,KNN,0.500548
4,SVM,0.42601
0,Logistic Regression,0.39358


## OneHot Encoding

In [None]:
# Preprocessing variant 2: standardise the numeric columns and one-hot encode
# the categorical ones, dropping the first level of each category. Columns
# listed in neither `num` nor `cat` are passed through unchanged.
preprocessor = ColumnTransformer(
    [
        ('num', StandardScaler(), num),
        ('cat', OneHotEncoder(drop='first'), cat),
    ],
    remainder='passthrough',
)

In [None]:
# Baseline pipeline for the one-hot variant: preprocessor followed by logistic regression.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', LogisticRegression()),
    ]
)

In [None]:
# 10-fold (shuffled, seeded) cross-validated recall of the one-hot baseline;
# the cell displays the mean over the folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
score = cross_val_score(
    pipeline,
    df.drop(columns='Churn'),
    df['Churn'],
    scoring='recall',
    cv=kfold,
)
score.mean()

np.float64(0.46885879217262943)

In [None]:
def evaluate_model_2(model_name, model, transformer=None, n_splits=10, random_state=42):
  """Cross-validate one classifier and report its mean recall.

  The classifier is placed behind a preprocessing step in a pipeline and scored
  with shuffled k-fold cross-validation on the notebook-level frame ``df``
  (features: every column except ``Churn``; target: ``Churn``).

  Parameters
  ----------
  model_name : str
      Label stored as the first element of the result.
  model : estimator
      Unfitted classifier used as the final pipeline step.
  transformer : transformer, optional
      Preprocessing step placed in front of ``model``. When omitted, whatever
      the notebook-level name ``preprocessor`` is bound to at call time is used,
      which is the original behaviour of this function.
  n_splits : int, default 10
      Number of cross-validation folds.
  random_state : int, default 42
      Seed for shuffling the rows before they are split into folds.

  Returns
  -------
  list
      ``[model_name, mean_recall]`` where ``mean_recall`` is the average of the
      per-fold recall scores.
  """
  # Resolve the default lazily so the global is looked up at call time.
  if transformer is None:
    transformer = preprocessor

  candidate = Pipeline([
        ('preprocessor', transformer),
        ('model', model)
  ])
  folds = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
  fold_recalls = cross_val_score(
      candidate,
      df.drop('Churn', axis=1),
      df['Churn'],
      cv=folds,
      scoring='recall',
  )

  return [model_name, fold_recalls.mean()]



In [None]:
# Fresh, unfitted candidate classifiers for the one-hot run, keyed by the label
# shown in the results table. Estimators that involve randomness are seeded so
# that re-running the notebook reproduces the same recall ranking.
models = {
    'Logistic Regression': LogisticRegression(),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(),
    'Naive Bayes': GaussianNB(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42)
}

In [None]:
# Evaluate every candidate with the one-hot preprocessor; each entry is a
# [name, mean recall] pair.
model_out = [
    evaluate_model_2(label, estimator)
    for label, estimator in models.items()
]


In [None]:
# Tabulate the one-hot results, best recall first, and display the table.
model_df = (
    pd.DataFrame(model_out, columns=['Model', 'Recall'])
    .sort_values('Recall', ascending=False)
)
model_df

Unnamed: 0,Model,Recall
7,XGBoost,0.796719
2,Decision Tree,0.789758
3,Random Forest,0.739868
5,Naive Bayes,0.71317
6,Gradient Boosting,0.608139
4,SVM,0.499927
1,KNN,0.489422
0,Logistic Regression,0.468859


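- **The best-performing models (XGBoost, Decision Tree and Random Forest) reach a higher recall with ordinal encoding, so ordinal encoding is used for the final pipeline. Note that Logistic Regression, SVM, Naive Bayes and Gradient Boosting score higher with one-hot encoding.**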

# final model pipeline with hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for the XGBoost step; the `model__` prefix routes each key to the
# pipeline step named 'model'.
param_grid = {
    # Number of boosting rounds.
    'model__n_estimators': [200, 250, 300],
    # Maximum depth of each tree.
    'model__max_depth': [10, 12, 15],
    # Weight applied to the positive class to counter class imbalance
    # (larger values for stronger imbalance).
    'model__scale_pos_weight': [5, 7, 10],
    # Minimum sum of instance weight required in a child node.
    'model__min_child_weight': [1, 3, 5],
    # L1 regularisation strength (encourages a sparser model).
    'model__reg_alpha': [0.1, 0.3, 0.5],
}

In [None]:
# Preprocessor for the final (exported) pipeline: standardise the numeric
# columns and map each categorical column to integer codes.
#
# A category that was not present when the encoder was fitted is mapped to -1
# instead of raising, so the pickled pipeline keeps working on new data that
# contains, for example, a new payment mode. Categories seen during fitting are
# encoded exactly as before.
#
# NOTE: this rebinds the name `preprocessor`, which previously held the one-hot
# transformer; evaluate_model_2 reads that name at call time and will therefore
# use this ordinal transformer if it is called after this cell has run.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), cat),
    ],
    remainder='passthrough',
)

In [None]:
# Pipeline that is tuned by the grid search: preprocessor followed by XGBoost.
# The step name 'model' is what the `model__*` keys of the parameter grid refer to.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', XGBClassifier()),
    ]
)

In [None]:
# Exhaustive search over every combination in `param_grid`, scored by recall
# with 5-fold shuffled, seeded cross-validation and run on all available cores.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='recall',
    cv=kfold,
    n_jobs=-1,
)

## Fit the grid search on the training split (`df` holds the 90% training rows)

In [None]:
# Run the grid search: features are every column except the target, labels are `Churn`.
search.fit(df.drop(columns='Churn'), df['Churn'])

In [None]:
# Keep the pipeline configured with the best parameter combination found by the
# search. The GridSearchCV call does not override `refit`, so with scikit-learn's
# default this estimator has already been refitted on the data passed to search.fit.
final_pipe = search.best_estimator_

In [None]:
# Display the parameter combination that achieved the best cross-validated recall.
search.best_params_

{'model__max_depth': 10,
 'model__min_child_weight': 3,
 'model__n_estimators': 200,
 'model__reg_alpha': 0.1,
 'model__scale_pos_weight': 10}

In [None]:
# Display the mean cross-validated recall of the best parameter combination.
search.best_score_

np.float64(0.848462516349057)

In [None]:
# Predict churn for the hold-out rows with the tuned pipeline and display the
# recall against the true labels.
y_pred = final_pipe.predict(df_test.drop(columns='Churn'))
recall_score(df_test['Churn'], y_pred)

0.9080459770114943

In [None]:
# F1 score of the hold-out predictions (harmonic mean of precision and recall).
f1_score(df_test['Churn'],y_pred)

0.8977272727272727

In [None]:
# Accuracy of the hold-out predictions (fraction of rows classified correctly).
accuracy_score(df_test['Churn'],y_pred)

0.9555555555555556

In [None]:
# Final training before export.
#
# `df` holds only the training split, and the grid search has already refitted
# `final_pipe` on exactly those rows, so fitting on `df` again would merely
# repeat that work. To train the exported model on all labelled data, the
# training rows and the hold-out rows are combined here.
#
# The hold-out metrics shown above were computed before this refit and are not
# affected by it. Do not re-run the evaluation cells after this cell without
# re-running the search first: `final_pipe` will then have seen the hold-out rows.
df_all = pd.concat([df, df_test])
final_pipe.fit(df_all.drop('Churn', axis=1), df_all['Churn'])

## exporting model

In [None]:
import pickle

# Serialise the fitted pipeline (preprocessing + model) to pipeline.pkl.
# Only unpickle this file from a trusted location: loading a pickle executes
# arbitrary code.
with open('pipeline.pkl', mode='wb') as fh:
    pickle.dump(final_pipe, fh)