In [33]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import  Pipeline,make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [9]:
# data ingestion
df = pd.read_csv(r"C:\Users\Sumit\Downloads\customer_data_model_refined (4) (2).csv", on_bad_lines='skip')
df.head()

Unnamed: 0,CustomerID,Churn,Tenure,PreferredLoginDevice,CityTier,WarehouseToHome,PreferredPaymentMode,Gender,HourSpendOnApp,NumberOfDeviceRegistered,...,SatisfactionScore,MaritalStatus,NumberOfAddress,Complain,OrderAmountHikeFromLastYear,CouponUsed,OrderCount,DaySinceLastOrder,CashbackAmount,Unnamed: 20
0,856,1,12,Desktop,0,0,No Payment,Female,0,1,...,4,0,0,1,0.0,0,0,0,0.0,
1,857,1,12,Desktop,0,0,No Payment,Male,0,1,...,1,0,0,1,0.0,0,0,0,0.0,
2,858,1,12,Desktop,0,0,No Payment,Male,0,1,...,1,0,0,0,0.0,0,0,0,0.0,
3,859,1,12,Desktop,0,0,No Payment,Male,0,1,...,1,0,0,0,0.0,0,0,0,0.0,
4,860,1,12,Desktop,0,0,No Payment,Female,0,1,...,1,0,0,1,0.0,0,0,0,0.0,


In [10]:
df = df.drop(columns='Unnamed: 20')
# df = df.drop(columns='CustomerID')
df = df.drop(columns='CouponUsed')

In [11]:
df['Churn'].value_counts()

Churn
1    1172
0    1163
Name: count, dtype: int64

In [12]:

df.isnull().sum()

CustomerID                     0
Churn                          0
Tenure                         0
PreferredLoginDevice           0
CityTier                       0
WarehouseToHome                0
PreferredPaymentMode           0
Gender                         0
HourSpendOnApp                 0
NumberOfDeviceRegistered       0
PreferredOrderCat              0
SatisfactionScore              0
MaritalStatus                  0
NumberOfAddress                0
Complain                       0
OrderAmountHikeFromLastYear    0
OrderCount                     0
DaySinceLastOrder              0
CashbackAmount                 0
dtype: int64

In [13]:
df.duplicated().sum()

np.int64(0)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2335 entries, 0 to 2334
Data columns (total 19 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   CustomerID                   2335 non-null   int64  
 1   Churn                        2335 non-null   int64  
 2   Tenure                       2335 non-null   int64  
 3   PreferredLoginDevice         2335 non-null   object 
 4   CityTier                     2335 non-null   int64  
 5   WarehouseToHome              2335 non-null   int64  
 6   PreferredPaymentMode         2335 non-null   object 
 7   Gender                       2335 non-null   object 
 8   HourSpendOnApp               2335 non-null   int64  
 9   NumberOfDeviceRegistered     2335 non-null   int64  
 10  PreferredOrderCat            2335 non-null   object 
 11  SatisfactionScore            2335 non-null   int64  
 12  MaritalStatus                2335 non-null   int64  
 13  NumberOfAddress   

In [15]:
# Separate numerical columns
numerical_cols = df.select_dtypes(include=['number']).columns

# Separate categorical columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns


In [16]:
print("Numerical Columns:", numerical_cols)

Numerical Columns: Index(['CustomerID', 'Churn', 'Tenure', 'CityTier', 'WarehouseToHome',
       'HourSpendOnApp', 'NumberOfDeviceRegistered', 'SatisfactionScore',
       'MaritalStatus', 'NumberOfAddress', 'Complain',
       'OrderAmountHikeFromLastYear', 'OrderCount', 'DaySinceLastOrder',
       'CashbackAmount'],
      dtype='object')


In [17]:
print("categorical Columns:", categorical_cols)

categorical Columns: Index(['PreferredLoginDevice', 'PreferredPaymentMode', 'Gender',
       'PreferredOrderCat'],
      dtype='object')


In [18]:
categories_to_replace=["Payment by installment (Hire Purchase by Roger's Capital)","Payment by installment (Hire Purchase by CIM Finance)"
    
]

df['PreferredPaymentMode'] = df['PreferredPaymentMode'].replace(categories_to_replace, 'Installment')


categories_to_replace=['Mips Juice',"Mips Bank Transfer",'Mips POP']

df['PreferredPaymentMode'] = df['PreferredPaymentMode'].replace(categories_to_replace, 'Mips')
df['PreferredPaymentMode'].value_counts()

PreferredPaymentMode
No Payment                         1146
Installment                         321
Pay Now                             274
Pay on Delivery                     250
Mips                                169
Bank Transfer Payment               141
No Payment Information Required      34
Name: count, dtype: int64

In [19]:
# List of categories to be replaced with 'Electronics'
categories_to_replace = [
    'Mobile', 'Smartphones', 'Computing', 'Wearables', 'TV & Audio', 
    'Television', 'Televisions', 'Portable Speaker', 'Computer Speakers', 
    'High Pressure Cleaners', 'Laptops', 'Desktop PCs, Laptops & Notebooks', 
    'Networking', 'Monitor', 'Mobile Phone Accessories', 'Tablets and Accessories', 
    'Headset', 'Keyboard & Mouse','Desktop Pcs  Laptops & Notebooks','Headphones','All in One Printer','Other Computer Accessories','Desktop Pcs  Laptops & Notebooks '
]

# Replace the specified categories with 'Electronics'
df['PreferredOrderCat'] = df['PreferredOrderCat'].replace(categories_to_replace, 'Electronics')



# List of categories to be replaced with 'Home Appliances'
categories_to_replace_home_appliances = [
    'Small Appliances', 'Home Appliances', 'Refrigerators', 'Microwaves', 
    'Wet & Dry Vacuum Cleaner', 'Stick Vacuum Cleaners', 'Air Purifiers', 
    'Fryers', 'Cookwares', 'Built-in Ovens', 'Gas Water Heaters', 'Kettles', 
    'Cooker Hoods', 'Hair Dryers', 'Air Conditioner', 'Tools', 
    'Hedge Trimmer', 'Hot Deals', 'Dry Vacuum Cleaners','Food Processor'
]

# Replace the specified categories with 'Home Appliances'
df['PreferredOrderCat'] = df['PreferredOrderCat'].replace(categories_to_replace_home_appliances, 'Home Appliances')


# List of categories to be replaced with 'Others'
categories_to_replace_others = [
    'Hair Dryers', 'Musical Instruments', 'Guitar', 
    'Furniture & Deco', 'Leisure & Transport', 
    'Food Processor', 'Showcase', 
    'No Category Found', 'Mini'
]

# Replace the specified categories with 'Others'
df['PreferredOrderCat'] = df['PreferredOrderCat'].replace(categories_to_replace_others, 'Others')

df['PreferredOrderCat'].value_counts()



PreferredOrderCat
No Order            1146
Home Appliances      530
Electronics          490
Others                96
Beauty & Fitness      73
Name: count, dtype: int64

In [20]:

tenure_data = df['NumberOfAddress']

# Quartiles
Q1 = tenure_data.quantile(0.25)
median = tenure_data.median()
Q3 = tenure_data.quantile(0.75)

# Interquartile range
IQR = Q3 - Q1

# Whiskers
lower_whisker = Q1 - 1.5 * IQR
upper_whisker = Q3 + 1.5 * IQR

# print('Lower wishker',lower_whisker)
# print('upper wishker',upper_range)

# Outliers
outliers = tenure_data[(tenure_data < lower_whisker) | (tenure_data > upper_whisker)]

# Summary of the box plot statistics
box_plot_info = {
    "First Quartile (Q1)": Q1,
    "Median (Q2)": median,
    "Third Quartile (Q3)": Q3,
    "Interquartile Range (IQR)": IQR,
    'max_value':tenure_data.max(),
    'min_value':tenure_data.min(),
    'Lower wishker':lower_whisker,
    'upper wishker': upper_whisker,

    "Lower Whisker (Min without outliers)": max(tenure_data.min(), lower_whisker),
    "Upper Whisker (Max without outliers)": min(tenure_data.max(), upper_whisker),
    "Outliers": outliers.values,
    "Total Outliers": len(outliers)
}

for k,v in box_plot_info.items():
    print(f'{k} : {v}')


First Quartile (Q1) : 0.0
Median (Q2) : 1.0
Third Quartile (Q3) : 1.0
Interquartile Range (IQR) : 1.0
max_value : 19
min_value : 0
Lower wishker : -1.5
upper wishker : 2.5
Lower Whisker (Min without outliers) : 0
Upper Whisker (Max without outliers) : 2.5
Outliers : [ 3  3  3  4 10  6  4 19  4  3  3  3  4 11  3  3 15  3  3  3  3  6  3  6
  3  4  3  3  3  3  3  3 13  4  5  3  5  3  4 17  5  4 18  6  3  8  3  3
  4  7  3  3  8  3  3  3  3  4  3  3  3]
Total Outliers : 61


In [21]:
df['NumberOfAddress']=np.where(
    df['NumberOfAddress']>5,
    5,
    np.where(df['NumberOfAddress']< lower_whisker,
             lower_whisker,df['NumberOfAddress']))

In [22]:
tenure_data = df['OrderAmountHikeFromLastYear']

# Quartiles
Q1 = tenure_data.quantile(0.25)
median = tenure_data.median()
Q3 = tenure_data.quantile(0.75)

# Interquartile range
IQR = Q3 - Q1

# Whiskers
lower_whisker = Q1 - 1.5 * IQR
upper_whisker = Q3 + 1.5 * IQR

# print('Lower wishker',lower_whisker)
# print('upper wishker',upper_range)

# Outliers
outliers = tenure_data[(tenure_data < lower_whisker) | (tenure_data > upper_whisker)]

# Summary of the box plot statistics
box_plot_info = {
    "First Quartile (Q1)": Q1,
    "Median (Q2)": median,
    "Third Quartile (Q3)": Q3,
    "Interquartile Range (IQR)": IQR,
    'max_value':tenure_data.max(),
    'min_value':tenure_data.min(),
    'Lower wishker':lower_whisker,
    'upper wishker': upper_whisker,

    "Lower Whisker (Min without outliers)": max(tenure_data.min(), lower_whisker),
    "Upper Whisker (Max without outliers)": min(tenure_data.max(), upper_whisker),
    "Outliers": outliers.values,
    "Total Outliers": len(outliers)
}


for k,v in box_plot_info.items():
    print(f'{k} : {v}')


First Quartile (Q1) : 0.0
Median (Q2) : 0.0
Third Quartile (Q3) : 0.0
Interquartile Range (IQR) : 0.0
max_value : 88900.0
min_value : -100.0
Lower wishker : 0.0
upper wishker : 0.0
Lower Whisker (Min without outliers) : 0.0
Upper Whisker (Max without outliers) : 0.0
Outliers : [-1.00000000e+02 -1.00000000e+02 -1.00000000e+02 -1.00000000e+02
 -1.00000000e+02 -9.25878430e+01 -1.00000000e+02 -1.00000000e+02
 -1.00000000e+02 -1.00000000e+02 -1.00000000e+02 -1.00000000e+02
 -1.00000000e+02 -1.00000000e+02 -1.00000000e+02 -1.00000000e+02
  4.61123941e+01 -6.65674972e+01  5.83819850e+01  6.40915594e+01
 -6.04384134e+01  1.20423779e+02  1.00502513e+02  1.00000000e+00
  8.89000000e+04 -9.08962597e+01 -7.41482966e+01  2.73556231e+01
 -2.59451446e+01  5.82329317e+02 -4.94362533e+01  2.33592881e+02
  3.07692308e+02 -5.55050045e+01 -3.60824742e+01  5.05050505e+01
  6.51629073e+02  5.94795539e+01 -7.64331210e+01  2.11834320e+02
 -6.22837370e+01  1.20918367e+02  1.36697248e+03 -6.29470672e+01
  2.343

In [23]:
df['OrderAmountHikeFromLastYear']=np.where(
    df['OrderAmountHikeFromLastYear']>upper_whisker,
    upper_whisker,
    np.where(df['OrderAmountHikeFromLastYear']< lower_whisker,
             lower_whisker,df['OrderAmountHikeFromLastYear']))

In [24]:
tenure_data = df['DaySinceLastOrder']

# Quartiles
Q1 = tenure_data.quantile(0.25)
median = tenure_data.median()
Q3 = tenure_data.quantile(0.75)

# Interquartile range
IQR = Q3 - Q1

# Whiskers
lower_whisker = Q1 - 1.5 * IQR
upper_whisker = Q3 + 1.5 * IQR

# print('Lower wishker',lower_whisker)
# print('upper wishker',upper_range)

# Outliers
outliers = tenure_data[(tenure_data < lower_whisker) | (tenure_data > upper_whisker)]

# Summary of the box plot statistics
box_plot_info = {
    "First Quartile (Q1)": Q1,
    "Median (Q2)": median,
    "Third Quartile (Q3)": Q3,
    "Interquartile Range (IQR)": IQR,
    'max_value':tenure_data.max(),
    'min_value':tenure_data.min(),
    'Lower wishker':lower_whisker,
    'upper wishker': upper_whisker,

    "Lower Whisker (Min without outliers)": max(tenure_data.min(), lower_whisker),
    "Upper Whisker (Max without outliers)": min(tenure_data.max(), upper_whisker),
    "Outliers": outliers.values,
    "Total Outliers": len(outliers)
}


for k,v in box_plot_info.items():
    print(f'{k} : {v}')

First Quartile (Q1) : 0.0
Median (Q2) : 29.0
Third Quartile (Q3) : 113.0
Interquartile Range (IQR) : 113.0
max_value : 330
min_value : 0
Lower wishker : -169.5
upper wishker : 282.5
Lower Whisker (Min without outliers) : 0
Upper Whisker (Max without outliers) : 282.5
Outliers : [306 330 297 297 321 316 329 299 292 311 310 292]
Total Outliers : 12


In [25]:
df['DaySinceLastOrder']=np.where(
    df['DaySinceLastOrder']>upper_whisker,
    upper_whisker,
    np.where(df['DaySinceLastOrder']< lower_whisker,
             lower_whisker,df['DaySinceLastOrder']))

In [26]:
df.head()

Unnamed: 0,CustomerID,Churn,Tenure,PreferredLoginDevice,CityTier,WarehouseToHome,PreferredPaymentMode,Gender,HourSpendOnApp,NumberOfDeviceRegistered,PreferredOrderCat,SatisfactionScore,MaritalStatus,NumberOfAddress,Complain,OrderAmountHikeFromLastYear,OrderCount,DaySinceLastOrder,CashbackAmount
0,856,1,12,Desktop,0,0,No Payment,Female,0,1,No Order,4,0,0.0,1,0.0,0,0.0,0.0
1,857,1,12,Desktop,0,0,No Payment,Male,0,1,No Order,1,0,0.0,1,0.0,0,0.0,0.0
2,858,1,12,Desktop,0,0,No Payment,Male,0,1,No Order,1,0,0.0,0,0.0,0,0.0,0.0
3,859,1,12,Desktop,0,0,No Payment,Male,0,1,No Order,1,0,0.0,0,0.0,0,0.0,0.0
4,860,1,12,Desktop,0,0,No Payment,Female,0,1,No Order,1,0,0.0,1,0.0,0,0.0,0.0


In [27]:
df = df.drop(columns='CustomerID')

# Split the independent and dependent variable

In [30]:
x= df.drop(columns='Churn')
y= df['Churn']

# Split the data into Training and Testing

In [31]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2, random_state=41)

# Exprement tracking

In [32]:
import mlflow
import dagshub

mlflow.set_tracking_uri('https://dagshub.com/Sumitdatascience/Churn_model_development.mlflow')
dagshub.init(repo_owner='Sumitdatascience', repo_name='Churn_model_development', mlflow=True)

mlflow.set_experiment("Decision Tree Baseline")

2024/10/01 14:59:38 INFO mlflow.tracking.fluent: Experiment with name 'Decision Tree Baseline' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/14cc64f64e6a49488e5cb1138a47c269', creation_time=1727774978577, experiment_id='0', last_update_time=1727774978577, lifecycle_stage='active', name='Decision Tree Baseline', tags={}>

In [34]:
trf1 = ColumnTransformer([
    ('Categorical_column_OHE', OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'), [1, 4, 5, 8])
], remainder='passthrough')

In [35]:
trf2 = ColumnTransformer([
    ('scale',StandardScaler(),slice(0,24))
])

In [36]:
decisiontreeclassifier = DecisionTreeClassifier()

In [37]:
model = make_pipeline(trf1,trf2,decisiontreeclassifier)

In [38]:
with mlflow.start_run():
    
    # Log preprocessing parameters
    mlflow.log_param("test_size", 0.2)
    
    # Model building and training

    model = make_pipeline(trf1,trf2,decisiontreeclassifier)
    model.fit(x_train, y_train)
    
    # Log model parameters
    mlflow.log_param("model", "Decision Tree")
    
    # Model evaluation
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Log evaluation metrics
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)
    
    # Log model
    mlflow.sklearn.log_model(model, "model")

    # Save and log the notebook
    import os
    notebook_path = "Exp1_baseline_model.ipynb"
    os.system(f"jupyter nbconvert --to notebook --execute --inplace {notebook_path}")
    mlflow.log_artifact(notebook_path)
    
    # Print the results for verification
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")



Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


2024/10/01 15:30:59 INFO mlflow.tracking._tracking_service.client: 🏃 View run caring-finch-470 at: https://dagshub.com/Sumitdatascience/Churn_model_development.mlflow/#/experiments/0/runs/1db4fd4fc1ec42a99a49cfeea76f8d54.
2024/10/01 15:30:59 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Sumitdatascience/Churn_model_development.mlflow/#/experiments/0.
