# ***Libraries Used***

In [1]:
# Data-frame manipulation and visualization
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Pre-Processing and Data Wrangling Libraries
from sklearn.preprocessing import ( StandardScaler, MinMaxScaler, 
                                    LabelEncoder, OneHotEncoder, OrdinalEncoder
                                    )

from sklearn.model_selection import (   train_test_split, 
                                        StratifiedKFold,
                                        cross_val_score, validation_curve, learning_curve, LearningCurveDisplay
                                    )


# Loading and first impressions of dataset

In [3]:
# Load the testing split of the customer-churn dataset (path is relative to the working directory).
test_data = pd.read_csv("customer_churn_dataset-testing-master.csv")

In [4]:
# Preview the first ten rows of the testing split.
test_data.head(n=10)

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,1,22,Female,25,14,4,27,Basic,Monthly,598,9,1
1,2,41,Female,28,28,7,13,Standard,Monthly,584,20,0
2,3,47,Male,27,10,2,29,Premium,Annual,757,21,0
3,4,35,Male,9,12,5,17,Premium,Quarterly,232,18,0
4,5,53,Female,58,24,9,2,Standard,Annual,533,18,0
5,6,30,Male,41,14,10,10,Premium,Monthly,500,29,0
6,7,47,Female,37,15,9,28,Basic,Quarterly,574,14,1
7,8,54,Female,36,11,0,18,Standard,Monthly,323,16,0
8,9,36,Male,20,5,10,8,Basic,Monthly,687,8,0
9,10,65,Male,8,4,2,23,Basic,Annual,995,10,0


In [5]:
# Load the training split of the customer-churn dataset (path is relative to the working directory).
df = pd.read_csv("customer_churn_dataset-training-master.csv")

In [6]:
# Preview the first ten rows of the training split.
df.head(n=10)

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,2.0,30.0,Female,39.0,14.0,5.0,18.0,Standard,Annual,932.0,17.0,1.0
1,3.0,65.0,Female,49.0,1.0,10.0,8.0,Basic,Monthly,557.0,6.0,1.0
2,4.0,55.0,Female,14.0,4.0,6.0,18.0,Basic,Quarterly,185.0,3.0,1.0
3,5.0,58.0,Male,38.0,21.0,7.0,7.0,Standard,Monthly,396.0,29.0,1.0
4,6.0,23.0,Male,32.0,20.0,5.0,8.0,Basic,Monthly,617.0,20.0,1.0
5,8.0,51.0,Male,33.0,25.0,9.0,26.0,Premium,Annual,129.0,8.0,1.0
6,9.0,58.0,Female,49.0,12.0,3.0,16.0,Standard,Quarterly,821.0,24.0,1.0
7,10.0,55.0,Female,37.0,8.0,4.0,15.0,Premium,Annual,445.0,30.0,1.0
8,11.0,39.0,Male,12.0,5.0,7.0,4.0,Standard,Quarterly,969.0,13.0,1.0
9,12.0,64.0,Female,3.0,25.0,2.0,11.0,Standard,Quarterly,415.0,29.0,1.0


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,CustomerID,Age,Tenure,Usage Frequency,Support Calls,Payment Delay,Total Spend,Last Interaction,Churn
count,440832.0,440832.0,440832.0,440832.0,440832.0,440832.0,440832.0,440832.0,440832.0
mean,225398.667955,39.373153,31.256336,15.807494,3.604437,12.965722,631.616223,14.480868,0.567107
std,129531.91855,12.442369,17.255727,8.586242,3.070218,8.258063,240.803001,8.596208,0.495477
min,2.0,18.0,1.0,1.0,0.0,0.0,100.0,1.0,0.0
25%,113621.75,29.0,16.0,9.0,1.0,6.0,480.0,7.0,0.0
50%,226125.5,39.0,32.0,16.0,3.0,12.0,661.0,14.0,1.0
75%,337739.25,48.0,46.0,23.0,6.0,19.0,830.0,22.0,1.0
max,449999.0,65.0,60.0,30.0,10.0,30.0,1000.0,30.0,1.0


In [8]:
# Print the column dtypes, non-null counts and memory footprint of the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440833 entries, 0 to 440832
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   CustomerID         440832 non-null  float64
 1   Age                440832 non-null  float64
 2   Gender             440832 non-null  object 
 3   Tenure             440832 non-null  float64
 4   Usage Frequency    440832 non-null  float64
 5   Support Calls      440832 non-null  float64
 6   Payment Delay      440832 non-null  float64
 7   Subscription Type  440832 non-null  object 
 8   Contract Length    440832 non-null  object 
 9   Total Spend        440832 non-null  float64
 10  Last Interaction   440832 non-null  float64
 11  Churn              440832 non-null  float64
dtypes: float64(9), object(3)
memory usage: 40.4+ MB


# EDA, Visualization and Preprocessing

In [9]:
# Count the missing values in each column.
df.isna().sum()

CustomerID           1
Age                  1
Gender               1
Tenure               1
Usage Frequency      1
Support Calls        1
Payment Delay        1
Subscription Type    1
Contract Length      1
Total Spend          1
Last Interaction     1
Churn                1
dtype: int64

**It seems that there is only a single row of null values in our dataset. Let's verify this.**

In [10]:
# Select every row that contains at least one missing value.
null_row_mask = df.isna().any(axis="columns")
df.loc[null_row_mask]

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
199295,,,,,,,,,,,,


**This shows that the row at index 199295 is entirely null. The most appropriate way to handle it is to drop it: it is a single row in a dataset of more than 440,000 rows, so removing it will have no meaningful effect on the analysis.**

In [11]:
# Drop every row that has a missing value in any column.
# The index is not reset, so the original row labels are preserved.
df = df.dropna(axis="index", how="any")

In [12]:
# Re-count the missing values per column after dropping the null row.
df.isna().sum()

CustomerID           0
Age                  0
Gender               0
Tenure               0
Usage Frequency      0
Support Calls        0
Payment Delay        0
Subscription Type    0
Contract Length      0
Total Spend          0
Last Interaction     0
Churn                0
dtype: int64

In [13]:
# (rows, columns) of the frame after dropping rows with missing values.
df.shape

(440832, 12)

Checking for any duplicates

In [14]:
# Count the rows that exactly repeat an earlier row (the first occurrence is not counted).
df.duplicated(keep="first").sum()

0

No duplicates present

In [15]:
# Names of the columns stored with the generic `object` dtype (the string-valued columns here).
categorical_cols = df.select_dtypes(include=["object"]).columns

In [16]:
# Display the selected categorical column names.
categorical_cols

Index(['Gender', 'Subscription Type', 'Contract Length'], dtype='object')

In [17]:
# Summary (count, unique, top, freq) of the categorical columns only.
df.loc[:, categorical_cols].describe()

Unnamed: 0,Gender,Subscription Type,Contract Length
count,440832,440832,440832
unique,2,3,3
top,Male,Standard,Annual
freq,250252,149128,177198


# We would like to choose the most suitable encoder for each categorical column in our dataset.

In [18]:
# Print the frequency of every category in each categorical column,
# leaving a blank line after each table.
for col in categorical_cols:
    print(df[col].value_counts(), end="\n\n")

Gender
Male      250252
Female    190580
Name: count, dtype: int64

Subscription Type
Standard    149128
Premium     148678
Basic       143026
Name: count, dtype: int64

Contract Length
Annual       177198
Quarterly    176530
Monthly       87104
Name: count, dtype: int64



For the 'Gender' column, the most obvious choice is a One-Hot Encoder, as there is no ordinal relationship between the two genders, Male and Female.
However, Subscription Type and Contract Length do appear to have an ordinal relationship between their classes.
For instance, a Premium subscription is a higher (more expensive) tier than Basic, and an Annual contract is longer than a Monthly one, so an Ordinal Encoder is used for these two columns.

In [19]:
# One-hot encoder configured to return a dense pandas DataFrame
# (dense via sparse_output=False, DataFrame via set_output).
ohe = OneHotEncoder(sparse_output=False)
ohe = ohe.set_output(transform='pandas')

# Explicit category orders, lowest to highest; the position in each list
# becomes the encoded value (0, 1, 2).
subscription_order = ['Basic', 'Standard', 'Premium']
contract_order = ['Monthly', 'Quarterly', 'Annual']

# NOTE(review): the name `ord` shadows Python's built-in ord(); it is kept
# here because a later cell refers to the encoder by this name.
# The order of the lists must match the order of the columns passed to it.
ord = OrdinalEncoder(categories=[subscription_order, contract_order])

In [20]:
# One-hot encode 'Gender'; a one-column DataFrame (not a Series) is passed
# because the encoder expects 2-D input.
gender_encoded = ohe.fit_transform(df.loc[:, ['Gender']])
gender_encoded

Unnamed: 0,Gender_Female,Gender_Male
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,0.0,1.0
4,0.0,1.0
...,...,...
440828,0.0,1.0
440829,1.0,0.0
440830,0.0,1.0
440831,0.0,1.0


In [21]:
# Swap the raw 'Gender' column for its one-hot columns (rows are aligned on the index).
# This overwrites `df` and removes 'Gender', so the cell should be run only once per kernel session.
df = pd.concat([df.drop(columns=['Gender']), gender_encoded], axis="columns")
df.head(n=10)

Unnamed: 0,CustomerID,Age,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn,Gender_Female,Gender_Male
0,2.0,30.0,39.0,14.0,5.0,18.0,Standard,Annual,932.0,17.0,1.0,1.0,0.0
1,3.0,65.0,49.0,1.0,10.0,8.0,Basic,Monthly,557.0,6.0,1.0,1.0,0.0
2,4.0,55.0,14.0,4.0,6.0,18.0,Basic,Quarterly,185.0,3.0,1.0,1.0,0.0
3,5.0,58.0,38.0,21.0,7.0,7.0,Standard,Monthly,396.0,29.0,1.0,0.0,1.0
4,6.0,23.0,32.0,20.0,5.0,8.0,Basic,Monthly,617.0,20.0,1.0,0.0,1.0
5,8.0,51.0,33.0,25.0,9.0,26.0,Premium,Annual,129.0,8.0,1.0,0.0,1.0
6,9.0,58.0,49.0,12.0,3.0,16.0,Standard,Quarterly,821.0,24.0,1.0,1.0,0.0
7,10.0,55.0,37.0,8.0,4.0,15.0,Premium,Annual,445.0,30.0,1.0,1.0,0.0
8,11.0,39.0,12.0,5.0,7.0,4.0,Standard,Quarterly,969.0,13.0,1.0,0.0,1.0
9,12.0,64.0,3.0,25.0,2.0,11.0,Standard,Quarterly,415.0,29.0,1.0,1.0,0.0


In [22]:
# Replace the two ordered categorical columns with their ordinal codes.
# The column order here must match the order of the category lists given to the encoder.
ordinal_cols = ['Subscription Type', 'Contract Length']
df[ordinal_cols] = ord.fit_transform(df[ordinal_cols])
df.head(n=10)

Unnamed: 0,CustomerID,Age,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn,Gender_Female,Gender_Male
0,2.0,30.0,39.0,14.0,5.0,18.0,1.0,2.0,932.0,17.0,1.0,1.0,0.0
1,3.0,65.0,49.0,1.0,10.0,8.0,0.0,0.0,557.0,6.0,1.0,1.0,0.0
2,4.0,55.0,14.0,4.0,6.0,18.0,0.0,1.0,185.0,3.0,1.0,1.0,0.0
3,5.0,58.0,38.0,21.0,7.0,7.0,1.0,0.0,396.0,29.0,1.0,0.0,1.0
4,6.0,23.0,32.0,20.0,5.0,8.0,0.0,0.0,617.0,20.0,1.0,0.0,1.0
5,8.0,51.0,33.0,25.0,9.0,26.0,2.0,2.0,129.0,8.0,1.0,0.0,1.0
6,9.0,58.0,49.0,12.0,3.0,16.0,1.0,1.0,821.0,24.0,1.0,1.0,0.0
7,10.0,55.0,37.0,8.0,4.0,15.0,2.0,2.0,445.0,30.0,1.0,1.0,0.0
8,11.0,39.0,12.0,5.0,7.0,4.0,1.0,1.0,969.0,13.0,1.0,0.0,1.0
9,12.0,64.0,3.0,25.0,2.0,11.0,1.0,1.0,415.0,29.0,1.0,1.0,0.0
