In [1]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Data processing and analysis
import numpy as np
import pandas as pd
import math
import re

# Data visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Classification algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis , QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
import lightgbm as lgb
import xgboost as xgb


# Data preprocessing :
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, scale, LabelEncoder, OneHotEncoder


# Modeling helper functions
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV , KFold , cross_val_score



# Classification metrices
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report, precision_score,recall_score,f1_score

### Read Dataset

In [2]:
customer_data = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [3]:
customer_data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
customer_data.tail()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.8,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.2,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.6,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.4,306.6,Yes
7042,3186-AJIEK,Male,0,No,No,66,Yes,No,Fiber optic,Yes,...,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),105.65,6844.5,No


In [5]:
customer_data.shape

(7043, 21)

In [6]:
customer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [7]:
customer_data.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [8]:
customer_data.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

### Converting the Total Charges column to numeric data type

In [9]:
customer_data['TotalCharges'] = customer_data['TotalCharges'].apply(pd.to_numeric, errors='coerce')

In [10]:
customer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [11]:
customer_data.isnull().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

### Filling null values with zero(0)

In [12]:
customer_data.fillna(0, inplace=True)

In [13]:
customer_data.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

### Converting Churn Column to binary values

In [14]:
label = {'No': 0 ,"Yes" : 1}
customer_data['Churn'].replace(label, inplace= True)

In [17]:
customer_data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


In [18]:
customer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


### Splitting data into train and test

In [19]:
x = customer_data.drop(['Churn', 'customerID'], axis=1)
y = customer_data['Churn']

In [20]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

### Classifying variables into categorical and numerical variables

In [21]:
categorical_columns_train = x_train.select_dtypes(include=['object']).columns

print(categorical_columns_train)

Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod'],
      dtype='object')


In [22]:
categorical_columns_test = x_test.select_dtypes(include=['object']).columns

print(categorical_columns_test)

Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod'],
      dtype='object')


In [23]:
numerical_columns_train = x_train.select_dtypes(include=np.number).columns

print(numerical_columns_train)

#df._get_numeric_data().columns
#dfnew = df.select_dtypes(include=np.number)

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges'], dtype='object')


In [24]:
numerical_columns_test = x_test.select_dtypes(include=np.number).columns

print(numerical_columns_test)

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges'], dtype='object')


### Feature engineering

In [25]:
train_cols = x_train.columns
test_cols = x_test.columns

### Scaling numerical values using standard scaler

In [26]:
scaler = StandardScaler()

In [27]:
x_train_scaled = scaler.fit_transform(
           x_train[['SeniorCitizen', 'tenure', 'MonthlyCharges',
                  'TotalCharges']])
x_test_scaled = scaler.transform(
           x_test[['SeniorCitizen', 'tenure', 'MonthlyCharges',
                  'TotalCharges']])

In [28]:
x_train_scaled

array([[-0.43947526, -0.82588395, -1.49752994, -0.89094683],
       [-0.43947526,  0.39596106,  0.30299568,  0.38969269],
       [-0.43947526,  1.57707791,  0.01232042,  1.06094513],
       ...,
       [-0.43947526, -0.17423328,  1.35939264,  0.30980164],
       [-0.43947526, -1.23316563, -0.3447949 , -0.95459932],
       [-0.43947526, -0.66297128,  0.11530251, -0.51300329]])

In [29]:
x_test_scaled

array([[-0.43947526,  0.3552329 ,  0.50065486,  0.46038335],
       [ 2.27544096,  1.37343708,  1.24976654,  1.85085376],
       [-0.43947526, -0.82588395, -0.65706318, -0.77357032],
       ...,
       [-0.43947526,  1.57707791, -0.35974391,  0.71097779],
       [ 2.27544096,  1.33270891,  0.69665304,  1.50516669],
       [-0.43947526, -0.62224312,  1.30956259, -0.235602  ]])

### Converting output of scaled numerical values to a dataframe

In [30]:
x_train_scaled_df = pd.DataFrame(x_train_scaled)
x_test_scaled_df = pd.DataFrame(x_test_scaled)

In [31]:
x_train_scaled_df.shape

(5634, 4)

In [32]:
x_test_scaled_df.shape

(1409, 4)

### Renaming the columns

In [33]:
x_train_scaled_df.columns = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
x_test_scaled_df.columns = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']

In [34]:
x_test_scaled_df

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
0,-0.439475,0.355233,0.500655,0.460383
1,2.275441,1.373437,1.249767,1.850854
2,-0.439475,-0.825884,-0.657063,-0.773570
3,-0.439475,-1.110981,-0.471031,-0.894653
4,-0.439475,-0.907340,0.037235,-0.713691
...,...,...,...,...
1404,2.275441,-0.296418,1.367698,0.166721
1405,-0.439475,-0.703699,-1.500852,-0.858624
1406,-0.439475,1.577078,-0.359744,0.710978
1407,2.275441,1.332709,0.696653,1.505167


### Encoding categorical variables using one-hot encoder

In [35]:
columns_to_encode_train = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod']

columns_to_encode_test = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod']

# Create a OneHotEncoder object
ohe = OneHotEncoder()

# Fit and transform the columns using the OneHotEncoder
encoded_columns_train = ohe.fit_transform(x_train[columns_to_encode_train])
encoded_columns_test = ohe.transform(x_test[columns_to_encode_test])

# Create a new DataFrame with the encoded columns
encoded_df_train = pd.DataFrame(encoded_columns_train.toarray(), columns=ohe.get_feature_names_out(columns_to_encode_train))
encoded_df_test = pd.DataFrame(encoded_columns_test.toarray(), columns=ohe.get_feature_names_out(columns_to_encode_test))


# Drop the original columns from the original DataFrame
x_train.drop(columns_to_encode_train, axis=1, inplace=True)
x_test.drop(columns_to_encode_test, axis=1, inplace=True)

# Concatenate the original DataFrame with the new encoded DataFrame

X_train = pd.concat([x_train_scaled_df, encoded_df_train], axis=1)

# Concatenate the original DataFrame with the new encoded DataFrame

X_test = pd.concat([x_test_scaled_df, encoded_df_test], axis=1)    

In [36]:
X_train

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,-0.439475,-0.825884,-1.497530,-0.890947,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
1,-0.439475,0.395961,0.302996,0.389693,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,-0.439475,1.577078,0.012320,1.060945,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
3,-0.439475,1.577078,0.686687,1.775397,0.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,-0.439475,-0.092777,0.186726,-0.102671,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5629,2.275441,-0.948068,1.186648,-0.599602,0.0,1.0,1.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5630,-0.439475,1.129068,-1.489225,-0.479886,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
5631,-0.439475,-0.174233,1.359393,0.309802,0.0,1.0,1.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5632,-0.439475,-1.233166,-0.344795,-0.954599,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [37]:
X_train.isnull().sum()

SeniorCitizen                              0
tenure                                     0
MonthlyCharges                             0
TotalCharges                               0
gender_Female                              0
gender_Male                                0
Partner_No                                 0
Partner_Yes                                0
Dependents_No                              0
Dependents_Yes                             0
PhoneService_No                            0
PhoneService_Yes                           0
MultipleLines_No                           0
MultipleLines_No phone service             0
MultipleLines_Yes                          0
InternetService_DSL                        0
InternetService_Fiber optic                0
InternetService_No                         0
OnlineSecurity_No                          0
OnlineSecurity_No internet service         0
OnlineSecurity_Yes                         0
OnlineBackup_No                            0
OnlineBack

In [38]:
X_train.shape

(5634, 45)

In [39]:
y_train.shape

(5634,)

In [40]:
names = ["Extra Trees", "Random Forest", "LightGBM", "XGBoost"]

In [41]:
classifiers = [
    ExtraTreesClassifier(max_depth=3, min_samples_split=2, min_samples_leaf=1, random_state=1),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, random_state=1),
    lgb.LGBMClassifier(verbosity=-1, random_state=1),
    xgb.XGBClassifier(random_state=1)
   ]

In [42]:
accuracy_scores = []

# iterate over classifiers and predict accuracy
for name, clf in zip(names, classifiers):
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    score = round(score, 4)
    accuracy_scores.append(score)
    print(name ,' : ' , score)

Extra Trees  :  0.7942
Random Forest  :  0.7821
LightGBM  :  0.8034
XGBoost  :  0.7935


In [43]:
classifiers_performance = pd.DataFrame({"Classifiers": names, "Accuracy Scores": accuracy_scores})
classifiers_performance

Unnamed: 0,Classifiers,Accuracy Scores
0,Extra Trees,0.7942
1,Random Forest,0.7821
2,LightGBM,0.8034
3,XGBoost,0.7935


In [44]:
classifiers1 = [
    ExtraTreesClassifier(random_state=1),
    RandomForestClassifier(random_state=1),
    lgb.LGBMClassifier(random_state=1),
    xgb.XGBClassifier(random_state=1)
   ]

In [45]:
accuracy_scores1 = []

# iterate over classifiers and predict accuracy
for name, clf in zip(names, classifiers1):
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    score = round(score, 4)
    accuracy_scores.append(score)
    print(name ,' : ' , score)

Extra Trees  :  0.7679
Random Forest  :  0.7942
[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002662 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 669
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785
LightGBM  :  0.8034
XGBoost  :  0.7935
