In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# URL of the dataset
url = 'https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv'

# Load the dataset
data = pd.read_csv(url)

data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
data.drop(columns=["customerID"],inplace=True)

# Saperate Feature and Label 

In [5]:
feature=data.drop(columns=["Churn"])
label=data['Churn']

In [6]:
dic={
    "Yes":1,
    "No":0
}
label=label.map(dic)
label

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: Churn, Length: 7043, dtype: int64

# Train Test Split

In [7]:
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score

In [8]:
x_train,x_test,y_train,y_test=train_test_split(feature,label,test_size=0.2,random_state=43)

# Saperate Numerical and Categorical Columns

In [9]:
num_col=feature.select_dtypes("number").columns
cat_col=feature.select_dtypes("object").columns

# Build Pipeline

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.impute import SimpleImputer

In [11]:
num_pipe=Pipeline(steps=[
    ('impute',SimpleImputer(strategy="median")),
    ("Scale",StandardScaler())
])
num_pipe

In [12]:
cat_pipe=Pipeline(steps=[
    ("Impute",SimpleImputer(strategy="most_frequent")),
    ("Encode",OneHotEncoder(drop='first',handle_unknown="ignore",sparse=False))
])
cat_pipe

# Build Transformer

In [13]:
process=ColumnTransformer(transformers=[
    ("Num_trans",num_pipe,num_col),
    ("Cat_trans",cat_pipe,cat_col)
],remainder='passthrough',n_jobs=-1,verbose=True)
process

# Final Pipeline

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

In [15]:
pipe=Pipeline(steps=[
    ('process',process),
    ("Model",LogisticRegression(verbose=1,class_weight='balanced',solver='liblinear',n_jobs=-1))
#     ("model",DecisionTreeClassifier(max_depth=10,              
#                                     min_samples_split=10,     
#                                     min_samples_leaf=4,       
#                                     max_features='sqrt',      
#                                     min_impurity_decrease=0.01,
#                                     random_state=43,          
#                                     class_weight='balanced')),
#     ("Random",RandomForestClassifier())
])

In [16]:
pipe.fit(x_train,y_train)

[LibLinear]

In [17]:
pre=pipe.predict(x_test)

# Evulation Matrix

In [18]:
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,confusion_matrix

In [19]:
def evulation(actual,pre):
    score=accuracy_score(actual,pre)
    precession=precision_score(actual,pre)
    recall=recall_score(actual,pre)
    f_score=f1_score(actual,pre)
    matrix=confusion_matrix(actual,pre)
    
    return {
        "Accuracy":score,
        "Precession":precession,
        "recall":recall,
        "f_score":f_score,
        "Confussion_Matrix":matrix
    }

In [20]:
evulation(y_test,pre)

{'Accuracy': 0.7821149751596878,
 'Precession': 0.5609284332688588,
 'recall': 0.7837837837837838,
 'f_score': 0.6538895152198423,
 'Confussion_Matrix': array([[812, 227],
        [ 80, 290]], dtype=int64)}

In [21]:
evulation(y_test,pre)['Confussion_Matrix']

array([[812, 227],
       [ 80, 290]], dtype=int64)

# Cross Validation

In [22]:
train_score=cross_val_score(pipe,x_train,y_train,cv=5,scoring="accuracy")
test_score=cross_val_score(pipe,x_test,y_test,cv=5,scoring="accuracy")

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

In [23]:
print("Training Score is: ",train_score.mean())
print("Testing Score is: ",test_score.mean())

Training Score is:  0.7603852476197831
Testing Score is:  0.7799727417278716


# Experiments with Voting Classifier

In [24]:
from sklearn.ensemble import VotingClassifier

In [25]:
# Define the VotingClassifier
voting_clf = VotingClassifier(estimators=[
    ('LR', LogisticRegression(verbose=1,class_weight='balanced',solver='liblinear',n_jobs=-1)),
    ('DT', DecisionTreeClassifier(max_depth=10,              
                                    min_samples_split=10,     
                                    min_samples_leaf=4,       
                                    max_features='sqrt',      
                                    min_impurity_decrease=0.01,
                                    random_state=43,          
                                    class_weight='balanced')),
    ('RF', RandomForestClassifier())
], voting='soft')  # Use 'hard' for majority voting
voting_clf

In [27]:
pipe=Pipeline(steps=[
    ('process',process),
    ("Models",voting_clf)
#     ("model",DecisionTreeClassifier(max_depth=10,              
#                                     min_samples_split=10,     
#                                     min_samples_leaf=4,       
#                                     max_features='sqrt',      
#                                     min_impurity_decrease=0.01,
#                                     random_state=43,          
#                                     class_weight='balanced')),
#     ("Random",RandomForestClassifier())
])

In [30]:
pipe.fit(x_train,y_train)

[LibLinear]

In [31]:
pre=pipe.predict(x_test)
pre

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [32]:
evulation(y_test,pre)

{'Accuracy': 0.8062455642299503,
 'Precession': 0.6221662468513854,
 'recall': 0.6675675675675675,
 'f_score': 0.6440677966101694,
 'Confussion_Matrix': array([[889, 150],
        [123, 247]], dtype=int64)}

In [33]:
evulation(y_test,pre)['Confussion_Matrix']

array([[889, 150],
       [123, 247]], dtype=int64)

In [35]:
train_score=cross_val_score(pipe,x_train,y_train,cv=5,scoring="accuracy").mean()
test_score=cross_val_score(pipe,x_test,y_test,cv=5,scoring="accuracy").mean()

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

In [36]:
train_score

0.7909142775188691

In [37]:
test_score

0.8005502132707403