# Customer Churn Prediction

# Load Data

In [1]:
import pandas as pd

In [2]:
df=pd.read_excel("C:\\Users\\shree kalika\\Downloads\\customer_churn_large_dataset.xlsx")
df.head()

Unnamed: 0,CustomerID,Name,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,1,Customer_1,63,Male,Los Angeles,17,73.36,236,0
1,2,Customer_2,62,Female,New York,1,48.76,172,0
2,3,Customer_3,24,Female,Los Angeles,5,85.47,460,0
3,4,Customer_4,36,Female,Miami,3,97.94,297,1
4,5,Customer_5,46,Female,Miami,19,58.14,266,0


# EDA

In [3]:
df.corr()[['Churn']]

Unnamed: 0,Churn
CustomerID,-0.004586
Age,0.001559
Subscription_Length_Months,0.002328
Monthly_Bill,-0.000211
Total_Usage_GB,-0.002842
Churn,1.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   CustomerID                  100000 non-null  int64  
 1   Name                        100000 non-null  object 
 2   Age                         100000 non-null  int64  
 3   Gender                      100000 non-null  object 
 4   Location                    100000 non-null  object 
 5   Subscription_Length_Months  100000 non-null  int64  
 6   Monthly_Bill                100000 non-null  float64
 7   Total_Usage_GB              100000 non-null  int64  
 8   Churn                       100000 non-null  int64  
dtypes: float64(1), int64(5), object(3)
memory usage: 6.9+ MB


In [5]:
df.isna().sum()

CustomerID                    0
Name                          0
Age                           0
Gender                        0
Location                      0
Subscription_Length_Months    0
Monthly_Bill                  0
Total_Usage_GB                0
Churn                         0
dtype: int64

# Defining X and Y

In [6]:
x=df.drop(['Churn'],axis=1)
y=df['Churn']

In [7]:
cat=[]
con=[]
for i in x.columns:
    if x[i].dtypes=='object':
        cat.append(i)
    else:
        con.append(i)

In [8]:
cat

['Name', 'Gender', 'Location']

In [9]:
con

['CustomerID',
 'Age',
 'Subscription_Length_Months',
 'Monthly_Bill',
 'Total_Usage_GB']

# Creating Pipeline

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [11]:
num_pipe=Pipeline(steps=[('impute',SimpleImputer(strategy='median')),('scaler',StandardScaler())])
cat_pipe=Pipeline(steps=[('impute',SimpleImputer(strategy="constant")),('encode',OrdinalEncoder())])
pre1=ColumnTransformer([('num_pipe',num_pipe,con),('cat_pipe',cat_pipe,cat)])
x1=pre1.fit_transform(x)
x1

array([[-1.73203349e+00,  1.24167039e+00,  6.51114987e-01, ...,
         0.00000000e+00,  1.00000000e+00,  2.00000000e+00],
       [-1.73199885e+00,  1.17622625e+00, -1.65887854e+00, ...,
         1.11120000e+04,  0.00000000e+00,  4.00000000e+00],
       [-1.73196421e+00, -1.31065114e+00, -1.08138015e+00, ...,
         2.22230000e+04,  0.00000000e+00,  2.00000000e+00],
       ...,
       [ 1.73196421e+00,  1.30711454e+00,  6.51114987e-01, ...,
         9.99980000e+04,  1.00000000e+00,  0.00000000e+00],
       [ 1.73199885e+00,  4.56340692e-01,  1.08423877e+00, ...,
         9.99990000e+04,  0.00000000e+00,  4.00000000e+00],
       [ 1.73203349e+00, -1.11431871e+00,  9.39864177e-01, ...,
         5.00000000e+00,  0.00000000e+00,  2.00000000e+00]])

In [12]:
col=pre1.get_feature_names_out()
col

array(['num_pipe__CustomerID', 'num_pipe__Age',
       'num_pipe__Subscription_Length_Months', 'num_pipe__Monthly_Bill',
       'num_pipe__Total_Usage_GB', 'cat_pipe__Name', 'cat_pipe__Gender',
       'cat_pipe__Location'], dtype=object)

In [13]:
x2=pd.DataFrame(x1,columns=col)
x2

Unnamed: 0,num_pipe__CustomerID,num_pipe__Age,num_pipe__Subscription_Length_Months,num_pipe__Monthly_Bill,num_pipe__Total_Usage_GB,cat_pipe__Name,cat_pipe__Gender,cat_pipe__Location
0,-1.732033,1.241670,0.651115,0.410606,-0.294289,0.0,1.0,2.0
1,-1.731999,1.176226,-1.658879,-0.805374,-0.784852,11112.0,0.0,4.0
2,-1.731964,-1.310651,-1.081380,1.009204,1.422681,22223.0,0.0,2.0
3,-1.731930,-0.525321,-1.370129,1.625597,0.173279,33334.0,0.0,3.0
4,-1.731895,0.129120,0.939864,-0.341720,-0.064338,44445.0,0.0,3.0
...,...,...,...,...,...,...,...,...
99995,1.731895,-0.721654,1.517363,-0.490504,-0.370939,99996.0,1.0,1.0
99996,1.731930,1.176226,0.939864,-0.168220,0.587191,99997.0,0.0,4.0
99997,1.731964,1.307115,0.651115,1.535140,-0.179313,99998.0,1.0,0.0
99998,1.731999,0.456341,1.084239,-0.781153,1.223390,99999.0,0.0,4.0


# Splitting The Data in Trainig And Testing

In [14]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x2,y,test_size=0.2,random_state=21)

In [15]:
x_train

Unnamed: 0,num_pipe__CustomerID,num_pipe__Age,num_pipe__Subscription_Length_Months,num_pipe__Monthly_Bill,num_pipe__Total_Usage_GB,cat_pipe__Name,cat_pipe__Gender,cat_pipe__Location
12309,-1.305637,-1.310651,0.362366,1.415025,-1.252419,2571.0,1.0,0.0
91699,1.444513,0.914450,-1.370129,-0.290313,-0.156318,90781.0,0.0,4.0
90457,1.401489,1.176226,0.073617,-0.426245,-0.815512,89400.0,1.0,3.0
78861,0.999792,-0.852542,1.517363,1.528714,-0.355609,76515.0,0.0,2.0
29219,-0.719858,-1.637872,-1.081380,-0.472215,1.652632,21359.0,0.0,2.0
...,...,...,...,...,...,...,...,...
81968,1.107421,-1.441539,-0.648256,-1.632834,-1.183434,79967.0,1.0,1.0
8964,-1.421511,-0.198101,-0.359507,0.939014,0.901458,88501.0,0.0,3.0
71480,0.744106,1.241670,-0.792631,0.066078,0.311249,68315.0,0.0,1.0
70863,0.722733,-1.637872,0.073617,-0.750507,1.430346,67629.0,0.0,1.0


# Model Building All Types In Classification

# 1.Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression
la=LogisticRegression()
la.fit(x_train,y_train)

# Training Data Evaluation

In [17]:
from warnings import filterwarnings
filterwarnings("ignore")

In [18]:
y_pred_train=la.predict(x_train)

from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
print('*'*50)
l_acc=accuracy_score(y_pred_train,y_train)
print('Accuracy:',l_acc)
print('*'*50)
l_cnf=confusion_matrix(y_pred_train,y_train)
print('Confusion matrix:\n',l_cnf)
print('*'*50)
l_clf=classification_report(y_pred_train,y_train)
print('Classification Report:\n',l_clf)
print('*'*50)

**************************************************
Accuracy: 0.50315
**************************************************
Confusion matrix:
 [[40252 39748]
 [    0     0]]
**************************************************
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.50      0.67     80000
           1       0.00      0.00      0.00         0

    accuracy                           0.50     80000
   macro avg       0.50      0.25      0.33     80000
weighted avg       1.00      0.50      0.67     80000

**************************************************


# Testing Data Evaluation

In [19]:
y_pred=la.predict(x_test)
print('*'*50)
lts_acc=accuracy_score(y_pred,y_test)
print('Accuracy:',lts_acc)
print('*'*50)
lts_cnf=confusion_matrix(y_pred,y_test)
print('confusion_matrix:\n',lts_cnf)
print('*'*50)
lts_clf=classification_report(y_pred,y_test)
print('Classification_report:\n',lts_clf)
print('*'*50)

**************************************************
Accuracy: 0.4984
**************************************************
confusion_matrix:
 [[ 9968 10031]
 [    1     0]]
**************************************************
Classification_report:
               precision    recall  f1-score   support

           0       1.00      0.50      0.67     19999
           1       0.00      0.00      0.00         1

    accuracy                           0.50     20000
   macro avg       0.50      0.25      0.33     20000
weighted avg       1.00      0.50      0.67     20000

**************************************************


# 2.Decision Tree

In [20]:
from sklearn.tree import DecisionTreeClassifier
dtr=DecisionTreeClassifier(random_state=23)
dtr.fit(x_train,y_train)

# Training Data Eva

In [21]:
y_pred_train=dtr.predict(x_train)

print('*'*50)
d_acc=accuracy_score(y_pred_train,y_train)
print('Accuracy:',d_acc)
print('*'*50)
d_cnf=confusion_matrix(y_pred_train,y_train)
print('Confusion matrix:\n',d_cnf)
print('*'*50)
d_clf=classification_report(y_pred_train,y_train)
print('Classification Report:\n',d_clf)
print('*'*50)

**************************************************
Accuracy: 1.0
**************************************************
Confusion matrix:
 [[40252     0]
 [    0 39748]]
**************************************************
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     40252
           1       1.00      1.00      1.00     39748

    accuracy                           1.00     80000
   macro avg       1.00      1.00      1.00     80000
weighted avg       1.00      1.00      1.00     80000

**************************************************


# Testing Data Evaluation

In [22]:
y_pred=dtr.predict(x_test)
print('*'*50)
dts_acc=accuracy_score(y_pred,y_test)
print('Accuracy:',dts_acc)
print('*'*50)
dts_cnf=confusion_matrix(y_pred,y_test)
print('confusion_matrix:\n',dts_cnf)
print('*'*50)
dts_clf=classification_report(y_pred,y_test)
print('Classification_report:\n',dts_clf)
print('*'*50)

**************************************************
Accuracy: 0.5023
**************************************************
confusion_matrix:
 [[5071 5056]
 [4898 4975]]
**************************************************
Classification_report:
               precision    recall  f1-score   support

           0       0.51      0.50      0.50     10127
           1       0.50      0.50      0.50      9873

    accuracy                           0.50     20000
   macro avg       0.50      0.50      0.50     20000
weighted avg       0.50      0.50      0.50     20000

**************************************************


# 3.Using Random Forest

In [23]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(random_state=23)
rf.fit(x_train,y_train)

# Training Data Evaluation

In [24]:
y_pred_train=rf.predict(x_train)

print('*'*50)
r_acc=accuracy_score(y_pred_train,y_train)
print('Accuracy:',r_acc)
print('*'*50)
r_cnf=confusion_matrix(y_pred_train,y_train)
print('Confusion matrix:\n',r_cnf)
print('*'*50)
r_clf=classification_report(y_pred_train,y_train)
print('Classification Report:\n',r_clf)
print('*'*50)

**************************************************
Accuracy: 1.0
**************************************************
Confusion matrix:
 [[40252     0]
 [    0 39748]]
**************************************************
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     40252
           1       1.00      1.00      1.00     39748

    accuracy                           1.00     80000
   macro avg       1.00      1.00      1.00     80000
weighted avg       1.00      1.00      1.00     80000

**************************************************


# Testing Data Evaluation

In [25]:
y_pred=rf.predict(x_test)
print('*'*50)
rts_acc=accuracy_score(y_pred,y_test)
print('Accuracy:',rts_acc)
print('*'*50)
rts_cnf=confusion_matrix(y_pred,y_test)
print('confusion_matrix:\n',rts_cnf)
print('*'*50)
rts_clf=classification_report(y_pred,y_test)
print('Classification_report:\n',rts_clf)
print('*'*50)

**************************************************
Accuracy: 0.4998
**************************************************
confusion_matrix:
 [[5385 5420]
 [4584 4611]]
**************************************************
Classification_report:
               precision    recall  f1-score   support

           0       0.54      0.50      0.52     10805
           1       0.46      0.50      0.48      9195

    accuracy                           0.50     20000
   macro avg       0.50      0.50      0.50     20000
weighted avg       0.50      0.50      0.50     20000

**************************************************


# 4.Using Adaboost

In [26]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
dt=DecisionTreeClassifier(max_depth=2)
ab=AdaBoostClassifier(dt,random_state=23)
ab.fit(x_train,y_train)

# Training Data

In [27]:
y_pred_train=ab.predict(x_train)

print('*'*50)
a_acc=accuracy_score(y_pred_train,y_train)
print('Accuracy:',a_acc)
print('*'*50)
a_cnf=confusion_matrix(y_pred_train,y_train)
print('Confusion matrix:\n',a_cnf)
print('*'*50)
a_clf=classification_report(y_pred_train,y_train)
print('Classification Report:\n',a_clf)
print('*'*50)

**************************************************
Accuracy: 0.5257375
**************************************************
Confusion matrix:
 [[23730 21419]
 [16522 18329]]
**************************************************
Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.53      0.56     45149
           1       0.46      0.53      0.49     34851

    accuracy                           0.53     80000
   macro avg       0.53      0.53      0.52     80000
weighted avg       0.53      0.53      0.53     80000

**************************************************


# Testing Data

In [28]:
y_pred=ab.predict(x_test)
print('*'*50)
ats_acc=accuracy_score(y_pred,y_test)
print('Accuracy:',ats_acc)
print('*'*50)
ats_cnf=confusion_matrix(y_pred,y_test)
print('confusion_matrix:\n',ats_cnf)
print('*'*50)
ats_clf=classification_report(y_pred,y_test)
print('Classification_report:\n',ats_clf)
print('*'*50)

**************************************************
Accuracy: 0.4975
**************************************************
confusion_matrix:
 [[5584 5665]
 [4385 4366]]
**************************************************
Classification_report:
               precision    recall  f1-score   support

           0       0.56      0.50      0.53     11249
           1       0.44      0.50      0.46      8751

    accuracy                           0.50     20000
   macro avg       0.50      0.50      0.50     20000
weighted avg       0.51      0.50      0.50     20000

**************************************************


# 5.Using KNN

In [29]:
from sklearn.neighbors import KNeighborsClassifier
kn=KNeighborsClassifier()
kn.fit(x_train,y_train)

# Trainig Data

In [30]:
y_pred_train=kn.predict(x_train)
print('*'*50)
k_acc=accuracy_score(y_pred_train,y_train)
print('Accuracy:',k_acc)
print('*'*50)
k_cnf=confusion_matrix(y_pred_train,y_train)
print('COnfusion_matrix:\n',k_cnf)
print('*'*50)
k_clf=classification_report(y_pred_train,y_train)
print('Classification_report:\n',k_clf)
print('*'*50)

**************************************************
Accuracy: 0.687325
**************************************************
COnfusion_matrix:
 [[27999 12761]
 [12253 26987]]
**************************************************
Classification_report:
               precision    recall  f1-score   support

           0       0.70      0.69      0.69     40760
           1       0.68      0.69      0.68     39240

    accuracy                           0.69     80000
   macro avg       0.69      0.69      0.69     80000
weighted avg       0.69      0.69      0.69     80000

**************************************************


# Testing Data

In [31]:
y_pred=kn.predict(x_test)
print('*'*50)
kts_acc=accuracy_score(y_pred,y_test)
print('Accuracy:',kts_acc)
print('*'*50)
kts_cnf=confusion_matrix(y_pred,y_test)
print('Confusion_matrix:\n',kts_cnf)
print('*'*50)
kts_clf=classification_report(y_pred,y_test)
print('Classification_report:\n',kts_clf)
print('*'*50)

**************************************************
Accuracy: 0.5047
**************************************************
Confusion_matrix:
 [[5037 4974]
 [4932 5057]]
**************************************************
Classification_report:
               precision    recall  f1-score   support

           0       0.51      0.50      0.50     10011
           1       0.50      0.51      0.51      9989

    accuracy                           0.50     20000
   macro avg       0.50      0.50      0.50     20000
weighted avg       0.50      0.50      0.50     20000

**************************************************


# Tunning The Best Model

#Decision Tree Training Accuracy: 100% ---- To Decrease


#Decision Tree Testing Accuracy : 50% ----- To Increase

In [32]:
from sklearn.model_selection import RandomizedSearchCV
grid={
    'min_samples_leaf':range(1,30),
    
    'criterion':['gini'],
    'max_depth':range(1,50,5),
    'min_samples_split':range(1,200)
    
}

In [33]:
grid

{'min_samples_leaf': range(1, 30),
 'criterion': ['gini'],
 'max_depth': range(1, 50, 5),
 'min_samples_split': range(1, 200)}

In [34]:
dtr1=RandomizedSearchCV(dtr,param_distributions=grid,cv=10)
dtr1.fit(x_train,y_train)

In [35]:
dtr1.best_params_

{'min_samples_split': 81,
 'min_samples_leaf': 2,
 'max_depth': 1,
 'criterion': 'gini'}

In [38]:
dtr2=DecisionTreeClassifier(random_state=121,min_samples_leaf=2,
 min_samples_split= 81,
 max_depth= 1,
 criterion= 'gini')

In [39]:
dtr2.fit(x_train,y_train)

# Trainig Data

In [43]:
y_pred_train=dtr2.predict(x_train)
print('*'*50)
dtr2_acc=accuracy_score(y_pred_train,y_train)
print('Accuracy:',dtr2_acc)
print('*'*50)
dtr2_cnf=confusion_matrix(y_pred_train,y_train)
print('COnfusion_matrix:\n',dtr2_cnf)
print('*'*50)
dtr2_clf=classification_report(y_pred_train,y_train)
print('Classification_report:\n',dtr2_clf)
print('*'*50)

**************************************************
Accuracy: 0.5056625
**************************************************
COnfusion_matrix:
 [[35530 34825]
 [ 4722  4923]]
**************************************************
Classification_report:
               precision    recall  f1-score   support

           0       0.88      0.51      0.64     70355
           1       0.12      0.51      0.20      9645

    accuracy                           0.51     80000
   macro avg       0.50      0.51      0.42     80000
weighted avg       0.79      0.51      0.59     80000

**************************************************


# Testing Data

In [42]:
y_pred=dtr2.predict(x_test)
print('*'*50)
dtr12_acc=accuracy_score(y_pred,y_test)
print('Accuracy:',dtr12_acc)
print('*'*50)
dtr12_cnf=confusion_matrix(y_pred,y_test)
print('Confusion_matrix:\n',dtr12_cnf)
print('*'*50)
dtr12_clf=classification_report(y_pred,y_test)
print('Classification_report:\n',dtr12_clf)
print('*'*50)

**************************************************
Accuracy: 0.4992
**************************************************
Confusion_matrix:
 [[8783 8830]
 [1186 1201]]
**************************************************
Classification_report:
               precision    recall  f1-score   support

           0       0.88      0.50      0.64     17613
           1       0.12      0.50      0.19      2387

    accuracy                           0.50     20000
   macro avg       0.50      0.50      0.42     20000
weighted avg       0.79      0.50      0.58     20000

**************************************************
