# Objective :

"Predict behavior to retain customers. You can analyze all relevant customer data and develop focused customer retention programs."

# Step 1 :  Import Library and Dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the data in
employee = pd.read_csv(r"churn.csv")

In [3]:
employee.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,No,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,No,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,No,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,No,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer,42.3,1840.75,No
4,9237-HQITU,Female,No,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


# Step 2 : Data Pre-Processing

### Univariate Analysis

In [4]:
employee.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   object 
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [5]:
employee.describe()

Unnamed: 0,tenure,MonthlyCharges
count,7043.0,7043.0
mean,32.371149,64.761692
std,24.559481,30.090047
min,0.0,18.25
25%,9.0,35.5
50%,29.0,70.35
75%,55.0,89.85
max,72.0,118.75


### Removing Irrelavent Variable

In [6]:
employee = employee.drop(['customerID'],axis=1)
employee.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [7]:
#Replacing spaces with null values in total charges column
employee['TotalCharges'] =employee["TotalCharges"].replace(" ",np.nan).astype(float) 
# string cannot be convert float direclty 

In [8]:
employee.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   object 
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


### Checking Missing Value

In [9]:
# Do we have NA's in data
employee.isna().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [10]:
employee.TotalCharges.fillna(employee.TotalCharges.mean(),inplace=True) # one column at a time bb

In [11]:
# Do we have NA's in data
employee.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [12]:
employee.describe() # describe works for number by default 

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
count,7043.0,7043.0,7043.0
mean,32.371149,64.761692,2283.300441
std,24.559481,30.090047,2265.000258
min,0.0,18.25,18.8
25%,9.0,35.5,402.225
50%,29.0,70.35,1400.55
75%,55.0,89.85,3786.6
max,72.0,118.75,8684.8


In [13]:
employee.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,No,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,No,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,No,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer,42.3,1840.75,No
4,Female,No,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [14]:
employee.OnlineSecurity.value_counts(ascending=False)

No                     3498
Yes                    2019
No internet service    1526
Name: OnlineSecurity, dtype: int64

In [15]:
3498+1526

5024

In [16]:
employee.OnlineSecurity=employee.OnlineSecurity.replace({'No internet service' : 'No'})

In [17]:
employee.OnlineSecurity.value_counts()

No     5024
Yes    2019
Name: OnlineSecurity, dtype: int64

In [18]:
employee.OnlineBackup=employee.OnlineBackup.replace({'No internet service' : 'No'})
employee.DeviceProtection=employee.DeviceProtection.replace({'No internet service' : 'No'})
employee.TechSupport=employee.TechSupport.replace({'No internet service' : 'No'})
employee.StreamingTV=employee.StreamingTV.replace({'No internet service' : 'No'})
employee.StreamingMovies=employee.StreamingMovies.replace({'No internet service' : 'No'})
employee.MultipleLines=employee.MultipleLines.replace({'No phone service' : 'No'})

# Churn Rate Analysis

In [None]:
import plotly.express as px

fig = px.pie(employee,names='Churn',color='Churn',
             color_discrete_map={'Yes':'red',
                                 'No':'green'})
fig.show()

# Trend Analysis

In [20]:
employee.Churn.value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [21]:
1869/7043

0.2653698707936959

In [22]:
Churn_Customer= employee[employee["Churn"] == "Yes"]

In [23]:
fig = px.sunburst(Churn_Customer, path=["SeniorCitizen",'InternetService',
                                        "Contract", "PaymentMethod"])
fig.show()

#### Conclusion :- Customer Trend Analysis 

* Customer who leave the service are 
* Not senior Citizen , use Fiber Optic & Month-to-Month & Electronic Check 

### Taking subset data of Number 

In [24]:
employee.select_dtypes(include=[np.number]).columns.tolist()

['tenure', 'MonthlyCharges', 'TotalCharges']

In [25]:
# #Employee Numeric columns
employee_num = employee[employee.select_dtypes(include=[np.number]).columns.tolist()]
employee_num.head(3)

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,1,29.85,29.85
1,34,56.95,1889.5
2,2,53.85,108.15


### Taking subset data of Category 

In [26]:
employee_dummies = employee[employee.select_dtypes(include=['object']).columns.tolist()]
employee_dummies.head(3)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn
0,Female,No,Yes,No,No,No,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,No
1,Male,No,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,No
2,Male,No,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,Yes


### Converting Quality Variable to Number

In [27]:
from sklearn.preprocessing import LabelEncoder
employee_dummies=employee_dummies.apply(LabelEncoder().fit_transform)
employee_dummies.head(3)
# label in ascending order

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn
0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,2,0
1,1,0,0,0,1,0,0,1,0,1,0,0,0,1,0,3,0
2,1,0,0,0,1,0,0,1,1,0,0,0,0,0,1,3,1


### Combine to Dataset

In [28]:
employee_combined = pd.concat([employee_num, employee_dummies],axis=1)

In [29]:
employee_combined.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn
0,1,29.85,29.85,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,2,0
1,34,56.95,1889.5,1,0,0,0,1,0,0,1,0,1,0,0,0,1,0,3,0
2,2,53.85,108.15,1,0,0,0,1,0,0,1,1,0,0,0,0,0,1,3,1
3,45,42.3,1840.75,1,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0
4,2,70.7,151.65,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,2,1


# Step 3: Data Partition

In [30]:
#Dividing data into train and test dataset
from sklearn.model_selection import train_test_split
#from random import seed

#seed(20)
x = employee_combined.drop(['Churn'],axis=1)
y = employee_combined[['Churn']]

# Train test split

X_train, X_test, y_train, y_test =train_test_split(x,y,test_size=0.3,random_state=231)

# Step 4: Model Building

In [31]:
#Import Tree Classifier model
from sklearn import tree

dt = tree.DecisionTreeClassifier()  # by default it use Gini index for split
#Train the model using the training sets
dt.fit(X_train,y_train)  # Model = dt

DecisionTreeClassifier()

# Step 5: Plotting the Tree 

# Ploting Tree
import graphviz 
from six import StringIO
#from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
import pydot

In [32]:
train=pd.concat([y_train,X_train],axis=1)
train.head()

Unnamed: 0,Churn,tenure,MonthlyCharges,TotalCharges,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod
1583,0,6,48.95,273.25,0,0,1,1,1,1,0,0,0,0,0,0,0,0,1,1
6791,1,19,39.65,733.35,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,2
4812,1,9,66.25,620.55,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0,3
6282,0,4,19.55,68.8,1,0,1,1,1,0,2,0,0,0,0,0,0,2,0,1
2479,0,56,75.85,4261.2,1,0,1,1,1,1,0,1,1,0,1,0,1,2,1,1


In [33]:
features = list(train.columns[1:])
features

['tenure',
 'MonthlyCharges',
 'TotalCharges',
 'gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod']

In [34]:
from sklearn import tree
import matplotlib.pyplot as plt

churn=['No', 'Yes']  # array
fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (5,4), dpi=300)
tree.plot_tree(dt,  # Model 
               feature_names = features,  # column name
               class_names=churn, # Yes , No
               filled = True, # colour
              node_ids=True, # node number
              fontsize=2); # 
#fig.savefig('imagename.png')

Error in callback <function flush_figures at 0x000002583FD16C10> (for post_execute):


KeyboardInterrupt: 

### Strategy & Prediction
* Contract = Month-to-Month & Monthly Charges > 68 & Tenure <= 15.5 

# Step 6 : Predictions on Train Dataset

In [35]:
train.head()

Unnamed: 0,Churn,tenure,MonthlyCharges,TotalCharges,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod
1583,0,6,48.95,273.25,0,0,1,1,1,1,0,0,0,0,0,0,0,0,1,1
6791,1,19,39.65,733.35,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,2
4812,1,9,66.25,620.55,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0,3
6282,0,4,19.55,68.8,1,0,1,1,1,0,2,0,0,0,0,0,0,2,0,1
2479,0,56,75.85,4261.2,1,0,1,1,1,1,0,1,1,0,1,0,1,2,1,1


In [36]:
train['Predicted']=dt.predict(X_train)  # MODEL = dt
train.head()

Unnamed: 0,Churn,tenure,MonthlyCharges,TotalCharges,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,...,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Predicted
1583,0,6,48.95,273.25,0,0,1,1,1,1,...,0,0,0,0,0,0,0,1,1,0
6791,1,19,39.65,733.35,1,0,0,0,0,0,...,0,0,1,0,1,0,0,1,2,1
4812,1,9,66.25,620.55,0,0,0,0,1,0,...,0,1,0,1,1,0,0,0,3,1
6282,0,4,19.55,68.8,1,0,1,1,1,0,...,0,0,0,0,0,0,2,0,1,0
2479,0,56,75.85,4261.2,1,0,1,1,1,1,...,1,1,0,1,0,1,2,1,1,0


# Step 7 : Model Performance Metrics

In [37]:
from sklearn.metrics import confusion_matrix  
matrix = confusion_matrix(train['Predicted'],train['Churn'])
print(matrix)

[[3616    7]
 [   1 1306]]


#### Final accuracy of Model Before Pruning 

In [38]:
Accuracy_Train=((3616+1306)/(4930)*100)
print(Accuracy_Train)   # overfit or high accuracy

99.83772819472617


#### Final accuracy of Model after Pruning 

In [39]:
Accuracy_Train=((3225+717)/(4930)*100)
print(Accuracy_Train)

79.95943204868155


# Step 8 : Predictions on Test Dataset

In [40]:
test=pd.concat([X_test,y_test],axis=1)
test.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn
1358,10,70.15,735.5,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,3,0
5471,29,74.2,1993.25,0,0,0,0,1,1,1,0,0,0,0,0,0,0,1,2,0
2693,72,19.3,1414.8,1,0,0,0,1,0,2,0,0,0,0,0,0,2,0,0,0
1077,41,114.5,4527.45,0,0,0,0,1,1,1,1,1,1,1,1,1,0,1,0,1
6663,1,54.65,54.65,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,2,0


In [41]:
test['Predicted']=dt.predict(X_test)
test.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,...,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn,Predicted
1358,10,70.15,735.5,1,0,0,0,1,0,1,...,0,0,0,0,0,0,0,3,0,0
5471,29,74.2,1993.25,0,0,0,0,1,1,1,...,0,0,0,0,0,0,1,2,0,1
2693,72,19.3,1414.8,1,0,0,0,1,0,2,...,0,0,0,0,0,2,0,0,0,0
1077,41,114.5,4527.45,0,0,0,0,1,1,1,...,1,1,1,1,1,0,1,0,1,1
6663,1,54.65,54.65,0,0,0,0,1,0,0,...,0,0,0,1,0,0,1,2,0,1


# Step 9 : Model Performance Metrics on Test data 

In [42]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(test['Predicted'],test['Churn'])
print(confusion_matrix)

[[1258  267]
 [ 299  289]]


In [43]:
Accuracy_test=((1372+294)/(2113)*100)
Accuracy_test

78.84524372929485

# Sensitivity & Specificity

#### Train

In [44]:
from sklearn.metrics import classification_report
print(classification_report(train['Churn'], train['Predicted']))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3617
           1       1.00      0.99      1.00      1313

    accuracy                           1.00      4930
   macro avg       1.00      1.00      1.00      4930
weighted avg       1.00      1.00      1.00      4930



#### Test

In [45]:
from sklearn.metrics import classification_report
print(classification_report(test['Churn'], test['Predicted']))

              precision    recall  f1-score   support

           0       0.82      0.81      0.82      1557
           1       0.49      0.52      0.51       556

    accuracy                           0.73      2113
   macro avg       0.66      0.66      0.66      2113
weighted avg       0.74      0.73      0.73      2113



### Model Improvement by Pruning Method ( Cut Tree)

In [46]:
#Import Tree Classifier model
from sklearn import tree

dt = tree.DecisionTreeClassifier(criterion='gini',  #splitter
                                 min_samples_leaf=50, ## child
                                 min_samples_split=200, #parent 
                                 max_depth=3)  #branches
#Train the model using the training sets
dt.fit(X_train,y_train)

DecisionTreeClassifier(max_depth=3, min_samples_leaf=50, min_samples_split=200)

### After Running this go back then run from  Plotting the Graph

# Grid Search Method 

In [47]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
params = { 'min_samples_split': [100, 200,270], ## Parent 
           'min_samples_leaf': [50,70,80], ## Child or Leave 
           'max_depth':[3,4,6]}
grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42),
                              params, 
                              verbose=1, 
                              cv=10)
grid_search_cv.fit(X_train, y_train)

Fitting 10 folds for each of 27 candidates, totalling 270 fits


GridSearchCV(cv=10, estimator=DecisionTreeClassifier(random_state=42),
             param_grid={'max_depth': [3, 4, 6],
                         'min_samples_leaf': [50, 70, 80],
                         'min_samples_split': [100, 200, 270]},
             verbose=1)

100,50,3  # first Combination
100,50,4  # Second Combination
100,50,6 # Third Combination
100,70,3 
100,70,4m
100,70,6
150,30,4
150,30,5
150,30,6
150,50,4
150,50,5
150,50,6

In [48]:
grid_search_cv.best_estimator_

DecisionTreeClassifier(max_depth=6, min_samples_leaf=50, min_samples_split=200,
                       random_state=42)