## Import required libraries

In [116]:
# Dash dependencies import
import dash
import dash_core_components as dcc
import dash_html_components as html
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
px.defaults.template = "ggplot2"
# End Dash dependencies import

# Data preprocessing 
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
# ML Algorithm
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
# Model evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score,confusion_matrix,roc_curve,roc_auc_score
# Save model
import os
import joblib

## Load data

In [117]:
df=pd.read_csv("../datasets/telco-customer-churn.csv")

## Explore data

Peak first 5 records

In [118]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0.0,Yes,No,1.0,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0.0,No,No,34.0,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0.0,No,No,2.0,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0.0,No,No,45.0,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0.0,No,No,2.0,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [119]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [120]:
df.head(1).to_json('sample_data.json')

Check records and features

In [121]:
df.shape

(2244, 21)

Convert Data Types

In [122]:
df.dtypes

customerID           object
gender               object
SeniorCitizen       float64
Partner              object
Dependents           object
tenure              float64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [123]:
df['TotalCharges']=pd.to_numeric(df['TotalCharges'], errors='coerce')

In [124]:
df.dtypes

customerID           object
gender               object
SeniorCitizen       float64
Partner              object
Dependents           object
tenure              float64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

Total Customers

In [125]:
df.shape[0]

2244

Customers Churned

In [126]:
df[df['Churn']=='Yes']['customerID'].count()

589

Remained Customers

In [127]:
df[df['Churn']=='No']['customerID'].count()

1654

Churned Customer Revenue

In [128]:
df[df['Churn']=='Yes']['TotalCharges'].sum()

897135.8

Remained Customer Revenue

In [129]:
df[df['Churn']=='No']['TotalCharges'].sum()

4231231.65

Statistical Summary

In [130]:
data_summary_df=pd.DataFrame(df.describe())
data_summary_df.reset_index(level=0, inplace=True)
data_summary_df=data_summary_df.drop(columns='SeniorCitizen')
data_summary_df.columns=['Metric','Tenure','MonthlyCharges','TotalCharges']
data_summary_df

Unnamed: 0,Metric,Tenure,MonthlyCharges,TotalCharges
0,count,2243.0,2243.0,2238.0
1,mean,32.398128,65.365916,2291.495733
2,std,24.584209,29.837483,2251.815762
3,min,0.0,18.4,18.8
4,25%,9.0,39.55,413.525
5,50%,29.0,71.15,1415.425
6,75%,56.0,89.85,3870.2875
7,max,72.0,118.65,8564.75


In [131]:
data_summary = go.Figure(data=[go.Table(header=dict(values=list(data_summary_df.columns),fill_color='paleturquoise',
                align='left'),cells=dict(values=[data_summary_df.Metric, data_summary_df.Tenure, data_summary_df.MonthlyCharges, data_summary_df.TotalCharges],
               fill_color='lavender',align='left'))])
data_summary.update_layout(showlegend=False,autosize=True,margin=dict(t=0,b=0,l=0,r=0),height=350)

Correlation

In [132]:
df_correlation=df[['tenure','MonthlyCharges','TotalCharges']].corr()
df_correlation

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
tenure,1.0,0.229158,0.825123
MonthlyCharges,0.229158,1.0,0.638747
TotalCharges,0.825123,0.638747,1.0


In [133]:
churn_correlation_df=px.imshow(df_correlation,title='Tenure, Monthly and Total Charges Correlation')
churn_correlation_df.update_layout(legend=dict(yanchor="top",y=0.95,xanchor="left",x=0.40),autosize=True,margin=dict(t=30,b=0,l=0,r=0))

Attrition

In [134]:
attrition_df=df.groupby( [ "Churn"], as_index=False )["customerID"].count()

In [135]:
attrition_df.head()

Unnamed: 0,Churn,customerID
0,No,1654
1,Yes,589


In [136]:
colors = ['skyblue','crimson']
doughnut_attrition = go.Figure(data=[go.Pie(labels=attrition_df['Churn'].tolist(), values=attrition_df['customerID'].tolist(), hole=.3)])
doughnut_attrition.update_layout(title={'text': 'Customer Churn Distribution','y':0.9,'x':0.5, 'xanchor': 'center','yanchor': 'top'},
showlegend=False,autosize=True,annotations=[dict(text='Attrition',  font_size=20, showarrow=False)],margin=dict(t=50,b=0,l=0,r=0),height=350,colorway=colors)

Attrition by Revenue

In [137]:
totalcharges_attrition_df=df.groupby( ["Churn"], as_index=False )["TotalCharges"].sum()
totalcharges_attrition_df=totalcharges_attrition_df.sort_values(by=['TotalCharges'],ascending=True)
totalcharges_attrition_df.columns=['Churn','Revenue']
totalcharges_attrition_df

Unnamed: 0,Churn,Revenue
1,Yes,897135.8
0,No,4231231.65


In [138]:
colors = ['crimson','skyblue']
contract_barchart=px.bar(totalcharges_attrition_df,x='Churn',y='Revenue',color='Churn',text='Revenue',color_discrete_sequence=colors,
                        title='Churn by Revenue')
contract_barchart.update_layout(legend=dict(yanchor="top",y=0.95,xanchor="left",x=0.40),autosize=True,margin=dict(t=30,b=0,l=0,r=0))

Churn by PaymentMethod

In [139]:
PaymentMethod_attrition_df=df.groupby( [ "Churn","PaymentMethod"], as_index=False )["customerID"].count()
PaymentMethod_attrition_df

Unnamed: 0,Churn,PaymentMethod,customerID
0,No,Bank transfer (automatic),428
1,No,Credit card (automatic),427
2,No,Electronic check,404
3,No,Mailed check,395
4,Yes,Bank transfer (automatic),94
5,Yes,Credit card (automatic),64
6,Yes,Electronic check,349
7,Yes,Mailed check,82


In [140]:
PaymentMethod_base_df=df.groupby(["PaymentMethod"], as_index=False )["customerID"].count()
PaymentMethod_base_df['Churn']='Customer Base'
PaymentMethod_base_df

Unnamed: 0,PaymentMethod,customerID,Churn
0,Bank transfer (automatic),522,Customer Base
1,Credit card (automatic),491,Customer Base
2,Electronic check,753,Customer Base
3,Mailed check,477,Customer Base


In [141]:
PaymentMethod_attrition_df=PaymentMethod_attrition_df.append(PaymentMethod_base_df, ignore_index = True) 
PaymentMethod_attrition_df.columns=['Churn','PaymentMethod','Customers']
PaymentMethod_attrition_df=PaymentMethod_attrition_df.sort_values(by=['PaymentMethod', 'Customers'],ascending=True)
PaymentMethod_attrition_df

Unnamed: 0,Churn,PaymentMethod,Customers
4,Yes,Bank transfer (automatic),94
0,No,Bank transfer (automatic),428
8,Customer Base,Bank transfer (automatic),522
5,Yes,Credit card (automatic),64
1,No,Credit card (automatic),427
9,Customer Base,Credit card (automatic),491
6,Yes,Electronic check,349
2,No,Electronic check,404
10,Customer Base,Electronic check,753
7,Yes,Mailed check,82


In [171]:
colors = ['crimson','skyblue','teal']
techsupport_barchart=px.bar(PaymentMethod_attrition_df,x='PaymentMethod',y='Customers',color='Churn',text='Customers',color_discrete_sequence=colors,barmode="group", 
                           title='Churn by Payment Method')
techsupport_barchart.update_layout(legend=dict(yanchor="top",y=0.95,xanchor="left",x=0.40),autosize=True,margin=dict(t=30,b=0,l=0,r=0)) #use barmode='stack' when stacking,

Attrition by Gender

In [143]:
gender_attrition_df=df.groupby( [ "Churn","gender"], as_index=False )["customerID"].count()
gender_attrition_df.columns=['Churn','Gender','Customers']
gender_attrition_df

Unnamed: 0,Churn,Gender,Customers
0,No,Female,792
1,No,Male,862
2,Yes,Female,300
3,Yes,Male,289


In [172]:
colors = ['skyblue','crimson']
grouped_barchart=px.bar(gender_attrition_df,x='Gender',y='Customers',color='Churn',text='Customers',color_discrete_sequence=colors,title='Churn by Gender')
grouped_barchart.update_layout(legend=dict(yanchor="top",y=0.95,xanchor="left",x=0.46),autosize=True,margin=dict(t=30,b=0,l=0,r=0)) #use barmode='stack' when stacking,

Churn by Tech Support

In [145]:
techsupport_attrition_df=df.groupby( [ "Churn","TechSupport"], as_index=False )["customerID"].count()
techsupport_attrition_df

Unnamed: 0,Churn,TechSupport,customerID
0,No,No,647
1,No,No internet service,448
2,No,Yes,559
3,Yes,No,480
4,Yes,No internet service,32
5,Yes,Yes,77


In [146]:
techsupport_base_df=df.groupby(["TechSupport"], as_index=False )["customerID"].count()
techsupport_base_df['Churn']='Customer Base'
techsupport_base_df

Unnamed: 0,TechSupport,customerID,Churn
0,No,1127,Customer Base
1,No internet service,480,Customer Base
2,Yes,636,Customer Base


In [147]:
techsupport_attrition_df=techsupport_attrition_df.append(techsupport_base_df, ignore_index = True) 
techsupport_attrition_df.columns=['Churn','TechSupport','Customers']
techsupport_attrition_df=techsupport_attrition_df.sort_values(by=['TechSupport', 'Customers'],ascending=True)
techsupport_attrition_df

Unnamed: 0,Churn,TechSupport,Customers
3,Yes,No,480
0,No,No,647
6,Customer Base,No,1127
4,Yes,No internet service,32
1,No,No internet service,448
7,Customer Base,No internet service,480
5,Yes,Yes,77
2,No,Yes,559
8,Customer Base,Yes,636


In [173]:
colors = ['crimson','skyblue','teal']
techsupport_barchart=px.bar(techsupport_attrition_df,x='TechSupport',y='Customers',color='Churn',text='Customers',color_discrete_sequence=colors,barmode="group",
                           title='Churn by Tech Support')
techsupport_barchart.update_layout(legend=dict(yanchor="top",y=0.95,xanchor="left",x=0.50),autosize=True,margin=dict(t=30,b=0,l=0,r=0)) #use barmode='stack' when stacking,

Attrition by Contract

In [149]:
contract_attrition_df=df.groupby( [ "Churn","Contract"], as_index=False )["customerID"].count()
contract_attrition_df.head()

Unnamed: 0,Churn,Contract,customerID
0,No,Month-to-month,702
1,No,One year,412
2,No,Two year,540
3,Yes,Month-to-month,531
4,Yes,One year,51


In [150]:
contract_base_df=df.groupby(["Contract"], as_index=False )["customerID"].count()
contract_base_df['Churn']='Customer Base'
contract_base_df

Unnamed: 0,Contract,customerID,Churn
0,Month-to-month,1233,Customer Base
1,One year,463,Customer Base
2,Two year,547,Customer Base


In [151]:
contract_attrition_df=contract_attrition_df.append(contract_base_df, ignore_index = True) 
contract_attrition_df.columns=['Churn','Contract','Customers']
contract_attrition_df=contract_attrition_df.sort_values(by=['Contract', 'Customers'],ascending=True)
contract_attrition_df

Unnamed: 0,Churn,Contract,Customers
3,Yes,Month-to-month,531
0,No,Month-to-month,702
6,Customer Base,Month-to-month,1233
4,Yes,One year,51
1,No,One year,412
7,Customer Base,One year,463
5,Yes,Two year,7
2,No,Two year,540
8,Customer Base,Two year,547


In [174]:
colors = ['crimson','skyblue','teal']
contract_barchart=px.bar(contract_attrition_df,x='Contract',y='Customers',color='Churn',text='Customers',color_discrete_sequence=colors,barmode="group",title='Churn by Contract')
contract_barchart.update_layout(legend=dict(yanchor="top",y=0.95,xanchor="left",x=0.50),autosize=True,margin=dict(t=30,b=0,l=0,r=0)) #use barmode='stack' when stacking,

Churn by Monthly Charges

In [153]:
churn_dist = df[df['Churn']=='Yes']['MonthlyCharges']
no_churn_dist = df[df['Churn']=='No']['MonthlyCharges']

group_labels = ['No Churn', 'Churn Customers']

colors = ['teal','crimson']

churn_dist_fig = ff.create_distplot([no_churn_dist,churn_dist], group_labels, bin_size=[1, .10],
                         curve_type='kde',  show_rug=False, colors=colors)# override default 'kde' or 'normal'
churn_dist_fig.update_layout(title={'text': 'Customer Churn Distribution by Monthly Charges','y':0.9,'x':0.5, 'xanchor': 'center','yanchor': 'top'},
                             legend=dict(yanchor="top",y=0.95,xanchor="left",x=0.50),autosize=True,margin=dict(t=50,b=0,l=0,r=0)) #use barmode='stack' when stacking,

Attrition by Citizenship

In [154]:
citizenship_attrition_df=df.groupby( [ "Churn","SeniorCitizen"], as_index=False )["customerID"].count()
citizenship_attrition_df

Unnamed: 0,Churn,SeniorCitizen,customerID
0,No,0.0,1441
1,No,1.0,213
2,Yes,0.0,449
3,Yes,1.0,140


In [155]:
citizenship_base_df=df.groupby(["SeniorCitizen"], as_index=False )["customerID"].count()
citizenship_base_df['Churn']='Customer Base'
citizenship_base_df

Unnamed: 0,SeniorCitizen,customerID,Churn
0,0.0,1890,Customer Base
1,1.0,353,Customer Base


In [156]:
citizenship_attrition_df=citizenship_attrition_df.append(citizenship_base_df, ignore_index = True) 
citizenship_attrition_df.columns=['Churn','Citizenship','Customers']
citizenship_attrition_df=citizenship_attrition_df.sort_values(by=['Citizenship', 'Customers'],ascending=False)
citizenship_attrition_df

Unnamed: 0,Churn,Citizenship,Customers
5,Customer Base,1.0,353
1,No,1.0,213
3,Yes,1.0,140
4,Customer Base,0.0,1890
0,No,0.0,1441
2,Yes,0.0,449


In [175]:
colors = ['teal','skyblue','crimson']
citizenship_barchart=px.bar(citizenship_attrition_df,x='Customers',y=['Citizenship'],color='Churn',text='Customers',orientation="h",color_discrete_sequence=colors,barmode="group",
                           title='Churn by Citizenship')
citizenship_barchart.update_layout(legend=dict(yanchor="top",y=0.95,xanchor="left",x=0.50),autosize=True,margin=dict(t=30,b=0,l=0,r=0))

Attrition by Tenure

In [158]:
tenure_attrition_df=df.groupby( [ "Churn","tenure"], as_index=False )["customerID"].count()
tenure_attrition_df.columns=['Churn','Tenure','Customers']
tenure_attrition_df.head()

Unnamed: 0,Churn,Tenure,Customers
0,No,0.0,5
1,No,1.0,79
2,No,2.0,33
3,No,3.0,41
4,No,4.0,22


In [176]:
colors = ['skyblue','crimson']
tenure_barchart = px.treemap(tenure_attrition_df, path=['Churn', 'Tenure'], values='Customers',color_discrete_sequence=colors,
                            title='Churn by Tenure')
tenure_barchart.update_layout(legend=dict(yanchor="top",y=0.95,xanchor="left",x=0.50),autosize=True,margin=dict(t=30,b=0,l=0,r=0)) 
tenure_barchart

## Data Preprocessing

Check for null values

In [160]:
df.isnull().sum()

customerID          0
gender              1
SeniorCitizen       1
Partner             1
Dependents          1
tenure              1
PhoneService        1
MultipleLines       1
InternetService     1
OnlineSecurity      1
OnlineBackup        1
DeviceProtection    1
TechSupport         1
StreamingTV         1
StreamingMovies     1
Contract            1
PaperlessBilling    1
PaymentMethod       1
MonthlyCharges      1
TotalCharges        6
Churn               1
dtype: int64

In [161]:
df['TotalCharges']=df['TotalCharges'].fillna(df['TotalCharges'].mean()) # Impute TotalCharges null values with mean TotalCharges

In [162]:
df=df.dropna() # Drop other null columns

Convert Predictor to binary

In [163]:
df['Churn'].replace(to_replace='Yes', value=1, inplace=True)
df['Churn'].replace(to_replace='No', value=0, inplace=True)

Dummy encode categorical features

In [164]:
df['SeniorCitizen'] = df['SeniorCitizen'].astype(str)  # convert SeniorCitizen column to string

In [165]:
data_columns=['gender','Partner','Dependents','PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection', 'TechSupport', 'StreamingTV','StreamingMovies','Contract', 'PaperlessBilling', 'PaymentMethod','SeniorCitizen']

In [166]:
df=pd.get_dummies(df,columns=data_columns)
df.head()

Unnamed: 0,customerID,tenure,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,SeniorCitizen_0.0,SeniorCitizen_1.0
0,7590-VHVEG,1.0,29.85,29.85,0,1,0,0,1,1,...,0,0,0,1,0,0,1,0,1,0
1,5575-GNVDE,34.0,56.95,1889.5,0,0,1,1,0,1,...,1,0,1,0,0,0,0,1,1,0
2,3668-QPYBK,2.0,53.85,108.15,1,0,1,1,0,1,...,0,0,0,1,0,0,0,1,1,0
3,7795-CFOCW,45.0,42.3,1840.75,0,0,1,1,0,1,...,1,0,1,0,1,0,0,0,1,0
4,9237-HQITU,2.0,70.7,151.65,1,1,0,1,0,1,...,0,0,0,1,0,0,1,0,1,0


Correlation of Churn with other feature

In [169]:
churn_corr_df=pd.DataFrame(df.corr()['Churn'])
churn_corr_df.reset_index(level=0, inplace=True)
churn_corr_df.columns=['Features','Correlation']
churn_corr_df["Correlation Type"] = np.where(churn_corr_df["Correlation"]<0, 'negative', 'positive')
churn_corr_df=churn_corr_df.sort_values(by=['Correlation'],ascending=False)
churn_corr_df=churn_corr_df[~churn_corr_df['Features'].isin(['Churn'])]
churn_corr_df.head()

Unnamed: 0,Features,Correlation,Correlation Type
36,Contract_Month-to-month,0.421983,positive
27,TechSupport_No,0.372957,positive
18,OnlineSecurity_No,0.350045,positive
16,InternetService_Fiber optic,0.327067,positive
43,PaymentMethod_Electronic check,0.32453,positive


In [170]:
colors = ['orange','skyblue']
churn_corr_barchart=px.bar(churn_corr_df,x='Features',y='Correlation',color='Correlation Type',text='Correlation',color_discrete_sequence=colors)
churn_corr_barchart.update_layout(legend=dict(yanchor="top",y=0.95,xanchor="left",x=0.50),autosize=True,margin=dict(t=0,b=0,l=0,r=0))

In [54]:
df.head()

Unnamed: 0,customerID,tenure,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,SeniorCitizen_0.0,SeniorCitizen_1.0
0,7590-VHVEG,1.0,29.85,29.85,0,1,0,0,1,1,...,0,0,0,1,0,0,1,0,1,0
1,5575-GNVDE,34.0,56.95,1889.5,0,0,1,1,0,1,...,1,0,1,0,0,0,0,1,1,0
2,3668-QPYBK,2.0,53.85,108.15,1,0,1,1,0,1,...,0,0,0,1,0,0,0,1,1,0
3,7795-CFOCW,45.0,42.3,1840.75,0,0,1,1,0,1,...,1,0,1,0,1,0,0,0,1,0
4,9237-HQITU,2.0,70.7,151.65,1,1,0,1,0,1,...,0,0,0,1,0,0,1,0,1,0


Feature Rescaling with Min-Max-Scaling

In [55]:
mms_columns=['tenure','MonthlyCharges','TotalCharges']
mms_df=pd.DataFrame(df,columns=mms_columns)
df=df.drop(columns=mms_columns)
rescaled_features=MinMaxScaler().fit_transform(mms_df)
rescaled_df=pd.DataFrame(rescaled_features,columns=mms_columns,index=df.index)
df=pd.concat([df,rescaled_df],axis=1)
df.head()

Unnamed: 0,customerID,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,...,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,SeniorCitizen_0.0,SeniorCitizen_1.0,tenure,MonthlyCharges,TotalCharges
0,7590-VHVEG,0,1,0,0,1,1,0,1,0,...,1,0,0,1,0,1,0,0.013889,0.114214,0.001293
1,5575-GNVDE,0,0,1,1,0,1,0,0,1,...,0,0,0,0,1,1,0,0.472222,0.384539,0.218899
2,3668-QPYBK,1,0,1,1,0,1,0,0,1,...,1,0,0,0,1,1,0,0.027778,0.353616,0.010455
3,7795-CFOCW,0,0,1,1,0,1,0,1,0,...,0,1,0,0,0,1,0,0.625,0.238404,0.213195
4,9237-HQITU,1,1,0,1,0,1,0,0,1,...,1,0,0,1,0,1,0,0.027778,0.521696,0.015545


Feature Importance with Random Forest

In [56]:
X=df.iloc[:,2:]
y=df['Churn']

In [57]:
important_features=RandomForestClassifier()
important_features.fit(X,y)

importances=important_features.feature_importances_
indices=np.argsort(importances)[::-1]
features = [X.columns[i] for i in indices]

feat_importance_df = pd.DataFrame(importances, index=features, columns=["Importance"])
feat_importance_df.reset_index(level=0, inplace=True)
feat_importance_df.columns=['Features','Importance']
feat_importance_df.to_csv('../datasets/feature-importance.csv') # save output to csv for faster retrieval


The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.



In [58]:
feat_importance_df=pd.read_csv('../datasets/feature-importance.csv')
feat_importance_df=feat_importance_df.sort_values(by=['Importance'],ascending=False)
feat_importance_df.head()

Unnamed: 0.1,Unnamed: 0,Features,Importance
45,45,StreamingMovies_No internet service,0.147133
44,44,TechSupport_No internet service,0.119185
43,43,PhoneService_Yes,0.117918
32,32,PaymentMethod_Mailed check,0.098043
34,34,DeviceProtection_No internet service,0.041778


In [104]:
feat_importance_barchart=px.bar(feat_importance_df.head(15),x='Features',y='Importance',text='Importance',color='Importance',height=650,title='Random Feature Importance')
feat_importance_barchart.update_layout(legend=dict(yanchor="top",y=0.99,xanchor="left",x=0.01),autosize=True,margin=dict(t=30,b=0,l=0,r=0))

Reorder Columns alphabetically

In [60]:
X= X.sort_index(axis=1)

Split data into Training and Validation sets

In [61]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3, random_state=0)

In [62]:
X_train.shape, X_test.shape, y_train.shape,y_test.shape

((1570, 46), (673, 46), (1570,), (673,))

Handling Imbalanced Classes With Upsampling Using SMOTE

In [63]:
smote=SMOTE(random_state=5)

In [64]:
X_train_resampled,y_train_resampled=smote.fit_sample(X_train,y_train.ravel())

In [65]:
X_train_resampled.shape, y_train_resampled.shape

((2334, 46), (2334,))

## ML Modeling

Create Pipeline

In [66]:
logistic_reg_pipeline=Pipeline([('clf',LogisticRegression(random_state=2))])
random_forest_pipeline=Pipeline([('clf',RandomForestClassifier(random_state=2))])
svm_pipeline=Pipeline([('clf',SVC(random_state=2,probability=True))])

Set grid search params

In [67]:
param_range=[1,2,3,4,5,6,7,8,9,10]
param_range_float=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]

grid_params_lr = [{'clf__penalty': ['l1', 'l2'],
                   'clf__C': param_range_float,
                   'clf__solver': ['liblinear']}] 

grid_params_rf = [{'clf__criterion': ['gini', 'entropy'],
                   'clf__min_samples_leaf': param_range,
                   'clf__max_depth': param_range,
                   'clf__min_samples_split': param_range[1:]}]

grid_params_svm = [{'clf__kernel': ['linear', 'rbf'], 
                    'clf__C': param_range}]

Create GridSearch

In [68]:
jobs = -1

grid_search_lr = GridSearchCV(estimator=logistic_reg_pipeline,
                     param_grid=grid_params_lr,
                     scoring='accuracy',
                     cv=10) 
grid_search_rf = GridSearchCV(estimator=random_forest_pipeline,
                     param_grid=grid_params_rf,
                     scoring='accuracy',
                     cv=10, 
                     n_jobs=jobs) 
grid_search_svm = GridSearchCV(estimator=svm_pipeline,
                      param_grid=grid_params_svm,
                      scoring='accuracy',
                      cv=10,
                      n_jobs=jobs)

Organize pipeline

In [69]:
grids = [grid_search_lr, grid_search_rf, grid_search_svm]
grid_dict = {0: 'Logistic Regression',1: 'Random Forest', 2: 'Support Vector Machine'}

Fit the grid search pipeline

In [70]:
print('Optimizing models....')
best_accuracy = 0.0
best_estimator = 0
best_gridsearch = ''
model_metrics = {"Type":[],"Model":[],"Accuracy":[],"Precision":[],"Recall":[],"F_1_Score":[],"Confusion_Matrix_ROC":[],'AUC_Score':[]};
con_matrix={"Type":[],"Model":[],"Accuracy":[],"Precision":[],"Recall":[],"F_1_Score":[],"Confusion_Matrix_ROC":[],'AUC_Score':[]}
uac_roc_fpr={"FPR Model":[],"FPR":[],'AUC_Score':[]}
uac_roc_tpr={"TPR Model":[],"TPR":[],'AUC_Score':[]}

# Create a directory where to save our models
model_save_directory='Churn Models'
if not os.path.exists(model_save_directory):
    os.makedirs(model_save_directory)
    
for idx, gs in enumerate(grids):
    print('\nEstimator: %s' % grid_dict[idx])
    # Fit grid search
    gs.fit(X_train, y_train)
    # save model for later use
    model_pipeline=model_save_directory+'/'+grid_dict[idx]+'.pkl'
    joblib.dump(gs,model_pipeline , compress=1)
    print('\n%s grid search pipeline saved to file: %s' % (grid_dict[idx], model_pipeline))
    # Best params
    print('Best params: %s' % gs.best_params_)
    # Best training data accuracy
    print('Best training accuracy: %.3f' % gs.best_score_)
    # Predict on test data with best params
    y_pred = gs.predict(X_test)
    pred_prob = gs.predict_proba(X_test) # predict probabilities
    fpr, tpr, thresh = roc_curve(y_test, pred_prob[:,1], pos_label=1) # roc curve for models
    auc_score = roc_auc_score(y_test, pred_prob[:,1]) # auc score
    # Test data accuracy of model with best params
    # Evaluate models and store metrics in a dataframe
    print('Test set accuracy score for best params: %.3f ' % accuracy_score(y_test, y_pred))
            # Append metrics ,accuracy_score(y_test, y_pred
    model_metrics["Type"].append('Metric')
    model_metrics["Model"].append(grid_dict[idx])
    model_metrics["Accuracy"].append(accuracy_score(y_test, y_pred)*100)
    model_metrics["Precision"].append(precision_score(y_test, y_pred)*100)
    model_metrics["Recall"].append(recall_score(y_test, y_pred)*100)
    model_metrics["F_1_Score"].append(f1_score(y_test, y_pred)*100)
    model_metrics["Confusion_Matrix_ROC"].append(np.nan)
    model_metrics["AUC_Score"].append(auc_score)
     
        # confusion matrix
    con_matrix["Type"].append('Confusion_Matrix')
    con_matrix["Model"].append(grid_dict[idx])  
    con_matrix["Accuracy"].append(np.nan)
    con_matrix["Precision"].append(np.nan)
    con_matrix["Recall"].append(np.nan)
    con_matrix["F_1_Score"].append(np.nan)
    con_matrix["AUC_Score"].append(np.nan)
    con_matrix["Confusion_Matrix_ROC"].append(confusion_matrix(y_test, y_pred))

    #UAC ROC
    uac_roc_fpr['FPR Model'].append(grid_dict[idx]) 
    uac_roc_fpr['FPR'].append(fpr)
    uac_roc_fpr['AUC_Score'].append(auc_score) 
    
    uac_roc_tpr['TPR Model'].append(grid_dict[idx]) 
    uac_roc_tpr['TPR'].append(tpr)
    uac_roc_tpr['AUC_Score'].append(auc_score) 
    # Track best (highest test accuracy) model
    if accuracy_score(y_test, y_pred) > best_accuracy:
        best_accuracy = accuracy_score(y_test, y_pred)
        best_gridsearch = gs
        best_estimator = idx
print('\nClassifier with best test set accuracy: %s' % grid_dict[best_estimator])

# print(model_metrics)
print(con_matrix)

Optimizing models....

Estimator: Logistic Regression

Logistic Regression grid search pipeline saved to file: Churn Models/Logistic Regression.pkl
Best params: {'clf__C': 0.1, 'clf__penalty': 'l1', 'clf__solver': 'liblinear'}
Best training accuracy: 0.813
Test set accuracy score for best params: 0.801 

Estimator: Random Forest



The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.




Random Forest grid search pipeline saved to file: Churn Models/Random Forest.pkl
Best params: {'clf__criterion': 'gini', 'clf__max_depth': 7, 'clf__min_samples_leaf': 8, 'clf__min_samples_split': 2}
Best training accuracy: 0.818
Test set accuracy score for best params: 0.798 

Estimator: Support Vector Machine

Support Vector Machine grid search pipeline saved to file: Churn Models/Support Vector Machine.pkl
Best params: {'clf__C': 6, 'clf__kernel': 'linear'}
Best training accuracy: 0.811
Test set accuracy score for best params: 0.793 

Classifier with best test set accuracy: Logistic Regression
{'Type': ['Confusion_Matrix', 'Confusion_Matrix', 'Confusion_Matrix'], 'Model': ['Logistic Regression', 'Random Forest', 'Support Vector Machine'], 'Accuracy': [nan, nan, nan], 'Precision': [nan, nan, nan], 'Recall': [nan, nan, nan], 'F_1_Score': [nan, nan, nan], 'Confusion_Matrix_ROC': [array([[444,  43],
       [ 91,  95]], dtype=int64), array([[445,  42],
       [ 94,  92]], dtype=int64), a

In [71]:
# Save best grid search pipeline to file
dump_file =model_save_directory+'/best_gridsearch_model_pipeline.pkl'
joblib.dump(best_gridsearch, dump_file, compress=1)
print('\nSaved %s grid search model pipeline to : %s' % (grid_dict[best_estimator], dump_file))


Saved Logistic Regression grid search model pipeline to : Churn Models/best_gridsearch_model_pipeline.pkl


## Evaluate Models

Compute Metrics

In [72]:
model_accuracy_df=pd.DataFrame(model_metrics,columns=['Type','Model','Accuracy','Precision','Recall','F_1_Score','Confusion_Matrix_ROC','AUC_Score'])
model_accuracy_df

Unnamed: 0,Type,Model,Accuracy,Precision,Recall,F_1_Score,Confusion_Matrix_ROC,AUC_Score
0,Metric,Logistic Regression,80.089153,68.84058,51.075269,58.641975,,0.846951
1,Metric,Random Forest,79.791976,68.656716,49.462366,57.5,,0.849098
2,Metric,Support Vector Machine,79.346211,65.16129,54.301075,59.237537,,0.832472


In [73]:
con_matrix_df=pd.DataFrame(con_matrix,columns=['Type','Model','Accuracy','Precision','Recall','F_1_Score','Confusion_Matrix_ROC','AUC_Score'])
con_matrix_df

Unnamed: 0,Type,Model,Accuracy,Precision,Recall,F_1_Score,Confusion_Matrix_ROC,AUC_Score
0,Confusion_Matrix,Logistic Regression,,,,,"[[444, 43], [91, 95]]",
1,Confusion_Matrix,Random Forest,,,,,"[[445, 42], [94, 92]]",
2,Confusion_Matrix,Support Vector Machine,,,,,"[[433, 54], [85, 101]]",


In [74]:
uac_roc_fpr_df=pd.DataFrame(uac_roc_fpr,columns=['FPR Model','FPR','AUC_Score'])
uac_roc_fpr_df['Model']=uac_roc_fpr_df['FPR Model']+' FPR'
uac_roc_fpr_df.columns=['FPR Model','Confusion_Matrix_ROC','AUC_Score','Model']
uac_roc_tpr_df=pd.DataFrame(uac_roc_tpr,columns=['TPR Model','TPR','AUC_Score'])
uac_roc_tpr_df['Model']=uac_roc_tpr_df['TPR Model']+' TPR'
uac_roc_tpr_df.columns=['TPR Model','Confusion_Matrix_ROC','AUC_Score','Model']

uac_roc_fpr_df[['Model','Confusion_Matrix_ROC','AUC_Score']]
uac_roc_tpr_df[['Model','Confusion_Matrix_ROC','AUC_Score']]
uac_roc_df=uac_roc_fpr_df[['Model','Confusion_Matrix_ROC','AUC_Score']].append(uac_roc_tpr_df[['Model','Confusion_Matrix_ROC','AUC_Score']], ignore_index = True)
uac_roc_df['Type']='ROC'
uac_roc_df[['Accuracy','Precision','Recall','F_1_Score']]=np.nan
uac_roc_df=uac_roc_df[['Type','Model','Accuracy','Precision','Recall','F_1_Score','Confusion_Matrix_ROC','AUC_Score']]
uac_roc_df

Unnamed: 0,Type,Model,Accuracy,Precision,Recall,F_1_Score,Confusion_Matrix_ROC,AUC_Score
0,ROC,Logistic Regression FPR,,,,,"[0.0, 0.0, 0.0, 0.002053388090349076, 0.002053...",0.846951
1,ROC,Random Forest FPR,,,,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.002053388090349076...",0.849098
2,ROC,Support Vector Machine FPR,,,,,"[0.0, 0.0, 0.0, 0.002053388090349076, 0.002053...",0.832472
3,ROC,Logistic Regression TPR,,,,,"[0.0, 0.005376344086021506, 0.0376344086021505...",0.846951
4,ROC,Random Forest TPR,,,,,"[0.0, 0.005376344086021506, 0.0161290322580645...",0.849098
5,ROC,Support Vector Machine TPR,,,,,"[0.0, 0.005376344086021506, 0.0376344086021505...",0.832472


In [75]:
model_evaluation_df=model_accuracy_df.append(con_matrix_df, ignore_index = True) 

In [76]:
model_evaluation_df=model_evaluation_df.append(uac_roc_df, ignore_index = True) 

In [77]:
model_evaluation_df['AUC_Score']=model_evaluation_df['AUC_Score']*100
model_evaluation_df

Unnamed: 0,Type,Model,Accuracy,Precision,Recall,F_1_Score,Confusion_Matrix_ROC,AUC_Score
0,Metric,Logistic Regression,80.089153,68.84058,51.075269,58.641975,,84.695083
1,Metric,Random Forest,79.791976,68.656716,49.462366,57.5,,84.909805
2,Metric,Support Vector Machine,79.346211,65.16129,54.301075,59.237537,,83.247224
3,Confusion_Matrix,Logistic Regression,,,,,"[[444, 43], [91, 95]]",
4,Confusion_Matrix,Random Forest,,,,,"[[445, 42], [94, 92]]",
5,Confusion_Matrix,Support Vector Machine,,,,,"[[433, 54], [85, 101]]",
6,ROC,Logistic Regression FPR,,,,,"[0.0, 0.0, 0.0, 0.002053388090349076, 0.002053...",84.695083
7,ROC,Random Forest FPR,,,,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.002053388090349076...",84.909805
8,ROC,Support Vector Machine FPR,,,,,"[0.0, 0.0, 0.0, 0.002053388090349076, 0.002053...",83.247224
9,ROC,Logistic Regression TPR,,,,,"[0.0, 0.005376344086021506, 0.0376344086021505...",84.695083


Save Model Evaluation data to a json

In [78]:
model_evaluation_df.to_json(model_save_directory+'/model_metrics.json', orient = 'split', compression = 'infer', index = 'true') 

Load the json data

In [79]:
# readind the JSON file 
model_evaluation_df = pd.read_json(model_save_directory+'/model_metrics.json', orient ='split', compression = 'infer') 
  
# displaying the DataFrame 
model_evaluation_df

Unnamed: 0,Type,Model,Accuracy,Precision,Recall,F_1_Score,Confusion_Matrix_ROC,AUC_Score
0,Metric,Logistic Regression,80.089153,68.84058,51.075269,58.641975,,84.695083
1,Metric,Random Forest,79.791976,68.656716,49.462366,57.5,,84.909805
2,Metric,Support Vector Machine,79.346211,65.16129,54.301075,59.237537,,83.247224
3,Confusion_Matrix,Logistic Regression,,,,,"[[444, 43], [91, 95]]",
4,Confusion_Matrix,Random Forest,,,,,"[[445, 42], [94, 92]]",
5,Confusion_Matrix,Support Vector Machine,,,,,"[[433, 54], [85, 101]]",
6,ROC,Logistic Regression FPR,,,,,"[0.0, 0.0, 0.0, 0.0020533881, 0.0020533881, 0....",84.695083
7,ROC,Random Forest FPR,,,,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0020533881, 0.0020...",84.909805
8,ROC,Support Vector Machine FPR,,,,,"[0.0, 0.0, 0.0, 0.0020533881, 0.0020533881, 0....",83.247224
9,ROC,Logistic Regression TPR,,,,,"[0.0, 0.0053763441, 0.0376344086, 0.0430107527...",84.695083


In [80]:
unpivoted_metric_df=model_evaluation_df[model_evaluation_df['Type']=='Metric'][['Model','Accuracy','Precision','Recall','F_1_Score','AUC_Score']]
unpivoted_metric_df=unpivoted_metric_df.melt(id_vars=['Model'], var_name='Metrics', value_name='Score').sort_values(by=['Score'],ascending=True)
unpivoted_metric_df

Unnamed: 0,Model,Metrics,Score
7,Random Forest,Recall,49.462366
6,Logistic Regression,Recall,51.075269
8,Support Vector Machine,Recall,54.301075
10,Random Forest,F_1_Score,57.5
9,Logistic Regression,F_1_Score,58.641975
11,Support Vector Machine,F_1_Score,59.237537
5,Support Vector Machine,Precision,65.16129
4,Random Forest,Precision,68.656716
3,Logistic Regression,Precision,68.84058
2,Support Vector Machine,Accuracy,79.346211


In [105]:
colors = ['crimson','skyblue','teal','orange']
model_accuracy_barchart=px.bar(unpivoted_metric_df,x='Metrics',y='Score',color='Model',text='Score',color_discrete_sequence=colors,barmode="group",title='Model Perforance Metrics')
model_accuracy_barchart.update_layout(legend=dict(yanchor="top",y=0.95,xanchor="left",x=0.01),autosize=True,margin=dict(t=30,b=0,l=0,r=0)) #use barmode='stack' when stacking,

Confusion Matrix

In [82]:
con_matrix_df=model_evaluation_df[model_evaluation_df['Type']=='Confusion_Matrix'][['Model','Confusion_Matrix_ROC']]
con_matrix_df.reset_index(level=0, inplace=True)
con_matrix_df

Unnamed: 0,index,Model,Confusion_Matrix_ROC
0,3,Logistic Regression,"[[444, 43], [91, 95]]"
1,4,Random Forest,"[[445, 42], [94, 92]]"
2,5,Support Vector Machine,"[[433, 54], [85, 101]]"


In [83]:
random_f_z=con_matrix_df['Confusion_Matrix_ROC'][1]
random_f_z= random_f_z[::-1]
x=['TP','FP']

y =  x[::-1].copy()
# change each element of z to type string for annotations
random_f_z_text = [[str(y) for y in x] for x in random_f_z]
colorscale = [[0, 'orange'], [1, 'teal']]
font_colors = ['white', 'black']

fig = ff.create_annotated_heatmap(random_f_z,x=x, y=y, annotation_text=random_f_z_text,  hoverinfo='z',colorscale=colorscale)
fig.update_layout(title_text='Random Forest',autosize=True,margin=dict(t=30,b=0,l=0,r=0))

In [84]:
logistic_z=con_matrix_df['Confusion_Matrix_ROC'][0]
logistic_z= logistic_z[::-1]
x=['TP','FP']

y =  x[::-1].copy()
# change each element of z to type string for annotations
logistic_z_text = [[str(y) for y in x] for x in logistic_z]
colorscale = [[0, 'skyblue'], [1, 'green']]

fig = ff.create_annotated_heatmap(logistic_z,x=x, y=y, annotation_text=logistic_z_text,  hoverinfo='z',colorscale=colorscale)
fig.update_layout(title_text='Logistic Regression',autosize=True,margin=dict(t=30,b=0,l=0,r=0))

In [85]:
svm_z=con_matrix_df['Confusion_Matrix_ROC'][2]
svm_z= svm_z[::-1]
x=['TP','FP']

y =  x[::-1].copy()
# change each element of z to type string for annotations
svm_z_text = [[str(y) for y in x] for x in svm_z]
colorscale = [[0, 'crimson'], [1, 'green']]

fig = ff.create_annotated_heatmap(svm_z,x=x, y=y, annotation_text=svm_z_text,  hoverinfo='z',colorscale='rainbow')
fig.update_layout(title_text='Support Vector Machine',autosize=True,margin=dict(t=30,b=0,l=0,r=0))

UAC ROC

In [86]:
uac_roc_df=model_evaluation_df[model_evaluation_df['Type']=='ROC'][['Model','Confusion_Matrix_ROC']]

In [87]:
uac_roc_df=uac_roc_df.sort_values(by=['Model'],ascending=True)
uac_roc_df=uac_roc_df.set_index('Model').transpose()
uac_roc_df

Model,Logistic Regression FPR,Logistic Regression TPR,Random Forest FPR,Random Forest TPR,Support Vector Machine FPR,Support Vector Machine TPR
Confusion_Matrix_ROC,"[0.0, 0.0, 0.0, 0.0020533881, 0.0020533881, 0....","[0.0, 0.0053763441, 0.0376344086, 0.0430107527...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0020533881, 0.0020...","[0.0, 0.0053763441, 0.016129032300000002, 0.02...","[0.0, 0.0, 0.0, 0.0020533881, 0.0020533881, 0....","[0.0, 0.0053763441, 0.0376344086, 0.0376344086..."


In [106]:
uac_roc_fig = go.Figure()
uac_roc_fig.add_trace(go.Scatter(x=uac_roc_df['Logistic Regression FPR'][0], y=uac_roc_df['Logistic Regression TPR'][0],name='Logistic Regression',
                                line = dict(color='teal', width=2),line_shape='spline'))
uac_roc_fig.add_trace(go.Scatter(x=uac_roc_df['Random Forest FPR'][0], y=uac_roc_df['Random Forest TPR'][0],name='Random Forest',
                                line = dict(color='royalblue', width=2),line_shape='spline'))
uac_roc_fig.add_trace(go.Scatter(x=uac_roc_df['Support Vector Machine FPR'][0], y=uac_roc_df['Support Vector Machine TPR'][0],name='Support Vector Machine',
                                line = dict(color='orange', width=2),line_shape='spline'))
uac_roc_fig.add_trace(go.Scatter(x=np.array([0., 1.]), y=np.array([0., 1.]),name='Random Gues',
                                line = dict(color='firebrick', width=4, dash='dash')))
uac_roc_fig.update_layout(title={'text': 'AUC-ROC Model Evaluation','y':0.9,'x':0.5, 'xanchor': 'center','yanchor': 'top'},
                          legend=dict(yanchor="bottom",y=0.05,xanchor="right",x=0.95),autosize=True,margin=dict(t=70,b=0,l=0,r=0))
uac_roc_fig.show()

## Model Prediction

In [89]:
X.columns

Index(['Contract_Month-to-month', 'Contract_One year', 'Contract_Two year',
       'Dependents_No', 'Dependents_Yes', 'DeviceProtection_No',
       'DeviceProtection_No internet service', 'DeviceProtection_Yes',
       'InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No', 'MonthlyCharges', 'MultipleLines_No',
       'MultipleLines_No phone service', 'MultipleLines_Yes',
       'OnlineBackup_No', 'OnlineBackup_No internet service',
       'OnlineBackup_Yes', 'OnlineSecurity_No',
       'OnlineSecurity_No internet service', 'OnlineSecurity_Yes',
       'PaperlessBilling_No', 'PaperlessBilling_Yes', 'Partner_No',
       'Partner_Yes', 'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check',
       'PhoneService_No', 'PhoneService_Yes', 'SeniorCitizen_0.0',
       'SeniorCitizen_1.0', 'StreamingMovies_No',
       'StreamingMovies_No internet service', 'Strea

In [90]:
df=pd.read_csv("../datasets/telco-customer-churn.csv")
df_copy=df
df_copy=df_copy.drop(columns=['Churn'])
df_copy['TotalCharges']=pd.to_numeric(df_copy['TotalCharges'], errors='coerce')
pred_data=pd.read_csv('../datasets/telco_pred_data.csv',index_col=['ID'])
pred_data=pred_data.drop(columns=['Unnamed: 0'])
df_copy.set_index("customerID", inplace = True)
pred_df=df_copy.append(pred_data)
pred_df.tail()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
3398-GCPMU,Female,1.0,Yes,Yes,72.0,Yes,Yes,DSL,Yes,Yes,Yes,Yes,Yes,Yes,Two year,Yes,Credit card (automatic),89.55,6448.85
2908-WGAXL,Female,0.0,Yes,Yes,56.0,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,Yes,Credit card (automatic),24.95,1468.9
3378-AJRAO,Male,0.0,Yes,Yes,44.0,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,No,Electronic check,24.85,1013.6
1013-QCW,,,,,,,,,,,,,,,,,,,
1,Female,1.0,Yes,No,4.0,No,Yes,Fiber optic,Yes,No,Yes,No internet service,Yes,No internet service,Two year,No,Electronic check,70.0,300.0


In [91]:
pred_df_columns=['gender','Partner','Dependents','PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection', 'TechSupport', 'StreamingTV','StreamingMovies','Contract', 'PaperlessBilling', 'PaymentMethod','SeniorCitizen']
pred_df=pd.get_dummies(pred_df,columns=pred_df_columns)
pred_df.shape

(2245, 46)

In [92]:
pred_df.tail()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,SeniorCitizen_0.0,SeniorCitizen_1.0
3398-GCPMU,72.0,89.55,6448.85,1,0,0,1,0,1,0,...,0,1,0,1,0,1,0,0,0,1
2908-WGAXL,56.0,24.95,1468.9,1,0,0,1,0,1,0,...,0,1,0,1,0,1,0,0,1,0
3378-AJRAO,44.0,24.85,1013.6,0,1,0,1,0,1,0,...,1,0,1,0,0,0,1,0,1,0
1013-QCW,,,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4.0,70.0,300.0,1,0,0,1,1,0,1,...,0,1,1,0,0,0,1,0,0,1


In [93]:
pred_mms_columns=['tenure','MonthlyCharges','TotalCharges']
pred_mms_df=pd.DataFrame(pred_df,columns=pred_mms_columns)
pred_df=pred_df.drop(columns=pred_mms_columns)

In [94]:
pred_rescaled_features=MinMaxScaler().fit_transform(pred_mms_df)
pred_rescaled_df=pd.DataFrame(pred_rescaled_features,columns=pred_mms_columns,index=pred_df.index)
pred_df=pd.concat([pred_df,pred_rescaled_df],axis=1)
pred_df.head()

Unnamed: 0,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,MultipleLines_No phone service,...,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,SeniorCitizen_0.0,SeniorCitizen_1.0,tenure,MonthlyCharges,TotalCharges
7590-VHVEG,1,0,0,1,1,0,1,0,0,1,...,1,0,0,1,0,1,0,0.013889,0.114214,0.001293
5575-GNVDE,0,1,1,0,1,0,0,1,1,0,...,0,0,0,0,1,1,0,0.472222,0.384539,0.218899
3668-QPYBK,0,1,1,0,1,0,0,1,1,0,...,1,0,0,0,1,1,0,0.027778,0.353616,0.010455
7795-CFOCW,0,1,1,0,1,0,1,0,0,1,...,0,1,0,0,0,1,0,0.625,0.238404,0.213195
9237-HQITU,1,0,1,0,1,0,0,1,1,0,...,1,0,0,1,0,1,0,0.027778,0.521696,0.015545


In [95]:
pred_df= pred_df.sort_index(axis=1)
pred_df.head()

Unnamed: 0,Contract_Month-to-month,Contract_One year,Contract_Two year,Dependents_No,Dependents_Yes,DeviceProtection_No,DeviceProtection_No internet service,DeviceProtection_Yes,InternetService_DSL,InternetService_Fiber optic,...,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,TechSupport_No,TechSupport_No internet service,TechSupport_Yes,TotalCharges,gender_Female,gender_Male,tenure
7590-VHVEG,1,0,0,1,0,1,0,0,1,0,...,1,0,0,1,0,0,0.001293,1,0,0.013889
5575-GNVDE,0,1,0,1,0,0,0,1,1,0,...,1,0,0,1,0,0,0.218899,0,1,0.472222
3668-QPYBK,1,0,0,1,0,1,0,0,1,0,...,1,0,0,1,0,0,0.010455,0,1,0.027778
7795-CFOCW,0,1,0,1,0,0,0,1,1,0,...,1,0,0,0,0,1,0.213195,0,1,0.625
9237-HQITU,1,0,0,1,0,1,0,0,0,1,...,1,0,0,1,0,0,0.015545,1,0,0.027778


In [96]:
# Load from file
joblib_model = joblib.load(dump_file)

predict_probability=joblib_model.predict_proba(pred_df.head(1))
print(predict_probability[:,0])
print(predict_probability[:,1])
y_predict = joblib_model.predict(pred_df.head(1))
y_predict[0]

[0.42360931]
[0.57639069]


1

In [97]:
# Load from file
joblib_model = joblib.load(dump_file)

predict_probability=joblib_model.predict_proba(pred_df.head(5))
print(predict_probability[:,0])
print(predict_probability[:,1])
y_predict = joblib_model.predict(pred_df.head(5))
results_df = pd.DataFrame({'No Probability':predict_probability[:,0], 'Yes Probability':predict_probability[:,1],'Prediction':y_predict})
df[['No Probability','Yes Probability','Prediction']]=results_df
df['Prediction'].replace(to_replace=1.0, value='Yes', inplace=True)
df['Prediction'].replace(to_replace=0.0, value='No', inplace=True)
pred_confidence=[]
for index, row in df.iterrows():
    if row['Prediction']=='Yes':
        pred_confidence.append(row['Yes Probability']*100)
    else:
        pred_confidence.append(row['No Probability']*100)
df['Prediction Confidence']=pred_confidence

[0.42360931 0.92130368 0.67179782 0.93804486 0.30842329]
[0.57639069 0.07869632 0.32820218 0.06195514 0.69157671]


In [98]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,No Probability,Yes Probability,Prediction,Prediction Confidence
0,7590-VHVEG,Female,0.0,Yes,No,1.0,No,No phone service,DSL,No,...,Month-to-month,Yes,Electronic check,29.85,29.85,No,0.423609,0.576391,Yes,57.639069
1,5575-GNVDE,Male,0.0,No,No,34.0,Yes,No,DSL,Yes,...,One year,No,Mailed check,56.95,1889.5,No,0.921304,0.078696,No,92.130368
2,3668-QPYBK,Male,0.0,No,No,2.0,Yes,No,DSL,Yes,...,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,0.671798,0.328202,No,67.179782
3,7795-CFOCW,Male,0.0,No,No,45.0,No,No phone service,DSL,Yes,...,One year,No,Bank transfer (automatic),42.3,1840.75,No,0.938045,0.061955,No,93.804486
4,9237-HQITU,Female,0.0,No,No,2.0,Yes,No,Fiber optic,No,...,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,0.308423,0.691577,Yes,69.157671
