In [1]:
# Notebook setup: imports, cell-display behaviour, and plotting initialisation.
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

import numpy as np 
import pandas as pd 
import plotly.express as px
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix
from imblearn.under_sampling import NearMiss
from collections import Counter

from plotly.offline import init_notebook_mode, iplot
# Initialise plotly's offline notebook mode before any figure is shown.
init_notebook_mode(connected=True)

# Show every column when a wide DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [2]:
# Load the Telco churn export and keep only the rows that have no missing cells.
TELCO_CSV_PATH = "../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv"
cus_raw = pd.read_csv(TELCO_CSV_PATH)
cus_df = cus_raw.dropna(axis=0, how="any")
cus_df.head(n=5)

print(f'There are a total of {len(cus_df)} observations in the dataset.')

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


There are a total of 7043 observations in the dataset.


In [3]:
# Show the dtypes as loaded (displayed before the conversion below runs).
cus_df.dtypes
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN.
total_charges_numeric = pd.to_numeric(cus_df['TotalCharges'], errors='coerce')
cus_df['TotalCharges'] = total_charges_numeric

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [4]:
# Number of distinct values in each column.
cus_df.nunique()

# Summary statistics for the numeric columns.
cus_df.describe()

customerID          7043
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                73
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1585
TotalCharges        6530
Churn                  2
dtype: int64

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
count,7043.0,7043.0,7043.0,7032.0
mean,0.162147,32.371149,64.761692,2283.300441
std,0.368612,24.559481,30.090047,2266.771362
min,0.0,0.0,18.25,18.8
25%,0.0,9.0,35.5,401.45
50%,0.0,29.0,70.35,1397.475
75%,0.0,55.0,89.85,3794.7375
max,1.0,72.0,118.75,8684.8


In [None]:
# Pie chart of how many customers fall into each Churn category.
churn = cus_df['Churn'].value_counts()
fig = px.pie(
    cus_df,
    names=churn.index,
    values=churn.values,
    title='Churn Breakdown',
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig.show()

In [None]:
# One histogram per (column, title) pair, drawn in the listed order.
# `fig` is left bound to the last figure, exactly as with separate statements.
for hist_column, hist_title in (
    ("tenure", "Tenure Breakdown"),
    ("MultipleLines", "Lines Breakdown"),
    ("gender", "Gender Breakdown"),
    ("Partner", "Partner Breakdown"),
):
    fig = px.histogram(
        cus_df,
        x=hist_column,
        title=hist_title,
        color_discrete_sequence=['indianred'],
    )
    fig.show()

In [None]:
# Tenure vs. total charges, coloured by churn, faceted first by Contract and
# then by Partner. Both figures share every setting except facet and title.
for facet_field, facet_title in (
    ('Contract', 'Customer Churn by Contract'),
    ('Partner', 'Customer Churn by Partner'),
):
    fig = px.scatter(
        x=cus_df['tenure'],
        y=cus_df['TotalCharges'],
        color=cus_df['Churn'],
        facet_col=cus_df[facet_field],
        color_discrete_sequence=px.colors.qualitative.Antique,
        template='presentation',
        opacity=0.7,
        title=facet_title,
        labels={'x': 'Tenure', 'y': 'Total Charges'},
    )
    fig.show()

## Data Preparation

In [8]:
# Preview two rows, then list the distinct values found in every column.
cus_df.head(2)
for col in cus_df.columns:
    # Separate the column name from its values; previously an empty string was
    # concatenated, so the name ran straight into the array (e.g. "gender['Female' 'Male']").
    print(f"{col}: {cus_df[col].unique()}")

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No


customerID['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' ... '4801-JZAZL' '8361-LTMKD'
 '3186-AJIEK']
gender['Female' 'Male']
SeniorCitizen[0 1]
Partner['Yes' 'No']
Dependents['No' 'Yes']
tenure[ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]
PhoneService['No' 'Yes']
MultipleLines['No phone service' 'No' 'Yes']
InternetService['DSL' 'Fiber optic' 'No']
OnlineSecurity['No' 'Yes' 'No internet service']
OnlineBackup['Yes' 'No' 'No internet service']
DeviceProtection['No' 'Yes' 'No internet service']
TechSupport['No' 'Yes' 'No internet service']
StreamingTV['No' 'Yes' 'No internet service']
StreamingMovies['No' 'Yes' 'No internet service']
Contract['Month-to-month' 'One year' 'Two year']
PaperlessBilling['Yes' 'No']
PaymentMethod['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
Monthl

In [9]:
# One-hot encode the categorical feature columns. drop_first=True drops the
# first level of each column so the dummies are not perfectly collinear.
CATEGORICAL_COLUMNS = [
    'gender', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
    'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
]
cusohe_df = pd.get_dummies(data=cus_df, columns=CATEGORICAL_COLUMNS, drop_first=True)
cusohe_df

# Encode the target as integers with an explicit mapping. Chained
# .replace("No", 0).replace("Yes", 1) relies on pandas silently downcasting an
# object column to int, which is deprecated in recent pandas and would leave an
# object-dtype target. astype("int64") also raises if a label other than
# "No"/"Yes" appears (map would turn it into NaN) instead of passing it on.
cusohe_df['Churn'] = cusohe_df['Churn'].map({"No": 0, "Yes": 1}).astype("int64")

Unnamed: 0,customerID,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,7590-VHVEG,0,1,29.85,29.85,No,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0
1,5575-GNVDE,0,34,56.95,1889.50,No,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1
2,3668-QPYBK,0,2,53.85,108.15,Yes,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1
3,7795-CFOCW,0,45,42.30,1840.75,No,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0
4,9237-HQITU,0,2,70.70,151.65,Yes,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,0,24,84.80,1990.50,No,1,1,1,1,0,1,0,0,0,1,0,0,0,1,0,1,0,1,0,1,1,0,1,0,0,1
7039,2234-XADUH,0,72,103.20,7362.90,No,0,1,1,1,0,1,1,0,0,0,0,1,0,1,0,0,0,1,0,1,1,0,1,1,0,0
7040,4801-JZAZL,0,11,29.60,346.45,No,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
7041,8361-LTMKD,1,4,74.40,306.60,Yes,1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1


In [10]:
# True if any cell anywhere in the encoded frame is missing.
cusohe_df.isnull().values.any()

# True if the TotalCharges column itself contains missing values.
cusohe_df['TotalCharges'].isnull().values.any()

True

True

**Impute missing values in TotalCharges with mean**

In [11]:
from sklearn.impute import SimpleImputer

# Replace the NaN entries of TotalCharges with the mean of the column.
# SimpleImputer works on 2-D input, so the column is reshaped to (n, 1) going
# in and flattened again before being written back.
total_charges_column = cusohe_df["TotalCharges"].values.reshape(-1, 1)
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
cusohe_df["TotalCharges"] = imputer.fit_transform(total_charges_column).ravel()

## **Dealing with Class Imbalance with SMOTE**

Shape of X before SMOTE: (7043, 30)
Shape of X after SMOTE: (10348, 30)

Balance of positive and negative classes (%):


0    50.0
1    50.0
Name: Churn, dtype: float64

**Random Forest Classifier**

In [13]:
# Hold out 25% of the resampled data, fit a random forest on the rest, and
# report accuracy and recall on the held-out part.
# NOTE(review): X_sm / y_sm are defined in a cell not shown here and appear to
# be the already-resampled data; if so, the held-out fold contains synthetic
# samples and the scores below are optimistic — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X_sm,
    y_sm,
    test_size=0.25,
    random_state=42,
)

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
preds = model.predict(X_test)

rf_accuracy = accuracy_score(y_test, preds)
rf_recall = recall_score(y_test, preds)
print('Accuracy = {:.2f}\nRecall = {:.2f}\n'.format(rf_accuracy, rf_recall))
cm = confusion_matrix(y_test, preds)

RandomForestClassifier(random_state=42)

Accuracy = 0.86
Recall = 0.85



**XGBoost Classifier**

In [14]:
from xgboost import XGBClassifier

# XGBoost with the linear booster, fitted on the same training split as the
# random forest and scored by accuracy on the same held-out split.
xgb = XGBClassifier(
    n_estimators=10,
    learning_rate=1,
    booster='gblinear',
)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

xgb_accuracy = accuracy_score(y_test, y_pred)
print(xgb_accuracy)







XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, gamma=None, gpu_id=-1,
              importance_type=None, interaction_constraints=None,
              learning_rate=1, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=10, n_jobs=4, num_parallel_tree=None, predictor=None,
              random_state=0, reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
              subsample=None, tree_method=None, validate_parameters=1,
              verbosity=None)

0.818708929261693


## **Dealing with Class Imbalance with SMOTE + ENN**

In [15]:
from imblearn.combine import SMOTEENN

# Combined over-/under-sampling: SMOTE followed by Edited Nearest Neighbours.
# The resampler is seeded so the synthetic samples, and every metric computed
# from them, are identical on each run. The train/test split and the random
# forest in this notebook already use seed 42; an unseeded SMOTEENN() made this
# section's numbers change from run to run.
# NOTE(review): resampling is applied to the full X / Y (defined in a cell not
# shown here), not to a training split only — confirm that the later evaluation
# on X_test is not scored on rows the model has effectively seen.
smenn = SMOTEENN(random_state=42)
X_train_smenn, y_train_smenn = smenn.fit_resample(X, Y)

print('Original dataset shape:', Counter(Y))
print('Resample dataset shape:', Counter(y_train_smenn))

Original dataset shape: Counter({0: 5174, 1: 1869})
Resample dataset shape: Counter({1: 3182, 0: 2686})


**Random Forest Classifier**

In [16]:
# Refit the existing random forest on the SMOTE+ENN resample and score it on
# the X_test / y_test hold-out created earlier.
model.fit(X_train_smenn, y_train_smenn)
preds = model.predict(X_test)

smenn_accuracy = accuracy_score(y_test, preds)
smenn_recall = recall_score(y_test, preds)
print('Accuracy = {:.2f}\nRecall = {:.2f}\n'.format(smenn_accuracy, smenn_recall))
cm = confusion_matrix(y_test, preds)

RandomForestClassifier(random_state=42)

Accuracy = 0.86
Recall = 0.90



**XGBoost Classifier**

In [17]:
# Refit the existing XGBoost model on the SMOTE+ENN resample and print its
# accuracy on the X_test / y_test hold-out.
xgb.fit(X_train_smenn, y_train_smenn)
y_pred = xgb.predict(X_test)
smenn_xgb_accuracy = accuracy_score(y_test, y_pred)
print(smenn_xgb_accuracy)







XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, gamma=None, gpu_id=-1,
              importance_type=None, interaction_constraints=None,
              learning_rate=1, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=10, n_jobs=4, num_parallel_tree=None, predictor=None,
              random_state=0, reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
              subsample=None, tree_method=None, validate_parameters=1,
              verbosity=None)

0.8117510630073445


## **Dealing with Class Imbalance with NearMiss**

In [18]:
# Under-sample with NearMiss and report the class counts before and after.
nm = NearMiss()
X_nm, Y_nm = nm.fit_resample(X, Y)

for shape_caption, shape_labels in (
    ('Original dataset shape:', Y),
    ('Resample dataset shape:', Y_nm),
):
    print(shape_caption, Counter(shape_labels))

Original dataset shape: Counter({0: 5174, 1: 1869})
Resample dataset shape: Counter({0: 1869, 1: 1869})


**Random Forest Classifier**

In [19]:
# Refit the existing random forest on the NearMiss under-sample and score it on
# the X_test / y_test hold-out created earlier.
model.fit(X_nm, Y_nm)
preds = model.predict(X_test)

nm_accuracy = accuracy_score(y_test, preds)
nm_recall = recall_score(y_test, preds)
print('Accuracy = {:.2f}\nRecall = {:.2f}\n'.format(nm_accuracy, nm_recall))
cm = confusion_matrix(y_test, preds)

RandomForestClassifier(random_state=42)

Accuracy = 0.75
Recall = 0.87



**XGBoost Classifier**

In [20]:
# Refit the existing XGBoost model on the NearMiss under-sample and print its
# accuracy on the X_test / y_test hold-out.
xgb.fit(X_nm, Y_nm)
y_pred = xgb.predict(X_test)
nm_xgb_accuracy = accuracy_score(y_test, y_pred)
print(nm_xgb_accuracy)







XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, gamma=None, gpu_id=-1,
              importance_type=None, interaction_constraints=None,
              learning_rate=1, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=10, n_jobs=4, num_parallel_tree=None, predictor=None,
              random_state=0, reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
              subsample=None, tree_method=None, validate_parameters=1,
              verbosity=None)

0.7630459992269037
