# Modeling Churn predictions for Telcom customers

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.style as style
import seaborn as sns
import os
style.use('fivethirtyeight')

In [45]:
df = pd.read_csv('data/WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.shape

(7043, 21)

## Data processing

In [46]:
## Total charges
# Replacing spaces with null values in total charges column
df['TotalCharges'] = df["TotalCharges"].replace(" ",np.nan)

# Dropping null values from total charges column which contain missing data 
df = df[df["TotalCharges"].notnull()]
df = df.reset_index()[df.columns]

# convert to float type
df["TotalCharges"] = df["TotalCharges"].astype(float)

## replace 'No internet service' to No for the following columns
replace_cols = [ 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                'TechSupport','StreamingTV', 'StreamingMovies']
for i in replace_cols : 
    df[i]  = df[i].replace({'No internet service' : 'No'})
    
## Senior citizens
df["SeniorCitizen"] = df["SeniorCitizen"].replace({1:"Yes",0:"No"})

## Change Churn to Yes or No. 
churn     = df[df["Churn"] == "Yes"]
not_churn = df[df["Churn"] == "No"]

df.shape

(7032, 21)

## Feature processing and Normalization

Before model formulation lets convert the Catagorical columns into numeric and some other conversations

In [48]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

#customer id col and target
Id_col     = ['customerID']
target_col = ["Churn"]

## Types of feats. # Segregate the feature types into Numeric, Binayr and Catagorical. 

#categorical columns
cat_cols   = df.nunique()[df.nunique() < 6].keys().tolist()
cat_cols   = [x for x in cat_cols if x not in target_col]
#numerical columns
num_cols   = [x for x in df.columns if x not in cat_cols + target_col + Id_col]
#Binary columns
bin_cols   = df.nunique()[df.nunique() == 2].keys().tolist()
#Columns more than 2 values
multi_cols = [i for i in cat_cols if i not in bin_cols]

# Label encoding Binary columns
le = LabelEncoder()
for i in bin_cols :
    df[i] = le.fit_transform(df[i])
    
# Duplicating columns for multi value columns
df = pd.get_dummies(data = df, columns = multi_cols)

# Scaling Numerical columns
std = StandardScaler()
scaled = std.fit_transform(df[num_cols])
scaled = pd.DataFrame(scaled,columns=num_cols)

# Dropping original values merging scaled values for numerical columns
df_telcom_og = df.copy()
df = df.drop(columns = num_cols,axis = 1)
df = df.merge(scaled,left_index=True,right_index=True,how = "left")

In [49]:
df.shape

(7032, 30)

In [50]:
print('All numeric features: \n', num_cols)
print('Cols with Binary fetures: \n', bin_cols)
print('Cols with multiple fetures: \n', multi_cols)

All numeric features: 
 ['tenure', 'MonthlyCharges', 'TotalCharges']
Cols with Binary fetures: 
 ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'Churn']
Cols with multiple fetures: 
 ['MultipleLines', 'InternetService', 'Contract', 'PaymentMethod']


In [51]:
len(num_cols) + len(bin_cols) + len(multi_cols)

20

## Model formulation

In [9]:
# Import all necessary modules 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.metrics import roc_auc_score,roc_curve,scorer
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score,recall_score

In [23]:
cols  = [i for i in df.columns if i not in Id_col + target_col]
len(cols)

28

In [52]:
# splitting train and test data 
train, test = train_test_split(df, test_size = .25 ,random_state = 666)
    
# Set training and traget features
train_X = train[cols]
train_Y = train[target_col]

test_X  = test[cols]
test_Y  = test[target_col]

In [53]:
print(train_X.shape)
print(train_Y.shape)
print(test_X.shape)
print(test_Y.shape)

(5274, 28)
(5274, 1)
(1758, 28)
(1758, 1)


In [55]:
# Lets fit a Logistic model to this. 

# 'liblinear' handles L1 penalty. 
logit = LogisticRegression(intercept_scaling=1, max_iter=100, n_jobs=1, penalty='l1', 
                           random_state=None, solver='liblinear') 

In [56]:
logit.fit(train_X, train_Y)
preds   = logit.predict(test_X)
probs = logit.predict_proba(test_X)

  y = column_or_1d(y, warn=True)


In [57]:
probs

array([[0.64982169, 0.35017831],
       [0.72976819, 0.27023181],
       [0.93572063, 0.06427937],
       ...,
       [0.91378859, 0.08621141],
       [0.48005385, 0.51994615],
       [0.89034471, 0.10965529]])

In [58]:
# print results

print("Classification report : \n",classification_report(test_Y,preds))
print ("Accuracy : ", accuracy_score(test_Y, preds))

#confusion matrix
cm = confusion_matrix(test_Y, preds)

# roc_auc_score
model_roc_auc = roc_auc_score(test_Y, preds) 

print ("Area under curve : ",model_roc_auc,"\n")
fpr,tpr,thresholds = roc_curve(test_Y, probs[:,1])

Classification report : 
               precision    recall  f1-score   support

           0       0.83      0.91      0.87      1303
           1       0.66      0.48      0.56       455

    accuracy                           0.80      1758
   macro avg       0.75      0.70      0.71      1758
weighted avg       0.79      0.80      0.79      1758

Accuracy :  0.800910125142207
Area under curve :  0.6969141372825179 

