In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('Telco-Customer-Churn.csv')

In [3]:
len(df)

7043

In [4]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [6]:
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

TotalCharges isn't correctly identified as a numeric type.
we can force thi scolumn to be numeric by converting it to numbers using a special function in pandas: to_numeric. By default, thi sfunction raises an exception when it sees nonnumeric data (such as spaces), but we can make it skip these cases by specifying the errors='coerce' option. Thi sway pandas will replace all nonnumeric values iwht NaN

then we will set the missing values to zero

In [7]:
total_charges = pd.to_numeric(df.TotalCharges, errors='coerce')
df[total_charges.isnull()][['customerID', 'TotalCharges']]
df.TotalCharges = df.TotalCharges.fillna(0)

The column names don't follow ther same naming convention. Some of them start with a lower letter, whereas others start with a capital letter, and there are also spaces in the vlaues. 

we will make it uniform by lowercasing everything and replacing spaces with underscores. 

In [8]:
df.columns = df.columns.str.lower().str.replace(' ', '_')
string_columns = list(df.dtypes[df.dtypes == 'object'].index)

for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

Target variable is churn.

its categorical with two values 'yes' and 'no'. we need to convert these values for binary classification.

In [9]:
df.churn = (df.churn == 'yes').astype(int)

In [10]:
from sklearn.model_selection import train_test_split

the function train_test_split takes a dataframe df and creates two new dataframes: df_train_full and df_test

In [11]:
df_train_full, df_test = train_test_split(df,test_size=0.2, 
                                          random_state=1)

In [12]:
df_train_full.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
1814,5442-pptjy,male,0,yes,yes,12,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.7,258.35,0
5946,6261-rcvns,female,0,no,no,42,yes,no,dsl,yes,...,yes,yes,no,yes,one_year,no,credit_card_(automatic),73.9,3160.55,1
3881,2176-osjuv,male,0,yes,no,71,yes,yes,dsl,yes,...,no,yes,no,no,two_year,no,bank_transfer_(automatic),65.15,4681.75,0
2389,6161-erdgd,male,0,yes,yes,71,yes,yes,dsl,yes,...,yes,yes,yes,yes,one_year,no,electronic_check,85.45,6300.85,0
3676,2364-ufrom,male,0,no,no,30,yes,no,dsl,yes,...,no,yes,yes,no,one_year,no,electronic_check,70.4,2044.75,0


we want to split the data into three parts:
* train
* validation
* test

since train_test_split splits the data into only two parts: train and test, we will take the df_train_full dataframe and split it one more time into train and validation.

In [13]:
df_train, df_val = train_test_split(df_train_full, test_size=0.33, 
                                    random_state=11)

y_train = df_train.churn.values
y_val = df_val.churn.values

del df_train['churn']
del df_val['churn']

The dataframes are now prepared and we can use the training dataset for performing initial exploritory data analysis (EDA).

In [14]:
df_train_full.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

we get all zeros so we know we have no missing values

we also need to check the distribution of values in the target variable. We can do that with the value_counts() method.

In [15]:
df_train_full.churn.value_counts()

0    4113
1    1521
Name: churn, dtype: int64

The first column is the value of the target variable, and the second is the count. we can see the majority of the customers didn't churn. 

In [16]:
1521/5634

0.26996805111821087

We can also calculate the churn rate thats more convenient  with the mean() method.

In [17]:
global_mean = df_train_full.churn.mean()
global_mean

0.26996805111821087

In [18]:
categorical = ['gender', 'seniorcitizen', 'partner', 'dependents', 'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod']
numerical = ['tenure', 'monthlycharges', 'totalcharges']

In [19]:
df_train_full[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

In [20]:
female_mean = df_train_full[df_train_full.gender == 'female'].churn.mean()

In [21]:
male_mean = df_train_full[df_train_full.gender == 'male'].churn.mean()

In [22]:
female_mean


0.27682403433476394

In [23]:
male_mean

0.2632135306553911

In [24]:
female_mean - male_mean

0.013610503679372832

In [25]:
partner_yes = df_train_full[df_train_full.partner == 'yes'].churn.mean()
print('partner == yes:', round(partner_yes, 3))

partner_no = df_train_full[df_train_full.partner == 'no'].churn.mean()
print('partner == no:', round(partner_no, 3))

partner == yes: 0.205
partner == no: 0.33


risk ratio

risk = group rate / global rate

ex. for gender == female
risk = 27.2% / 27% = 1.02

To check all the values a variable has and compute the churn rate for each of these valeus we will use a rough pandas translation of the below sql

SELECT
    gender, AVG(churn),
    AVG(churn) - global_churn, 
    AVG(churn) / global_churn
FROM 
    data
GROUP BY
    gender

In [26]:
global_mean = df_train_full.churn.mean()

df_group = df_train_full.groupby(by='gender').churn.agg(['mean'])
df_group['diff'] = df_group['mean']- global_mean
df_group['risk'] = df_group['mean'] / global_mean

df_group

Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.276824,0.006856,1.025396
male,0.263214,-0.006755,0.97498


now do the same thing for categorical variables. 

In [27]:
from IPython.display import display

for col in categorical:
    df_group = df_train_full.groupby(by=col).churn.agg(['mean'])
    df_group['diff'] = df_group['mean'] - global_mean
    df_group['rate'] = df_group['mean'] / global_mean
    display(df_group)

Unnamed: 0_level_0,mean,diff,rate
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.276824,0.006856,1.025396
male,0.263214,-0.006755,0.97498


Unnamed: 0_level_0,mean,diff,rate
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.24227,-0.027698,0.897403
1,0.413377,0.143409,1.531208


Unnamed: 0_level_0,mean,diff,rate
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.329809,0.059841,1.221659
yes,0.205033,-0.064935,0.759472


Unnamed: 0_level_0,mean,diff,rate
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.31376,0.043792,1.162212
yes,0.165666,-0.104302,0.613651


Unnamed: 0_level_0,mean,diff,rate
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.241316,-0.028652,0.89387
yes,0.273049,0.003081,1.011412


Unnamed: 0_level_0,mean,diff,rate
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.257407,-0.012561,0.953474
no_phone_service,0.241316,-0.028652,0.89387
yes,0.290742,0.020773,1.076948


Unnamed: 0_level_0,mean,diff,rate
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dsl,0.192347,-0.077621,0.712482
fiber_optic,0.425171,0.155203,1.574895
no,0.077805,-0.192163,0.288201


Unnamed: 0_level_0,mean,diff,rate
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.420921,0.150953,1.559152
no_internet_service,0.077805,-0.192163,0.288201
yes,0.153226,-0.116742,0.56757


Unnamed: 0_level_0,mean,diff,rate
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.404323,0.134355,1.497672
no_internet_service,0.077805,-0.192163,0.288201
yes,0.217232,-0.052736,0.80466


Unnamed: 0_level_0,mean,diff,rate
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.395875,0.125907,1.466379
no_internet_service,0.077805,-0.192163,0.288201
yes,0.230412,-0.039556,0.85348


Unnamed: 0_level_0,mean,diff,rate
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.418914,0.148946,1.551717
no_internet_service,0.077805,-0.192163,0.288201
yes,0.159926,-0.110042,0.59239


Unnamed: 0_level_0,mean,diff,rate
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.342832,0.072864,1.269897
no_internet_service,0.077805,-0.192163,0.288201
yes,0.302723,0.032755,1.121328


Unnamed: 0_level_0,mean,diff,rate
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.338906,0.068938,1.255358
no_internet_service,0.077805,-0.192163,0.288201
yes,0.307273,0.037305,1.138182


Unnamed: 0_level_0,mean,diff,rate
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
month-to-month,0.431701,0.161733,1.599082
one_year,0.120573,-0.149395,0.446621
two_year,0.028274,-0.241694,0.10473


Unnamed: 0_level_0,mean,diff,rate
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.172071,-0.097897,0.637375
yes,0.338151,0.068183,1.25256


Unnamed: 0_level_0,mean,diff,rate
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bank_transfer_(automatic),0.168171,-0.101797,0.622928
credit_card_(automatic),0.164339,-0.10563,0.608733
electronic_check,0.45589,0.185922,1.688682
mailed_check,0.19387,-0.076098,0.718121


In [28]:
from sklearn.metrics import mutual_info_score

def calculate_mi(series):
    return mutual_info_score(series, df_train_full.churn)

df_mi = df_train_full[categorical].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')
df_mi

Unnamed: 0,MI
contract,0.09832
onlinesecurity,0.063085
techsupport,0.061032
internetservice,0.055868
onlinebackup,0.046923
deviceprotection,0.043453
paymentmethod,0.04321
streamingtv,0.031853
streamingmovies,0.031581
paperlessbilling,0.017589


In [29]:
df_train_full[numerical].corrwith(df_train_full.churn)

tenure           -0.351885
monthlycharges    0.196805
dtype: float64

In [30]:
train_dict = df_train[categorical + numerical].to_dict(orient='records')

In [31]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

DictVectorizer(sparse=False)

In [32]:
X_train = dv.transform(train_dict)

In [33]:
X_train[0]

array([0., 0., 1., ..., 0., 0., 0.])

In [34]:
dv.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', ..., 'totalcharges=999.45',
       'totalcharges=999.9', 'totalcharges=_'], dtype=object)

In [35]:
def logistic_regression(xi):
    score = bias
    for j in range(n):
        score = score +xi[j] * w[j]
    prob = sigmoid(score)

In [36]:
import math
def sigmoid(score):
    return 1 / (1 + math.exp(-score))

In [37]:
from sklearn.linear_model import LogisticRegression

In [38]:
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_train, y_train)

LogisticRegression(random_state=1, solver='liblinear')

In [39]:
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [40]:
y_pred = model.predict_proba(X_val)

In [41]:
y_pred = model.predict_proba(X_val)[:, 1]

In [42]:
churn = y_pred >= 0.5

In [43]:
(y_val == churn).mean()

0.8064516129032258

To see which feature is associated with each weight, use the get_feature_names method of the DictVectorizer. Then zip the feature names together with the coefficients before looking at them

In [44]:
dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))

{'contract=month-to-month': 0.59,
 'contract=one_year': 0.006,
 'contract=two_year': -0.735,
 'dependents=no': -0.021,
 'dependents=yes': -0.119,
 'deviceprotection=no': 0.077,
 'deviceprotection=no_internet_service': -0.18,
 'deviceprotection=yes': -0.037,
 'gender=female': -0.034,
 'gender=male': -0.105,
 'internetservice=dsl': -0.43,
 'internetservice=fiber_optic': 0.471,
 'internetservice=no': -0.18,
 'monthlycharges': -0.001,
 'multiplelines=no': -0.191,
 'multiplelines=no_phone_service': 0.018,
 'multiplelines=yes': 0.034,
 'onlinebackup=no': 0.103,
 'onlinebackup=no_internet_service': -0.18,
 'onlinebackup=yes': -0.062,
 'onlinesecurity=no': 0.24,
 'onlinesecurity=no_internet_service': -0.18,
 'onlinesecurity=yes': -0.199,
 'paperlessbilling=no': -0.227,
 'paperlessbilling=yes': 0.087,
 'partner=no': -0.055,
 'partner=yes': -0.084,
 'paymentmethod=bank_transfer_(automatic)': -0.046,
 'paymentmethod=credit_card_(automatic)': -0.148,
 'paymentmethod=electronic_check': 0.19,
 'paym

Tp understand how the model works and to build the intution. let's train a simpler and smaller model that uses only three variables: congtract, tenure, and totalcharges. 

we will redo the same steps we did for training, this time using a smaller set of features. (no additional preprocessing for tenure and totalcharges since they are numneric. contract is categorical so we need to apply one-hot encoding)

In [45]:
# small_subset = ['contract', 'tenure', 'totalcharges']
# train_dict_small = df_train[small_subset].to_dict(orient='records')
# dv_small = DictVectorizer(sparse=False)
# dv_small.fit(train_dict_small)

# X_small_train = dv_small.transform(train_dict_small)

In [46]:
# dv_small.get_feature_names_out()

In [47]:
# model_small = LogisticRegression(solver='liblinear', random_state=1)
# model_small.fit(X_small_train, y_train)

In [48]:
# model_small.intercept_[0]

In [49]:
# dict(zip(dv_small.get_feature_names_out(), model_small.coef_[0].round(3)))

In [52]:
customer ={
    'customerid': '88979-zkjof', 
    'gender': 'female', 
    'seniorcitizen': 0,
    'partner': 'no',
    'dependents': 'no',
    'tenure': 41,
    'phoneservice': 'yes',
    'multiplelines': 'no',
    'internetservice': 'dsl',
    'onlinesecurity': 'yes',
    'onlinebackup': 'no'
}

In [56]:
X_test = dv.transform([customer])

In [57]:
model.predict_proba(X_test)

array([[0.92503734, 0.07496266]])

In [58]:
model.predict_proba(X_test)[0, 1]

0.07496265972081616