In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Telco Churn data

Using Logistic Regression to learn about churn

In [2]:
df = pd.read_csv("data/WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [3]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [5]:
df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [6]:
# check if there are only Nos and Yesses
df["Partner"].value_counts()

No     3641
Yes    3402
Name: Partner, dtype: int64

Convert booleans that are strings to int representation

In [7]:
# check if there are only Nos and Yesses
df["Partner"].value_counts()

No     3641
Yes    3402
Name: Partner, dtype: int64

In [8]:
df["Partner"] = np.where(df["Partner"].eq("Yes"), 1, 0)

Convert remaining columns

In [9]:
df["PaperlessBilling"].value_counts()

Yes    4171
No     2872
Name: PaperlessBilling, dtype: int64

In [10]:
to_convert = ["Dependents",
              "PhoneService",
              "OnlineSecurity",
              "OnlineBackup",
              "DeviceProtection",
              "TechSupport",
              "StreamingTV",
              "StreamingMovies",
              "PaperlessBilling",
              "Churn"]

df[to_convert] = np.where(df[to_convert].eq("Yes"), 1, 0)

Convert TotalCharges to float

In [11]:
df["TotalCharges"].value_counts()

20.2       11
           11
19.75       9
19.9        8
20.05       8
           ..
6056.15     1
6741.15     1
5784.3      1
2043.45     1
669.85      1
Name: TotalCharges, Length: 6531, dtype: int64

Seems like the first entry is some empty string or whitespace

In [12]:
df["TotalCharges"].value_counts().index[0]

'20.2'

It's a whitespace

In [13]:
df["TotalCharges"] = df["TotalCharges"].replace(" ",0.0).astype(float)

In [14]:
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure                int64
PhoneService          int64
MultipleLines        object
InternetService      object
OnlineSecurity        int64
OnlineBackup          int64
DeviceProtection      int64
TechSupport           int64
StreamingTV           int64
StreamingMovies       int64
Contract             object
PaperlessBilling      int64
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                 int64
dtype: object

Convert categorical variables

In [15]:
df["Contract"].value_counts()

Month-to-month    3875
Two year          1695
One year          1473
Name: Contract, dtype: int64

In [16]:
df["Contract"].head(10)

0    Month-to-month
1          One year
2    Month-to-month
3          One year
4    Month-to-month
5    Month-to-month
6    Month-to-month
7    Month-to-month
8    Month-to-month
9          One year
Name: Contract, dtype: object

In [17]:
pd.get_dummies(df["Contract"].head(10), prefix="Contract", drop_first=True)

Unnamed: 0,Contract_One year
0,0
1,1
2,0
3,1
4,0
5,0
6,0
7,0
8,0
9,1


In [18]:
df = pd.concat([df, pd.get_dummies(df["Contract"],
                                   prefix="Contract",
                                   drop_first=True)],
               axis=1)

In [19]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,Contract_One year,Contract_Two year
0,7590-VHVEG,Female,0,1,0,1,0,No phone service,DSL,0,...,0,0,Month-to-month,1,Electronic check,29.85,29.85,0,0,0
1,5575-GNVDE,Male,0,0,0,34,1,No,DSL,1,...,0,0,One year,0,Mailed check,56.95,1889.5,0,1,0
2,3668-QPYBK,Male,0,0,0,2,1,No,DSL,1,...,0,0,Month-to-month,1,Mailed check,53.85,108.15,1,0,0
3,7795-CFOCW,Male,0,0,0,45,0,No phone service,DSL,1,...,0,0,One year,0,Bank transfer (automatic),42.3,1840.75,0,1,0
4,9237-HQITU,Female,0,0,0,2,1,No,Fiber optic,0,...,0,0,Month-to-month,1,Electronic check,70.7,151.65,1,0,0


Create some other features

In [20]:
# count number of servies
services = ["PhoneService", "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies"]
df = df.assign(no_of_services=df[services].sum(axis=1))

In [21]:
# check out PaymentMethod
df["PaymentMethod"].value_counts()

Electronic check             2365
Mailed check                 1612
Bank transfer (automatic)    1544
Credit card (automatic)      1522
Name: PaymentMethod, dtype: int64

In [22]:
# group automatic and manual payment methods and create one dummy
df = df.assign(AutomaticPayment=np.where(df["PaymentMethod"].str.contains("automatic"), 1, 0))

Run Logistic Regression

In [23]:
df.dtypes

customerID            object
gender                object
SeniorCitizen          int64
Partner                int64
Dependents             int64
tenure                 int64
PhoneService           int64
MultipleLines         object
InternetService       object
OnlineSecurity         int64
OnlineBackup           int64
DeviceProtection       int64
TechSupport            int64
StreamingTV            int64
StreamingMovies        int64
Contract              object
PaperlessBilling       int64
PaymentMethod         object
MonthlyCharges       float64
TotalCharges         float64
Churn                  int64
Contract_One year      uint8
Contract_Two year      uint8
no_of_services         int64
AutomaticPayment       int64
dtype: object

In [24]:
variables = ["no_of_services",
             "AutomaticPayment",
             "tenure",
             "MonthlyCharges",
             "SeniorCitizen",
             "OnlineBackup",
             "Contract_One year",
             "Contract_Two year"]
dep_variable = "Churn"

In [25]:
df = sm.add_constant(df)

  return ptp(axis=axis, out=out, **kwargs)


In [26]:
X = df[["const"] + variables]
y = df[dep_variable]

log_reg = sm.Logit(y, X).fit()

Optimization terminated successfully.
         Current function value: 0.427586
         Iterations 8


In [27]:
log_reg.summary()

0,1,2,3
Dep. Variable:,Churn,No. Observations:,7043.0
Model:,Logit,Df Residuals:,7034.0
Method:,MLE,Df Model:,8.0
Date:,"Mon, 10 Feb 2020",Pseudo R-squ.:,0.261
Time:,14:52:54,Log-Likelihood:,-3011.5
converged:,True,LL-Null:,-4075.1
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.6319,0.092,-17.711,0.000,-1.812,-1.451
no_of_services,-0.2739,0.036,-7.565,0.000,-0.345,-0.203
AutomaticPayment,-0.3070,0.071,-4.312,0.000,-0.447,-0.167
tenure,-0.0307,0.002,-14.152,0.000,-0.035,-0.026
MonthlyCharges,0.0378,0.002,19.937,0.000,0.034,0.042
SeniorCitizen,0.3743,0.082,4.586,0.000,0.214,0.534
OnlineBackup,0.0024,0.082,0.029,0.977,-0.159,0.163
Contract_One year,-0.7851,0.104,-7.516,0.000,-0.990,-0.580
Contract_Two year,-1.5868,0.171,-9.263,0.000,-1.923,-1.251


In [28]:
log_reg.predict(df[["const"] + variables])

0       0.308982
1       0.106349
2       0.383321
3       0.034635
4       0.669813
          ...   
7038    0.169443
7039    0.083284
7040    0.245294
7041    0.761372
7042    0.039184
Length: 7043, dtype: float64

In [29]:
df

Unnamed: 0,const,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,...,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,Contract_One year,Contract_Two year,no_of_services,AutomaticPayment
0,1.0,7590-VHVEG,Female,0,1,0,1,0,No phone service,DSL,...,Month-to-month,1,Electronic check,29.85,29.85,0,0,0,1,0
1,1.0,5575-GNVDE,Male,0,0,0,34,1,No,DSL,...,One year,0,Mailed check,56.95,1889.50,0,1,0,3,0
2,1.0,3668-QPYBK,Male,0,0,0,2,1,No,DSL,...,Month-to-month,1,Mailed check,53.85,108.15,1,0,0,3,0
3,1.0,7795-CFOCW,Male,0,0,0,45,0,No phone service,DSL,...,One year,0,Bank transfer (automatic),42.30,1840.75,0,1,0,3,1
4,1.0,9237-HQITU,Female,0,0,0,2,1,No,Fiber optic,...,Month-to-month,1,Electronic check,70.70,151.65,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1.0,6840-RESVB,Male,0,1,1,24,1,Yes,DSL,...,One year,1,Mailed check,84.80,1990.50,0,1,0,6,0
7039,1.0,2234-XADUH,Female,0,1,1,72,1,Yes,Fiber optic,...,One year,1,Credit card (automatic),103.20,7362.90,0,1,0,5,1
7040,1.0,4801-JZAZL,Female,0,1,1,11,0,No phone service,DSL,...,Month-to-month,1,Electronic check,29.60,346.45,0,0,0,1,0
7041,1.0,8361-LTMKD,Male,1,1,0,4,1,Yes,Fiber optic,...,Month-to-month,1,Mailed check,74.40,306.60,1,0,0,1,0
