In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [32]:
df = pd.read_csv('customers_data.csv')

In [33]:
len(df)

7043

In [34]:
df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [35]:
df.head().T
# `Churn` is the target variable: the column we want
# the model to learn to predict.

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [36]:
df.dtypes
# `TotalCharges` is object (string), but we want it
# to be numeric. Why does pandas think it's object?
# => Missing values are stored as non-numeric strings.
# `pd.to_numeric()` converts the column to a numeric dtype
# (float, because NaN cannot be stored in an int column), and
# `errors='coerce'` turns every unparseable value into `NaN`

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [37]:
# Parse the column as numbers (entries that cannot be parsed become NaN),
# then substitute 0 for every NaN before storing the result back.
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges_numeric.fillna(0)

In [38]:
# Normalise the spelling of both the column names and the text values:
# everything lower-case, spaces replaced by underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Only columns stored as `object` hold text and need the value clean-up.
is_text = df.dtypes == 'object'
string_column = list(df.dtypes[is_text].index)

for name in string_column:
    lowered = df[name].str.lower()
    df[name] = lowered.str.replace(' ', '_')

In [39]:
# we need `churn` to be binary
df.churn = (df.churn == 'yes').astype(int)
# the comparison yields True for 'yes' and False otherwise;
# `astype(int)` then maps True/False to 1/0

In [40]:
# `train_test_split` produces only two parts per call, so it is applied
# twice: first to carve out the test set, then to divide the remainder
# into train and validation. It shuffles the rows before splitting.
full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_valid = train_test_split(full_train, test_size=0.33, random_state=11)

# `pop` removes the target column from the feature frame and hands it
# back, so the labels are extracted and dropped in a single step.
y_train = df_train.pop('churn').values
y_valid = df_valid.pop('churn').values

# Checking for missing values is crucial.
# We've dealt with `totalcharges`, now
# let's look whether other columns
# have similar issue

In [41]:
full_train.isnull().sum()
# no more missing values

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [42]:
full_train.churn.value_counts()

0    4113
1    1521
Name: churn, dtype: int64

In [43]:
1521 / (4113 + 1521)
# prob. that a customer churned

0.26996805111821087

In [44]:
# another way to compute the churn rate is `.mean()`
global_mean = full_train.churn.mean()
# `churn` holds only 0s and 1s, so its sum counts the churned
# customers and the mean is that count divided by the number of
# rows, i.e. the fraction (probability) of customers who churned
round(global_mean, 4)

0.27

In [45]:
categorical = ['gender', 'seniorcitizen', 'partner', 'dependents',
               'phoneservice', 'multiplelines', 'internetservice',
               'onlinesecurity', 'onlinebackup', 'deviceprotection',
               'techsupport', 'streamingtv', 'streamingmovies',
               'contract', 'paperlessbilling', 'paymentmethod']
numerical = ['tenure', 'monthlycharges', 'totalcharges']

In [46]:
full_train[categorical].nunique()
# as we see our categorical variables have few
# unique values

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

# Feature importance analysis

In [47]:
fem_mean = full_train[full_train.gender == 'female'].churn.mean()
round(fem_mean, 4)

0.2768

In [48]:
mal_mean = full_train[full_train.gender == 'male'].churn.mean()
round(mal_mean, 4)

0.2632

In [49]:
# Compare the two groups using the means computed in the cells above
# instead of retyping their rounded values, so the ratios stay correct
# if the data or the split changes.
f"{round(fem_mean / mal_mean, 4)}; {round(mal_mean / fem_mean, 4)}"
# the female group is slightly riskier than the male one

'1.0517; 0.9509'

In [50]:
# Average the numeric columns for customers who have a partner.
# `numeric_only=True` is needed on pandas >= 2.0, where averaging a
# frame that still contains string columns raises TypeError; on older
# versions it gives the same result as the implicit behaviour.
full_train[full_train.partner == 'yes'].mean(numeric_only=True)

seniorcitizen        0.166543
tenure              42.076980
monthlycharges      67.772150
totalcharges      3020.107124
churn                0.205033
dtype: float64

In [51]:
# Churn rate for customers with and without a partner, printed with the
# same precision so the two numbers can be compared directly.
partner_yes = full_train[full_train.partner == 'yes'].churn.mean()
partner_no = full_train[full_train.partner == 'no'].churn.mean()
print(f"Partner: {round(partner_yes, 4)}; No partner: {round(partner_no, 4)}")

Partner: 0.205; No partner: 0.33


In [59]:
global_churn = full_train.churn.mean()
partner_yes / global_churn

0.7594724924338315

In [57]:
partner_no / global_churn

1.2216593879412643

In [55]:
# Overall churn rate, used as the baseline every group is compared to.
global_churn = full_train['churn'].mean()

# Per-gender churn rate (the same numbers as the separate female/male
# cells above), extended with two comparisons against the baseline:
#   diff - group rate minus the global rate
#   risk - group rate divided by the global rate
gender_rates = full_train.groupby(by='gender')['churn'].mean()
df_group = gender_rates.to_frame(name='mean')
df_group = df_group.assign(
    diff=df_group['mean'] - global_churn,
    risk=df_group['mean'] / global_churn,
)

df_group

Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.276824,0.006856,1.025396
male,0.263214,-0.006755,0.97498


In [58]:
from IPython.display import display

# For every categorical feature, tabulate the churn rate of each of its
# values and compare it with the overall rate `global_churn`, so no
# manual comparison is needed:
#   mean - churn rate inside the group
#   diff - group rate minus the global rate
#   risk - group rate divided by the global rate
for feature in categorical:
    df_group = (
        full_train.groupby(by=feature).churn.agg(['mean'])
        .assign(
            diff=lambda rates: rates['mean'] - global_churn,
            risk=lambda rates: rates['mean'] / global_churn,
        )
    )
    display(df_group)

Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.276824,0.006856,1.025396
male,0.263214,-0.006755,0.97498


Unnamed: 0_level_0,mean,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.24227,-0.027698,0.897403
1,0.413377,0.143409,1.531208


Unnamed: 0_level_0,mean,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.329809,0.059841,1.221659
yes,0.205033,-0.064935,0.759472


Unnamed: 0_level_0,mean,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.31376,0.043792,1.162212
yes,0.165666,-0.104302,0.613651


Unnamed: 0_level_0,mean,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.241316,-0.028652,0.89387
yes,0.273049,0.003081,1.011412


Unnamed: 0_level_0,mean,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.257407,-0.012561,0.953474
no_phone_service,0.241316,-0.028652,0.89387
yes,0.290742,0.020773,1.076948


Unnamed: 0_level_0,mean,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dsl,0.192347,-0.077621,0.712482
fiber_optic,0.425171,0.155203,1.574895
no,0.077805,-0.192163,0.288201


Unnamed: 0_level_0,mean,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.420921,0.150953,1.559152
no_internet_service,0.077805,-0.192163,0.288201
yes,0.153226,-0.116742,0.56757


Unnamed: 0_level_0,mean,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.404323,0.134355,1.497672
no_internet_service,0.077805,-0.192163,0.288201
yes,0.217232,-0.052736,0.80466


Unnamed: 0_level_0,mean,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.395875,0.125907,1.466379
no_internet_service,0.077805,-0.192163,0.288201
yes,0.230412,-0.039556,0.85348


Unnamed: 0_level_0,mean,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.418914,0.148946,1.551717
no_internet_service,0.077805,-0.192163,0.288201
yes,0.159926,-0.110042,0.59239


Unnamed: 0_level_0,mean,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.342832,0.072864,1.269897
no_internet_service,0.077805,-0.192163,0.288201
yes,0.302723,0.032755,1.121328


Unnamed: 0_level_0,mean,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.338906,0.068938,1.255358
no_internet_service,0.077805,-0.192163,0.288201
yes,0.307273,0.037305,1.138182


Unnamed: 0_level_0,mean,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
month-to-month,0.431701,0.161733,1.599082
one_year,0.120573,-0.149395,0.446621
two_year,0.028274,-0.241694,0.10473


Unnamed: 0_level_0,mean,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.172071,-0.097897,0.637375
yes,0.338151,0.068183,1.25256


Unnamed: 0_level_0,mean,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bank_transfer_(automatic),0.168171,-0.101797,0.622928
credit_card_(automatic),0.164339,-0.10563,0.608733
electronic_check,0.45589,0.185922,1.688682
mailed_check,0.19387,-0.076098,0.718121


# Let's analyze the case when we
# compare two groups between each other

In [341]:
0.329809/0.205033

1.6085654504396856

In [342]:
0.205033/0.329809

0.6216719373940675

# Read below. Essential

In [345]:
from sklearn.metrics import mutual_info_score


def _churn_mutual_info(series):
    """Return the mutual information between `series` and `full_train.churn`."""
    return mutual_info_score(series, full_train.churn)


# `DataFrame.apply` passes each categorical column to the helper in turn,
# yielding one score per feature. The helper has its own name so that the
# result stored in `calculate_MI` does not overwrite the function that
# produced it.
calculate_MI = full_train[categorical].apply(_churn_mutual_info)
# Highest scores first, as a one-column frame for display.
calculate_MI = calculate_MI.sort_values(ascending=False).to_frame(name='Mut. Info')

In [346]:
calculate_MI.head(5)

Unnamed: 0,Mut. Info
contract,0.09832
onlinesecurity,0.063085
techsupport,0.061032
internetservice,0.055868
onlinebackup,0.046923


In [347]:
calculate_MI.tail(5)

Unnamed: 0,Mut. Info
partner,0.009968
seniorcitizen,0.00941
multiplelines,0.000857
phoneservice,0.000229
gender,0.000117


In [243]:
full_train[numerical].corrwith(full_train.churn).to_frame('corr')
# tenure: the longer customers stay, the less likely they are to churn
# monthlycharges: the more people pay per month, the more likely they are to churn
# totalcharges: the longer people stay, the more they have paid in total,
# hence the less likely they are to leave. That's why the correlation
# here is negative

Unnamed: 0,corr
tenure,-0.351885
monthlycharges,0.196805
totalcharges,-0.196353


In [426]:
full_train.groupby(by='churn')[numerical].agg(['mean'])

Unnamed: 0_level_0,tenure,monthlycharges,totalcharges
Unnamed: 0_level_1,mean,mean,mean
churn,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,37.531972,61.176477,2548.021627
1,18.070348,74.521203,1545.689415


In [422]:
full_train.groupby(by='churn')[numerical].mean()

Unnamed: 0_level_0,tenure,monthlycharges,totalcharges
churn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,37.531972,61.176477,2548.021627
1,18.070348,74.521203,1545.689415


In [348]:
# Split monthly charges into three bands and keep each as a one-column
# frame so `corrwith` can be applied to it afterwards. The series is
# taken from `full_train`, the frame whose `churn` column the bands are
# correlated with in the following cells. The column is already float,
# so no numeric conversion is needed.
temp = full_train['monthlycharges']

# The bands (..20], (20..50], (50..) cover every value exactly once.
# An exclusive `between(20, 50)` would leave a charge of exactly 50 in
# no band, and its `inclusive=False` form is rejected by pandas >= 2.0,
# so the middle band is written as an explicit mask.
lower = temp[temp <= 20].to_frame()
middle = temp[(temp > 20) & (temp <= 50)].to_frame()
upper = temp[temp > 50].to_frame()

In [349]:
lower.corrwith(full_train.churn).to_frame('corr')

Unnamed: 0,corr
monthlycharges,0.030408


In [350]:
middle.corrwith(full_train.churn).to_frame('corr')

Unnamed: 0,corr
monthlycharges,0.229385


In [351]:
upper.corrwith(full_train.churn).to_frame('corr')

Unnamed: 0,corr
monthlycharges,0.053326


# One-hot encoding

In [259]:
train_dict = df_train[categorical + numerical].to_dict(orient='records')

In [260]:
train_dict[0]

{'gender': 'male',
 'seniorcitizen': 0,
 'partner': 'yes',
 'dependents': 'no',
 'phoneservice': 'yes',
 'multiplelines': 'no',
 'internetservice': 'dsl',
 'onlinesecurity': 'yes',
 'onlinebackup': 'yes',
 'deviceprotection': 'yes',
 'techsupport': 'yes',
 'streamingtv': 'yes',
 'streamingmovies': 'yes',
 'contract': 'two_year',
 'paperlessbilling': 'yes',
 'paymentmethod': 'bank_transfer_(automatic)',
 'tenure': 71,
 'monthlycharges': 86.1,
 'totalcharges': 6045.9}

In [261]:
# `DictVectorizer` one-hot encodes the string-valued (categorical)
# entries and passes numeric entries through unchanged
dv = DictVectorizer(sparse=False)
# `sparse=False` makes `transform` return a dense NumPy array
# instead of a SciPy sparse matrix
dv.fit(train_dict)
# `fit()` does not train a predictive model: it scans the dicts,
# records which keys / key=value pairs occur, and learns which
# column of the output matrix each of them maps to

DictVectorizer(sparse=False)

# Example how DictVectorizer will transform the dict into encoding

In [293]:
# after we've done fitting, we can `transform` dict to a matrix
X_train = dv.transform(train_dict)
X_train

array([[0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        7.10000e+01, 6.04590e+03],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        6.00000e+01, 6.02900e+03],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        4.60000e+01, 2.06515e+03],
       ...,
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.00000e+00, 2.83000e+01],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.30000e+01, 4.70600e+02],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        6.40000e+01, 5.32725e+03]])

In [263]:
X_train.shape

(3774, 45)

In [264]:
X_train[0]
# columns derived from string features are one-hot encoded (0 or 1);
# numeric features keep their original values. A 0/1 entry is not
# necessarily one-hot: `seniorcitizen` is stored as a number, so it
# is passed through as-is

array([0.0000e+00, 0.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00,
       1.0000e+00, 0.0000e+00, 0.0000e+00, 8.6100e+01, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 7.1000e+01, 6.0459e+03])

In [265]:
# `get_feature_names()` was removed in scikit-learn 1.2. Prefer its
# replacement `get_feature_names_out()` and fall back to the old method
# only on versions that predate it; either way show plain strings.
names_of = getattr(dv, 'get_feature_names_out', None) or dv.get_feature_names
[str(name) for name in names_of()]
# pay attention to how `contract` is expanded into several columns,
# one per value. `dependents` likewise becomes 2 columns.
# !! Numeric columns stay the same. A clearer example is shown above

['contract=month-to-month',
 'contract=one_year',
 'contract=two_year',
 'dependents=no',
 'dependents=yes',
 'deviceprotection=no',
 'deviceprotection=no_internet_service',
 'deviceprotection=yes',
 'gender=female',
 'gender=male',
 'internetservice=dsl',
 'internetservice=fiber_optic',
 'internetservice=no',
 'monthlycharges',
 'multiplelines=no',
 'multiplelines=no_phone_service',
 'multiplelines=yes',
 'onlinebackup=no',
 'onlinebackup=no_internet_service',
 'onlinebackup=yes',
 'onlinesecurity=no',
 'onlinesecurity=no_internet_service',
 'onlinesecurity=yes',
 'paperlessbilling=no',
 'paperlessbilling=yes',
 'partner=no',
 'partner=yes',
 'paymentmethod=bank_transfer_(automatic)',
 'paymentmethod=credit_card_(automatic)',
 'paymentmethod=electronic_check',
 'paymentmethod=mailed_check',
 'phoneservice=no',
 'phoneservice=yes',
 'seniorcitizen',
 'streamingmovies=no',
 'streamingmovies=no_internet_service',
 'streamingmovies=yes',
 'streamingtv=no',
 'streamingtv=no_internet_servic

# Logistic Regression

In [266]:
import math

def sigmoid(score):
    """Map a real-valued score to a value between 0 and 1 with the logistic function.

    Computes ``1 / (1 + exp(-score))`` in a numerically stable way: the
    exponential is always taken of a non-positive number, so it can only
    underflow to 0.0 and never raises ``OverflowError``.

    Args:
        score: The raw linear score (any real number).

    Returns:
        The logistic function of ``score`` as a float.
    """
    if score >= 0:
        return 1 / (1 + math.exp(-score))
    # For negative scores exp(-score) can exceed the float range, so use
    # the algebraically equivalent form exp(score) / (1 + exp(score)).
    exp_score = math.exp(score)
    return exp_score / (1 + exp_score)

In [267]:
def logistic_regression(xi):
    """Return the predicted probability for one feature vector `xi`.

    Adds ``xi[j] * w[j]`` for ``j`` in ``0 .. n-1`` to ``bias`` and passes
    the resulting score through ``sigmoid``.

    NOTE(review): ``bias``, ``w`` and ``n`` are read from the global scope
    and are not assigned in any of the visible cells - they must be
    defined before this function is called.
    """
    total = bias
    for j in range(n):
        # accumulate the weighted contribution of feature j
        total += w[j] * xi[j]
    return sigmoid(total)

In [268]:
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_train, y_train)

LogisticRegression(random_state=1, solver='liblinear')

In [271]:
# with the model trained, prepare the validation set the same way
# and apply the model to it to observe the results.
# the vectorizer was already fitted on the training dicts, so here
# we only call `transform` (no second `fit`), which reuses the
# column layout learned from the training data
val_dict = df_valid[numerical + categorical].to_dict(orient='records')
X_valid = dv.transform(val_dict)

In [277]:
y_pred = model.predict_proba(X_valid)
y_pred
# first column: prob. that target is negative (no churn)
# second column: prob. that target is positive(churn)

array([[0.76509452, 0.23490548],
       [0.73114964, 0.26885036],
       [0.68055068, 0.31944932],
       ...,
       [0.94275132, 0.05724868],
       [0.3847724 , 0.6152276 ],
       [0.93872722, 0.06127278]])

In [278]:
# as those 2 cols depict the same info, we don't need both
# as `p` - churn hence `1 - p` - no churn
y_pred = y_pred[:, 1]
y_pred

array([0.23490548, 0.26885036, 0.31944932, ..., 0.05724868, 0.6152276 ,
       0.06127278])

In [282]:
churn = y_pred >= 0.5
churn

array([False, False, False, ..., False,  True, False])

In [283]:
y_valid

array([0, 1, 0, ..., 0, 0, 0])

In [281]:
(y_valid == churn).mean()
# NumPy will cast False/True to 0/1 and then
# the comparison will be made.
# Then, mean(), at first, will cast bool to integers
# and, secondly, fraction of `1` out of len(array)
# is taken (we have only 1 or 0).

0.8016129032258065

In [287]:
model.intercept_[0]
# bias of the model

-0.12198811467233629

In [285]:
# rest of the weights, each paired with the feature it belongs to.
# `get_feature_names()` was removed in scikit-learn 1.2, so prefer
# `get_feature_names_out()` and fall back only on older versions.
names_of = getattr(dv, 'get_feature_names_out', None) or dv.get_feature_names
dict(zip([str(name) for name in names_of()], model.coef_[0].round(3)))

{'contract=month-to-month': 0.563,
 'contract=one_year': -0.086,
 'contract=two_year': -0.599,
 'dependents=no': -0.03,
 'dependents=yes': -0.092,
 'deviceprotection=no': 0.1,
 'deviceprotection=no_internet_service': -0.116,
 'deviceprotection=yes': -0.106,
 'gender=female': -0.027,
 'gender=male': -0.095,
 'internetservice=dsl': -0.323,
 'internetservice=fiber_optic': 0.317,
 'internetservice=no': -0.116,
 'monthlycharges': 0.001,
 'multiplelines=no': -0.168,
 'multiplelines=no_phone_service': 0.127,
 'multiplelines=yes': -0.081,
 'onlinebackup=no': 0.136,
 'onlinebackup=no_internet_service': -0.116,
 'onlinebackup=yes': -0.142,
 'onlinesecurity=no': 0.258,
 'onlinesecurity=no_internet_service': -0.116,
 'onlinesecurity=yes': -0.264,
 'paperlessbilling=no': -0.213,
 'paperlessbilling=yes': 0.091,
 'partner=no': -0.048,
 'partner=yes': -0.074,
 'paymentmethod=bank_transfer_(automatic)': -0.027,
 'paymentmethod=credit_card_(automatic)': -0.136,
 'paymentmethod=electronic_check': 0.175,


# Digression to understand how model works

In [431]:
small_subset = ['contract', 'tenure', 'totalcharges']

In [432]:
train_dict_small = df_train[small_subset].to_dict(orient='records')
dv_small = DictVectorizer(sparse=False)
dv_small.fit(train_dict_small)

DictVectorizer(sparse=False)

In [433]:
X_small_train = dv_small.transform(train_dict_small)
X_small_train

array([[0.00000e+00, 0.00000e+00, 1.00000e+00, 7.10000e+01, 6.04590e+03],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, 6.00000e+01, 6.02900e+03],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, 4.60000e+01, 2.06515e+03],
       ...,
       [0.00000e+00, 1.00000e+00, 0.00000e+00, 2.00000e+00, 2.83000e+01],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, 2.30000e+01, 4.70600e+02],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, 6.40000e+01, 5.32725e+03]])

In [434]:
# Column layout learned by the small vectorizer.
# `get_feature_names()` was removed in scikit-learn 1.2, so prefer
# `get_feature_names_out()` and fall back only on older versions.
names_of = getattr(dv_small, 'get_feature_names_out', None) or dv_small.get_feature_names
[str(name) for name in names_of()]

['contract=month-to-month',
 'contract=one_year',
 'contract=two_year',
 'tenure',
 'totalcharges']

In [435]:
model_small = LogisticRegression(solver='liblinear', random_state=1)
model_small.fit(X_small_train, y_train)

LogisticRegression(random_state=1, solver='liblinear')

In [438]:
small_valid = df_valid[small_subset].to_dict(orient='records')
X_small_set = dv_small.transform(small_valid)
y_smallValid = model_small.predict_proba(X_small_set)[:, 1]

In [440]:
small_churn = y_smallValid >= 0.5
(y_valid == small_churn).mean()

0.7672043010752688

In [302]:
model_small.intercept_[0]

-0.5772299133614702

In [303]:
# Weights of the small model, each paired with the feature it belongs to.
# `get_feature_names()` was removed in scikit-learn 1.2, so prefer
# `get_feature_names_out()` and fall back only on older versions.
names_of = getattr(dv_small, 'get_feature_names_out', None) or dv_small.get_feature_names
dict(zip([str(name) for name in names_of()], model_small.coef_[0].round(3)))

{'contract=month-to-month': 0.866,
 'contract=one_year': -0.327,
 'contract=two_year': -1.117,
 'tenure': -0.094,
 'totalcharges': 0.001}

In [401]:
# let's calculate risk ratio for 'contract'
i = 'contract'
df_group = full_train.groupby(by=i).churn.agg(['mean'])
df_group['diff'] = df_group['mean'] - global_churn
df_group['risk'] = df_group['mean'] / global_churn
display(df_group)
# month-to-month is the most risky

Unnamed: 0_level_0,mean,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
month-to-month,0.431701,0.161733,1.599082
one_year,0.120573,-0.149395,0.446621
two_year,0.028274,-0.241694,0.10473


In [314]:
df_train[['tenure']].corrwith(full_train.churn).to_frame('corr')
# tenure has a weight of -0.094. This does NOT mean that every extra
# month lowers the churn probability by 0.094: the weight is added to
# the score (log-odds) that goes through the sigmoid, so each extra
# month lowers the score by 0.094, and the effect on the probability
# depends on where on the sigmoid curve the customer sits.
# The negative correlation agrees with the sign of the weight:
# longer tenure goes with less churn.

Unnamed: 0,corr
tenure,-0.356965


In [315]:
1 / (1 + math.exp(-(-0.893)))

0.2904911209356156

In [316]:
1 / (1 + math.exp(-(-2.823)))
# Prob. of this customer to churn is even lower

0.056093879808269666

# Using the model

In [317]:
customer = {
    'customerid': '8879-zkjof',
    'gender': 'female',
    'seniorcitizen': 0,
    'partner': 'no',
    'dependents': 'no',
    'tenure': 41,
    'phoneservice': 'yes',
    'multiplelines': 'no',
    'internetservice': 'dsl',
    'onlinesecurity': 'yes',
    'onlinebackup': 'no',
    'deviceprotection': 'yes',
    'techsupport': 'yes',
    'streamingtv': 'yes',
    'streamingmovies': 'yes',
    'contract': 'one_year',
    'paperlessbilling': 'yes',
    'paymentmethod': 'bank_transfer_(automatic)',
    'monthlycharges': 79.85,
    'totalcharges': 3320.75,
}

In [320]:
X_test = dv.transform([customer])

In [324]:
model.predict_proba(X_test)
# first column: prob. that target is negative (no churn)
# second column: prob. that target is positive(churn)

array([[0.92667423, 0.07332577]])

In [325]:
model.predict_proba(X_test)[0, 1]
# prob. of customer to churn is less than 50%
# => we won't send this customer a promotional letter

0.07332577315357781

In [326]:
customer = {
    'gender': 'female',
    'seniorcitizen': 1,
    'partner': 'no',
    'dependents': 'no',
    'phoneservice': 'yes',
    'multiplelines': 'yes',
    'internetservice': 'fiber_optic',
    'onlinesecurity': 'no',
    'onlinebackup': 'no',
    'deviceprotection': 'no',
    'techsupport': 'no',
    'streamingtv': 'yes',
    'streamingmovies': 'no',
    'contract': 'month-to-month',
    'paperlessbilling': 'yes',
    'paymentmethod': 'electronic_check',
    'tenure': 1,
    'monthlycharges': 85.7,
    'totalcharges': 85.7
}

In [327]:
X_test = dv.transform([customer])

In [328]:
model.predict_proba(X_test)[0, 1]
# prob. of customer to churn is more than 50%
# => we will send this customer a promotional letter

0.8321638622459152