In [1]:
import pandas as pd 
import numpy as np
from matplotlib import pyplot as plt


# Project: Churn Prediction

## Dataset: 


In [2]:
df = pd.read_csv('churn_data.csv')
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


# Data Preparation

* Look at the data
* Make column names and values look uniform
* Check if all the columns read correctly
* Check if the churn variable needs any preparation

In [3]:
df 

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [4]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [5]:
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for column in categorical_columns:
    df[column] = df[column].str.lower().str.replace(' ', '_')

In [6]:
df.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges         object
churn                object
dtype: object

In [7]:
# see the vlaues
tc = pd.to_numeric(df.totalcharges, errors = 'coerce')
# convert the values
df.totalcharges = pd.to_numeric(df.totalcharges, errors = 'coerce')

In [8]:
# replace the values
df.totalcharges = df.totalcharges.fillna(0)

In [9]:
df[tc.isnull()]['totalcharges']

488     0.0
753     0.0
936     0.0
1082    0.0
1340    0.0
3331    0.0
3826    0.0
4380    0.0
5218    0.0
6670    0.0
6754    0.0
Name: totalcharges, dtype: float64

In [10]:
# check the churn feature:
df.churn

# convert yes, no into 1, 0 
df.churn = (df.churn == 'yes').astype('int')

# Setting up the validation frameWork

* Perform the train/validation/test split with Scikit-Learn

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
#train_test_split?

In [13]:
df_full_train, df_test = train_test_split(df, test_size = 0.2, shuffle = True )
df_train, df_val = train_test_split(df_full_train, test_size = 0.25, shuffle = True )
df_full_train

len(df_full_train), len(df_val), len(df_test)

(5634, 1409, 1409)

In [14]:
df_train = df_train.reset_index(drop = True)
df_val = df_val.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)

In [15]:
y_train = df_train.churn.values
y_val = df_val.churn.values
y_test = df_test.churn.values


In [16]:
del df_train['churn']
del df_val['churn']
del df_test['churn']

# Exploratory Data Analysis

* Check missing values
* Look at the target variable (churn)
* Look at numerical and categorical variables

In [17]:
# check any missing value in the df.
df_full_train = df_full_train.reset_index(drop = True)

df_full_train.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [18]:
# look at the target variable(churn)
df_full_train.churn.value_counts()

churn
0    4119
1    1515
Name: count, dtype: int64

In [19]:
# print the percentage of customer churn
df_full_train.churn.value_counts(normalize = True)

churn
0    0.731097
1    0.268903
Name: proportion, dtype: float64

In [20]:
# churn Rate(in training data)
global_churn_rate = df_full_train.churn.mean()
global_churn_rate

0.2689030883919063

In [21]:
# churn rate(complete data)
df.churn.mean()

0.2653698707936959

In [22]:
df_full_train.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [23]:
numerical = ['tenure', 'monthlycharges','totalcharges']

categorical = [ 'gender', 'seniorcitizen', 'partner', 'dependents', 
        'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod']

In [24]:
df_full_train[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

# Feature importance: Churn rate and risk ratio

Feature importance analysis (part of EDA) - identifying which features affect our target variable

* Churn rate
* Risk ratio
* Mutual information - later

### Churn Rate

In [25]:
df_full_train.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,9565-axsmr,male,0,yes,yes,52,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.2,1054.75,0
1,1334-fjsvr,male,0,no,no,1,no,no_phone_service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic_check,24.25,24.25,1
2,3714-jtvov,female,1,yes,no,42,yes,no,fiber_optic,yes,...,no,no,no,no,month-to-month,yes,credit_card_(automatic),74.15,3229.4,1
3,3162-zjzfu,male,0,yes,yes,53,yes,no,fiber_optic,no,...,no,yes,yes,no,month-to-month,yes,electronic_check,92.55,4779.45,0
4,8294-uimba,female,0,no,no,30,yes,no,fiber_optic,no,...,no,yes,yes,yes,one_year,yes,bank_transfer_(automatic),94.4,2638.1,0


In [26]:
churn_female = df_full_train[df_full_train.gender == 'female'].churn.mean()
churn_female

0.2685755708590069

In [25]:
churn_male = df_full_train[df_full_train.gender == 'male'].churn.mean()
churn_male

0.2675656493967353

In [26]:
global_churn_rate

0.2689030883919063

In [27]:
churn_seniorcitizen_1 = df_full_train[df_full_train.seniorcitizen == 1].churn.mean()
churn_seniorcitizen_1

0.418554476806904

In [28]:
churn_seniorcitizen_0 = df_full_train[df_full_train.seniorcitizen == 0].churn.mean()
churn_seniorcitizen_0

0.23943063522413427

In [29]:
print("difference if not seniorcitizen:",global_churn_rate - churn_seniorcitizen_0)
print("difference if seniorcitizen:",global_churn_rate - churn_seniorcitizen_1)

difference if not seniorcitizen: 0.029472453167772028
difference if seniorcitizen: -0.1496513884149977


In [30]:
churn_no_partner = df_full_train[df_full_train.partner == 'no'].churn.mean()
churn_no_partner

0.3325350667122819

In [31]:
churn_partner = df_full_train[df_full_train.partner == 'yes'].churn.mean()
churn_partner

0.20029509406123203

In [32]:
print("difference if no partener:",global_churn_rate - churn_no_partner)
print("difference if partener:",global_churn_rate - churn_partner)

difference if no partener: -0.06363197832037559
difference if partener: 0.06860799433067427


### Risk Ratio

In [33]:
print(churn_female/global_churn_rate)
print(churn_male/global_churn_rate)

NameError: name 'churn_female' is not defined

In [None]:
print(churn_male/global_churn_rate)

In [None]:
print(churn_seniorcitizen_1/global_churn_rate)
print(churn_seniorcitizen_0/global_churn_rate)

In [None]:
print(churn_no_partner/global_churn_rate)
print(churn_partner/global_churn_rate)

In [34]:
# use can use Ipython for advance analysis:

In [35]:
from IPython.display import display
# impliment above SQL query into the python.

df_group = df_full_train.groupby('gender').churn.agg(['mean', 'count'])
df_group['diff'] = df_group['mean'] - global_churn_rate
df_group['risk'] = df_group['mean']/global_churn_rate
display(df_group)
print()
print()

Unnamed: 0_level_0,mean,count,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.270241,2816,0.001338,1.004977
male,0.267566,2818,-0.001337,0.995026






In [36]:
# use loop over categorical list

for c in categorical:
    print(c)
    
    df_group = df_full_train.groupby(c).churn.agg(['mean', 'count'])
    df_group['diff'] = df_group['mean'] - global_churn_rate
    df_group['risk'] = df_group['mean']/global_churn_rate
    display(df_group)
    print()
    print()
    

gender


Unnamed: 0_level_0,mean,count,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.270241,2816,0.001338,1.004977
male,0.267566,2818,-0.001337,0.995026




seniorcitizen


Unnamed: 0_level_0,mean,count,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.239431,4707,-0.029472,0.890397
1,0.418554,927,0.149651,1.556525




partner


Unnamed: 0_level_0,mean,count,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.332535,2923,0.063632,1.236635
yes,0.200295,2711,-0.068608,0.74486




dependents


Unnamed: 0_level_0,mean,count,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.313459,3975,0.044556,1.165695
yes,0.162146,1659,-0.106757,0.60299




phoneservice


Unnamed: 0_level_0,mean,count,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.278899,545,0.009996,1.037173
yes,0.267833,5089,-0.001071,0.996019




multiplelines


Unnamed: 0_level_0,mean,count,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.247703,2721,-0.0212,0.921161
no_phone_service,0.278899,545,0.009996,1.037173
yes,0.290963,2368,0.02206,1.082036




internetservice


Unnamed: 0_level_0,mean,count,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dsl,0.194516,1933,-0.074387,0.72337
fiber_optic,0.425343,2478,0.15644,1.581771
no,0.069501,1223,-0.199402,0.258462




onlinesecurity


Unnamed: 0_level_0,mean,count,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.42245,2824,0.153547,1.571014
no_internet_service,0.069501,1223,-0.199402,0.258462
yes,0.149338,1587,-0.119565,0.555361




onlinebackup


Unnamed: 0_level_0,mean,count,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.402424,2475,0.133521,1.49654
no_internet_service,0.069501,1223,-0.199402,0.258462
yes,0.224174,1936,-0.04473,0.833659




deviceprotection


Unnamed: 0_level_0,mean,count,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.394411,2505,0.125508,1.466741
no_internet_service,0.069501,1223,-0.199402,0.258462
yes,0.231899,1906,-0.037004,0.86239




techsupport


Unnamed: 0_level_0,mean,count,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.421614,2813,0.152711,1.567903
no_internet_service,0.069501,1223,-0.199402,0.258462
yes,0.152691,1598,-0.116212,0.567829




streamingtv


Unnamed: 0_level_0,mean,count,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.340909,2288,0.072006,1.267777
no_internet_service,0.069501,1223,-0.199402,0.258462
yes,0.306171,2123,0.037267,1.138591




streamingmovies


Unnamed: 0_level_0,mean,count,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.341376,2238,0.072473,1.269514
no_internet_service,0.069501,1223,-0.199402,0.258462
yes,0.306489,2173,0.037586,1.139774




contract


Unnamed: 0_level_0,mean,count,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
month-to-month,0.429763,3118,0.16086,1.598207
one_year,0.116795,1173,-0.152109,0.434337
two_year,0.028295,1343,-0.240608,0.105223




paperlessbilling


Unnamed: 0_level_0,mean,count,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.158862,2285,-0.110041,0.590778
yes,0.343983,3349,0.07508,1.279209




paymentmethod


Unnamed: 0_level_0,mean,count,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bank_transfer_(automatic),0.165979,1211,-0.102925,0.617243
credit_card_(automatic),0.155664,1227,-0.113239,0.578886
electronic_check,0.460251,1912,0.191348,1.711587
mailed_check,0.189252,1284,-0.079651,0.703794






# Feature Importance : Mutual Information(MI)

Mutual information - concept from information theory, it tells us how much we can learn about one variable if we know the value of another

* https://en.wikipedia.org/wiki/Mutual_information

In [37]:
from sklearn.metrics import mutual_info_score

In [38]:
df_full_train.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [39]:
mutual_info_score(df_full_train.churn, df_full_train.contract)

0.09828989756327552

In [40]:
mutual_info_score(df_full_train.churn, df_full_train.monthlycharges)



0.19653572808484765

In [41]:
mutual_info_score(df_full_train.churn, df_full_train.seniorcitizen)

0.010451238963208626

In [42]:
mutual_info_score(df_full_train.churn, df_full_train.tenure)

0.07996998434297578

In [43]:
def mutual_information_churn_score(series):
    return mutual_info_score(series, df_full_train.churn)
    
mutual_info = df_full_train[categorical].apply(mutual_information_churn_score)
mutual_info.sort_values(ascending = False).round(6)

contract            0.098290
onlinesecurity      0.067259
techsupport         0.066255
internetservice     0.058396
onlinebackup        0.048450
paymentmethod       0.046663
deviceprotection    0.045976
streamingmovies     0.034698
streamingtv         0.034693
paperlessbilling    0.022015
dependents          0.012924
partner             0.011237
seniorcitizen       0.010451
multiplelines       0.001098
phoneservice        0.000027
gender              0.000005
dtype: float64

In [44]:
type(df_full_train[categorical])

pandas.core.frame.DataFrame

# Feature importance: Correlation

Correlation coefficient - https://en.wikipedia.org/wiki/Pearson_correlation_coefficient

In [45]:
df_full_train[numerical].corrwith(df_full_train.churn)

tenure           -0.352030
monthlycharges    0.198418
totalcharges     -0.198914
dtype: float64

In [46]:
df_full_train[df_full_train.tenure <= 2].churn.mean()

0.5879828326180258

In [47]:
df_full_train[(df_full_train.tenure >= 2) & (df_full_train.tenure <= 5)].churn.mean()

0.4794069192751236

In [48]:
df_full_train[(df_full_train.tenure >5 ) & (df_full_train.tenure <= 12)].churn.mean()

0.3698630136986301

In [49]:
df_full_train[(df_full_train.tenure > 12) & (df_full_train.tenure <= 20)].churn.mean()

0.31608133086876156

In [50]:
df_full_train[(df_full_train.tenure > 20) & (df_full_train.tenure <= 24)].churn.mean()

0.24714828897338403

In [51]:
df_full_train[(df_full_train.monthlycharges > 20) & (df_full_train.monthlycharges <= 50)].churn.mean()

0.18740515933232169

In [52]:
df_full_train[(df_full_train.tenure > 50) ].churn.mean()

0.09220701963117192

# One-hot encoding

In [53]:
from sklearn.feature_extraction import DictVectorizer

In [54]:
dicts = df_train[categorical + numerical].iloc[:10].to_dict(orient = 'records')

In [55]:
dv = DictVectorizer(sparse = False)
dv.fit(dicts)

In [56]:
dv.transform(dicts)[0:2]

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
        1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
        1.00000e+00, 0.00000e+00, 0.00000e+00, 5.62000e+01, 1.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 1.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 0.00000e+00, 2.70000e+01, 1.56755e+03],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 0.00000e+00, 7.84500e+01, 0.00000e+00,
        0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.0

In [57]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

# Logistic Regression:

In [63]:
xi = np.linspace(-7, 7, 15)
w0 = 4.5
w = np.linspace(0.1, .04, len(xi))

def linear_regression(xi):
    result = w0
    n = len(w)
    for j in range(n):
        result = result + xi[j]*w[j]
    return result

def sigmoid(z):
    return 1/(1+np.exp(-z))

def logistic_regression(xi):
    z = w0
    n = len(w)
    
    for j in range(n):
        z = z + xi[j]*w[j]
    
    result = sigmoid(z)
    return result
    

# Training Logistic Regression:

In [64]:
from sklearn.linear_model import LogisticRegression

In [65]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [66]:
model.intercept_[0]

-0.12897574664574132

In [67]:
model.coef_[0].round(3)

array([ 0.495, -0.234, -0.388, -0.038, -0.089,  0.012, -0.11 , -0.03 ,
       -0.029, -0.099, -0.344,  0.326, -0.11 ,  0.003, -0.214,  0.156,
       -0.07 ,  0.144, -0.11 , -0.162,  0.193, -0.11 , -0.211, -0.246,
        0.119, -0.139,  0.011, -0.031, -0.232,  0.29 , -0.155,  0.156,
       -0.284,  0.289, -0.108, -0.11 ,  0.09 , -0.135, -0.11 ,  0.117,
        0.242, -0.11 , -0.26 , -0.061,  0.   ])

In [68]:
y_pred = model.predict_proba(X_val)[:, 1]

In [69]:
churn_decision = (y_pred >= 0.5)

In [70]:
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred
df_pred['prediction'] = churn_decision.astype(int)
df_pred['actual'] = y_val

In [71]:
df_pred['correct'] = (df_pred.prediction == df_pred.actual)

In [72]:
df_pred

Unnamed: 0,probability,prediction,actual,correct
0,0.488699,0,1,False
1,0.016134,0,0,True
2,0.610514,1,1,True
3,0.613310,1,1,True
4,0.045845,0,0,True
...,...,...,...,...
1404,0.064659,0,0,True
1405,0.004906,0,0,True
1406,0.057907,0,1,False
1407,0.562008,1,1,True


In [73]:
df_pred.correct.mean()

0.8041163946061036

# Model intrepretation

* Look at the coefficients
* Train a smaller model with fewer features

In [74]:
dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))

{'contract=month-to-month': 0.495,
 'contract=one_year': -0.234,
 'contract=two_year': -0.388,
 'dependents=no': -0.038,
 'dependents=yes': -0.089,
 'deviceprotection=no': 0.012,
 'deviceprotection=no_internet_service': -0.11,
 'deviceprotection=yes': -0.03,
 'gender=female': -0.029,
 'gender=male': -0.099,
 'internetservice=dsl': -0.344,
 'internetservice=fiber_optic': 0.326,
 'internetservice=no': -0.11,
 'monthlycharges': 0.003,
 'multiplelines=no': -0.214,
 'multiplelines=no_phone_service': 0.156,
 'multiplelines=yes': -0.07,
 'onlinebackup=no': 0.144,
 'onlinebackup=no_internet_service': -0.11,
 'onlinebackup=yes': -0.162,
 'onlinesecurity=no': 0.193,
 'onlinesecurity=no_internet_service': -0.11,
 'onlinesecurity=yes': -0.211,
 'paperlessbilling=no': -0.246,
 'paperlessbilling=yes': 0.119,
 'partner=no': -0.139,
 'partner=yes': 0.011,
 'paymentmethod=bank_transfer_(automatic)': -0.031,
 'paymentmethod=credit_card_(automatic)': -0.232,
 'paymentmethod=electronic_check': 0.29,
 'pay

In [75]:
small = ['contract', 'tenure', 'monthlycharges']
df_train[small].iloc[:10].to_dict(orient='records')

[{'contract': 'two_year', 'tenure': 54, 'monthlycharges': 46.2},
 {'contract': 'two_year', 'tenure': 72, 'monthlycharges': 99.15},
 {'contract': 'month-to-month', 'tenure': 11, 'monthlycharges': 55.6},
 {'contract': 'one_year', 'tenure': 41, 'monthlycharges': 109.1},
 {'contract': 'two_year', 'tenure': 67, 'monthlycharges': 111.3},
 {'contract': 'one_year', 'tenure': 53, 'monthlycharges': 108.95},
 {'contract': 'two_year', 'tenure': 69, 'monthlycharges': 20.2},
 {'contract': 'month-to-month', 'tenure': 9, 'monthlycharges': 66.25},
 {'contract': 'month-to-month', 'tenure': 1, 'monthlycharges': 69.95},
 {'contract': 'month-to-month', 'tenure': 1, 'monthlycharges': 21.05}]

In [76]:
dicts_train_small = df_train[small].to_dict(orient='records')
dicts_val_small = df_val[small].to_dict(orient='records')

In [77]:
dv_small = DictVectorizer(sparse=False)
dv_small.fit(dicts_train_small)

In [78]:
dv_small.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'monthlycharges', 'tenure'], dtype=object)

In [79]:
X_train_small = dv_small.transform(dicts_train_small)


In [80]:
model_small = LogisticRegression(solver='lbfgs')
model_small.fit(X_train_small, y_train)

In [81]:
w0 = model_small.intercept_[0]
w0

w = model_small.coef_[0]
w.round(3)

dict(zip(dv_small.get_feature_names_out(), w.round(3)))

{'contract=month-to-month': 1.003,
 'contract=one_year': -0.103,
 'contract=two_year': -0.893,
 'monthlycharges': 0.029,
 'tenure': -0.035}

In [82]:
-2.47 + (-0.949) + 30 * 0.027 + 24 * (-0.036)

-3.473

In [83]:
dicts_full_train = df_full_train[categorical + numerical].to_dict(orient='records')

In [84]:
dv = DictVectorizer(sparse=False)
X_full_train = dv.fit_transform(dicts_full_train)

y_full_train = df_full_train.churn.values

In [85]:
model = LogisticRegression(solver='lbfgs')
model.fit(X_full_train, y_full_train)

In [86]:
dicts_test = df_test[categorical + numerical].to_dict(orient='records')

In [87]:
X_test = dv.transform(dicts_test)

In [88]:
y_pred = model.predict_proba(X_test)[:, 1]

In [89]:
churn_decision = (y_pred >= 0.5)

In [90]:
(churn_decision == y_test).mean()

0.7977288857345636

In [91]:
y_test

array([0, 0, 0, ..., 0, 1, 0])

In [92]:
customer = dicts_test[-1]
customer

{'gender': 'female',
 'seniorcitizen': 0,
 'partner': 'yes',
 'dependents': 'yes',
 'phoneservice': 'yes',
 'multiplelines': 'yes',
 'internetservice': 'no',
 'onlinesecurity': 'no_internet_service',
 'onlinebackup': 'no_internet_service',
 'deviceprotection': 'no_internet_service',
 'techsupport': 'no_internet_service',
 'streamingtv': 'no_internet_service',
 'streamingmovies': 'no_internet_service',
 'contract': 'two_year',
 'paperlessbilling': 'no',
 'paymentmethod': 'bank_transfer_(automatic)',
 'tenure': 72,
 'monthlycharges': 23.75,
 'totalcharges': 1679.25}

In [93]:
X_small = dv.transform([customer])

In [94]:
model.predict_proba(X_small)[0, 1]

0.0029619557439673014

In [95]:
y_test[-1]

0