## Importing useful libraries and data

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
url = 'https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv'
!wget $url

--2024-10-14 06:27:39--  https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘WA_Fn-UseC_-Telco-Customer-Churn.csv.1’

WA_Fn-UseC_-Telco-C     [ <=>                ]   1.72M  --.-KB/s    in 0.1s    

2024-10-14 06:27:40 (15.4 MB/s) - ‘WA_Fn-UseC_-Telco-Customer-Churn.csv.1’ saved [1807957]



In [None]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

## Looking at the data

In [None]:
# Normalise column names: lower-case, spaces replaced by underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Columns stored with the generic 'object' dtype (strings in this dataset).
catgorical_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']

# Apply the same lower-case / underscore normalisation to the string values.
for c in catgorical_columns:
    lowered = df[c].str.lower()
    df[c] = lowered.str.replace(' ', '_')

In [None]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [None]:
df.dtypes

Unnamed: 0,0
customerid,object
gender,object
seniorcitizen,int64
partner,object
dependents,object
tenure,int64
phoneservice,object
multiplelines,object
internetservice,object
onlinesecurity,object


In [None]:
# Coerce into a scratch Series only: values that cannot be parsed become NaN
# there, while df.totalcharges keeps its raw text so the offending rows can be
# inspected. (The column itself is converted in a later cell.)
total_charges = pd.to_numeric(df.totalcharges, errors='coerce')
# Number of rows whose totalcharges could not be parsed as a number.
print(total_charges.isnull().sum())
df[total_charges.isnull()][['customerid', 'tenure', 'totalcharges']]

11


Unnamed: 0,customerid,tenure,totalcharges
488,4472-lvygi,0,_
753,3115-czmzd,0,_
936,5709-lvoeq,0,_
1082,4367-nuyao,0,_
1340,1371-dwpaz,0,_
3331,7644-omvmy,0,_
3826,3213-vvolg,0,_
4380,2520-sgtta,0,_
5218,2923-arzlg,0,_
6670,4075-wkniu,0,_


In [None]:
# Convert totalcharges to numbers; unparseable entries become NaN and are then
# replaced with 0.
df.totalcharges = pd.to_numeric(df.totalcharges, errors='coerce').fillna(0)

In [None]:
# Encode the target as 0/1: 1 where churn == 'yes', 0 otherwise.
# NOTE: this cell is not idempotent. After one run the column holds integers,
# so running it again compares integers with 'yes' and sets every row to 0.
df.churn = (df.churn == 'yes').astype(int)
df.churn.head()

Unnamed: 0,churn
0,0
1,0
2,1
3,0
4,1


## Setting Up a Validation Framework

In [None]:
from sklearn.model_selection import train_test_split

# Two-step split into train / validation / test:
#   1. hold out 20% of all rows as the test set;
#   2. take 25% of the remaining 80% (= 20% of all rows) as the validation set.
# random_state is fixed so the split is reproducible.
df_full_train, df_test = train_test_split(df,            test_size=0.2,  random_state=42)
df_train     , df_val  = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Sizes of the three resulting splits.
len(df_train), len(df_val), len(df_test)

(4225, 1409, 1409)

In [None]:
# Give each split a fresh 0..n-1 index (the shuffled original index is dropped).
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Move the target out of each feature frame: pop() returns the 'churn' column
# and removes it from the frame in one step.
y_train = df_train.pop('churn').values
y_val = df_val.pop('churn').values
y_test = df_test.pop('churn').values

# df_full_train keeps its 'churn' column; it is used for the analysis below.
df_full_train = df_full_train.reset_index(drop=True)
df_full_train.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,4223-bkeor,female,0,no,yes,21,yes,no,dsl,yes,...,yes,no,no,yes,one_year,no,mailed_check,64.85,1336.8,0
1,6035-riiom,female,0,no,no,54,yes,yes,fiber_optic,no,...,no,no,yes,yes,two_year,yes,bank_transfer_(automatic),97.2,5129.45,0
2,3797-vtidr,male,0,yes,no,1,no,no_phone_service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic_check,23.45,23.45,1
3,2568-brgyx,male,0,no,no,4,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,70.2,237.95,1
4,2775-sefee,male,0,no,yes,0,yes,yes,dsl,yes,...,no,yes,no,no,two_year,yes,bank_transfer_(automatic),61.9,0.0,0


## Exploratory data analysis

Let's Explore our data a bit more.


In [None]:
df_full_train.churn.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
churn,Unnamed: 1_level_1
0,0.734469
1,0.265531


In [None]:
global_churn_rate = df_full_train.churn.mean()
round(global_churn_rate, 2)

0.27

In [None]:
print(len(df_full_train))
df_full_train.isnull().sum()

5634


Unnamed: 0,0
customerid,0
gender,0
seniorcitizen,0
partner,0
dependents,0
tenure,0
phoneservice,0
multiplelines,0
internetservice,0
onlinesecurity,0


Now that we have an understanding of the churn, let's look at other variables

In [None]:
# Print all available columns so the two feature lists can be checked against them.
print(df_full_train.columns)

# Continuous features.
numeric = ['monthlycharges', 'tenure', 'totalcharges']

# Discrete features. seniorcitizen is stored as 0/1 integers but is listed
# with the categorical features here.
categorical = [
    'gender', 'seniorcitizen', 'partner', 'dependents',
    'phoneservice', 'multiplelines', 'internetservice',
    'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
    'streamingtv', 'streamingmovies',
    'contract', 'paperlessbilling', 'paymentmethod',
]

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')


In [None]:
df_full_train[categorical].nunique()

Unnamed: 0,0
gender,2
seniorcitizen,2
partner,2
dependents,2
phoneservice,2
multiplelines,3
internetservice,3
onlinesecurity,3
onlinebackup,3
deviceprotection,3


Some categorical variables are really important. One way to see this is to look at the churn rate within each group.

In [None]:
df_full_train[ df_full_train.gender == 'female'].churn.mean()

0.2708409173643975

In [None]:
df_full_train[ df_full_train.gender == 'male'].churn.mean()

0.26047800484932454

In [None]:
global_churn_rate = df_full_train.churn.mean()
global_churn_rate

0.2655307064252751

So the churn rate by gender is not very different from the global churn rate.
We can do the same for other variables and their churn rates.
For example, we have customers who live with or without partners.

In [None]:
df_full_train.partner.value_counts()

Unnamed: 0_level_0,count
partner,Unnamed: 1_level_1
no,2904
yes,2730


## a measure of category importance: difference

In [None]:
# Churn rate among customers who have a partner.
churn_partner = df_full_train[ df_full_train.partner == 'yes'].churn.mean()

# It is below the global churn rate by approximately 6.5 percentage points.

In [None]:
# Churn rate among customers who do not have a partner.
churn_no_partner = df_full_train[ df_full_train.partner == 'no'].churn.mean()

# It is above the global churn rate by approximately 6 percentage points.

In [None]:
global_churn_rate - churn_no_partner

-0.06091557456646046

In [None]:
global_churn_rate  - churn_partner

0.06479810569267436

So it does not matter whether a customer is male or female: the churn rate is approximately the same. However, having a partner does affect a customer's decision to stay with the company or not (probably they imitate their partner's choice of company).

So for **feature importance** we can look at the difference of our global_churn_mean and churn_rate_by_category

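Here the difference is computed as `global_churn_rate - group churn rate`, as in the two cells above.
If the difference is significant and positive, the related group is not likely to churn.
If the difference is significant and negative, this group is likely to have customers who churn.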

### measures of importance: Risk



In [None]:
round( churn_no_partner / global_churn_rate, 4)

1.2294

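The fact that this number is bigger than one, and bigger than the result for customers with a partner below, means that customers without a partner are more likely to churn than the average customer.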

In [None]:
round( churn_partner / global_churn_rate, 4)

0.756

### Compute risk for every variable systematically

In [None]:
# Per-gender churn rate and group size, plus two importance measures:
#   diff = group churn rate - global churn rate
#   risk = group churn rate / global churn rate
df_group = (
    df_full_train.groupby('gender').churn.agg(['mean', 'count'])
    .assign(
        diff=lambda g: g['mean'] - global_churn_rate,
        risk=lambda g: g['mean'] / global_churn_rate,
    )
)
df_group

Unnamed: 0_level_0,mean,count,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.270841,2747,0.00531,1.019998
male,0.260478,2887,-0.005053,0.980971


We can repeat it for each variable and columns

In [None]:
from IPython.display import display

In [None]:
# Build the same mean / count / diff / risk table for every categorical feature.
for c in categorical:
    df_group = (
        df_full_train.groupby(c).churn.agg(['mean', 'count'])
        .assign(
            diff=lambda g: g['mean'] - global_churn_rate,
            risk=lambda g: g['mean'] / global_churn_rate,
        )
    )
    display(df_group)
    # Two line breaks to visually separate consecutive tables.
    print('\n')

Unnamed: 0_level_0,mean,count,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.270841,2747,0.00531,1.019998
male,0.260478,2887,-0.005053,0.980971






Unnamed: 0_level_0,mean,count,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.237098,4728,-0.028433,0.892922
1,0.413907,906,0.148377,1.558793






Unnamed: 0_level_0,mean,count,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.326446,2904,0.060916,1.229411
yes,0.200733,2730,-0.064798,0.755968






Unnamed: 0_level_0,mean,count,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.312326,3951,0.046795,1.176233
yes,0.155674,1683,-0.109856,0.586276






Unnamed: 0_level_0,mean,count,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.253623,552,-0.011908,0.955156
yes,0.266824,5082,0.001293,1.004871






Unnamed: 0_level_0,mean,count,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.251397,2685,-0.014134,0.946771
no_phone_service,0.253623,552,-0.011908,0.955156
yes,0.284105,2397,0.018574,1.069952






Unnamed: 0_level_0,mean,count,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dsl,0.191851,1939,-0.073679,0.722521
fiber_optic,0.415558,2481,0.150028,1.56501
no,0.076606,1214,-0.188924,0.288502






Unnamed: 0_level_0,mean,count,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.416014,2810,0.150484,1.566727
no_internet_service,0.076606,1214,-0.188924,0.288502
yes,0.145342,1610,-0.120189,0.547363






Unnamed: 0_level_0,mean,count,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.398693,2448,0.133162,1.501494
no_internet_service,0.076606,1214,-0.188924,0.288502
yes,0.216531,1972,-0.048999,0.815467






Unnamed: 0_level_0,mean,count,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.387706,2489,0.122175,1.460117
no_internet_service,0.076606,1214,-0.188924,0.288502
yes,0.226825,1931,-0.038705,0.854234






Unnamed: 0_level_0,mean,count,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.413472,2791,0.147941,1.557153
no_internet_service,0.076606,1214,-0.188924,0.288502
yes,0.152855,1629,-0.112676,0.575657






Unnamed: 0_level_0,mean,count,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.335418,2239,0.069887,1.263197
no_internet_service,0.076606,1214,-0.188924,0.288502
yes,0.298945,2181,0.033415,1.125841






Unnamed: 0_level_0,mean,count,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.333333,2223,0.067803,1.255348
no_internet_service,0.076606,1214,-0.188924,0.288502
yes,0.30132,2197,0.035789,1.134784






Unnamed: 0_level_0,mean,count,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
month-to-month,0.426533,3083,0.161002,1.60634
one_year,0.117987,1212,-0.147544,0.444343
two_year,0.028379,1339,-0.237151,0.106878






Unnamed: 0_level_0,mean,count,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.16414,2309,-0.10139,0.618159
yes,0.33594,3325,0.070409,1.265164






Unnamed: 0_level_0,mean,count,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bank_transfer_(automatic),0.174475,1238,-0.091056,0.65708
credit_card_(automatic),0.152404,1227,-0.113126,0.573961
electronic_check,0.449921,1887,0.18439,1.69442
mailed_check,0.190328,1282,-0.075203,0.716782






It seems that people without dependents, people without partners, and people on a month-to-month contract are much more likely to churn.

## Feature importance : Mutual information
It is nice to have a way to measure the importance of a feature.
So far, we have no way of telling whether ```contract``` is more important than ```streamingtv```.
We need a measure that expresses this as a number, and that measure is **Mutual Information**,
which quantifies how much we learn about one variable by observing another. Here, for example, mutual information tells us how well we can learn about ```churn``` if we observe the value of ```contract``` — e.g. how much we learn about ```churn``` if we know a customer has a ```two_year``` contract.

In [1]:
from sklearn.metrics import mutual_info_score

In [2]:
mutual_info_score(df_full_train.churn, df_full_train.contract)

NameError: name 'df_full_train' is not defined

In [None]:
mutual_info_score(df_full_train.churn, df_full_train.gender)

So we see that ```gender``` is not very informative because its mutual information with churn is pretty low, as expected. On its own this number does not tell us the absolute importance of a feature, only its importance relative to other features.
Now let's do it more systematically. For this purpose we use the ```apply``` method, which applies a function to each column of a dataframe. But the function needs to take a single input, so we wrap ```mutual_info_score``` in a function.

In [None]:
def mutual_info_churn_score(data):
    """Return the mutual information between one feature column and churn.

    Meant to be passed to ``DataFrame.apply`` so that every categorical
    column of ``df_full_train`` is scored against the target.

    Parameters
    ----------
    data : a column of ``df_full_train`` (one label per customer).

    Returns
    -------
    The value returned by ``mutual_info_score`` for ``data`` versus
    ``df_full_train.churn``.
    """
    # The second argument must be the target (churn). Scoring against
    # df_full_train.gender would rank features by what they reveal about
    # gender, not about churn.
    return mutual_info_score(data, df_full_train.churn)

In [None]:
df_full_train[categorical].apply(mutual_info_churn_score).sort_values(ascending=False)

So we see that ```contract``` is the most important variable and ```gender``` is the least important

## Feature importance : correlation

In [None]:
df_full_train[numeric].corrwith(df_full_train.churn)

Unnamed: 0,0
monthlycharges,0.188574
tenure,-0.344925
totalcharges,-0.19337


It tells us that the longer a customer stays with the company ```(tenure)```, the less likely they are to churn. It also tells us that the more people have paid in total (```totalcharges```), the less likely they are to leave. Also, tenure and totalcharges are positively correlated, meaning the longer people stay with the company, the more they pay in total. Finally, the higher the ```monthlycharges```, the higher the churn rate.
It also tells us that ```tenure``` is the most important (largest absolute correlation), then ```totalcharges```, then ```monthlycharges```.
We can check it with the code

In [None]:
df_full_train[df_full_train.tenure <= 2].churn.mean()

0.5863636363636363

In [None]:
df_full_train[(df_full_train.tenure > 2) & (df_full_train.tenure <= 12)].churn.mean()

0.4064814814814815

In [None]:
df_full_train[df_full_train.tenure > 12].churn.mean()

0.17205957883923986

So for ```tenure```, it is negative correlation

In [None]:
df_full_train[df_full_train.monthlycharges <= 20].churn.mean()

0.0912621359223301

In [None]:
# Middle bucket: 20 < monthlycharges <= 50. The upper bound is inclusive so
# customers paying exactly 50 are counted here; with "< 50" here and "> 50" in
# the next cell they would fall into no bucket at all.
df_full_train[(df_full_train.monthlycharges > 20) & (df_full_train.monthlycharges <= 50)].churn.mean()

0.1910538286580743

In [None]:
df_full_train[df_full_train.monthlycharges > 50].churn.mean()

0.3152488806952857

For ```monthlycharges``` it is positive correlation

## One-Hot Encoding
There are many functions to prepare categorical variables for machine learning by one-hot encoding. We use ```DictVectorizer```, which vectorizes dictionaries.

In [None]:
from sklearn.feature_extraction import DictVectorizer    # It vectorizes dictionaries

In [None]:
df_train[['gender', 'contract']].iloc[:10]

Unnamed: 0,gender,contract
0,male,month-to-month
1,female,one_year
2,male,month-to-month
3,male,month-to-month
4,male,two_year
5,male,two_year
6,male,month-to-month
7,male,month-to-month
8,female,one_year
9,male,month-to-month


In [None]:
dicts = df_train[['gender', 'contract']].iloc[:20].to_dict(orient='records')
dicts

[{'gender': 'male', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'one_year'},
 {'gender': 'male', 'contract': 'month-to-month'},
 {'gender': 'male', 'contract': 'month-to-month'},
 {'gender': 'male', 'contract': 'two_year'},
 {'gender': 'male', 'contract': 'two_year'},
 {'gender': 'male', 'contract': 'month-to-month'},
 {'gender': 'male', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'one_year'},
 {'gender': 'male', 'contract': 'month-to-month'},
 {'gender': 'male', 'contract': 'month-to-month'},
 {'gender': 'male', 'contract': 'one_year'},
 {'gender': 'male', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'two_year'},
 {'gender': 'female', 'contract': 'month-to-month'},
 {'gender': 'male', 'contract': 'month-to-month'},
 {'gender': 'male', 'contract': 'month-to-month'},
 {'gender': 'male', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'two_year'},
 {'gender': 'female', 'contract': 'month-to-month'}]

In [None]:
dv = DictVectorizer(sparse=False)
dv.fit(dicts)

In [None]:
dv.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'gender=female', 'gender=male'], dtype=object)

In [None]:
dv.transform(dicts)

array([[1., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 0., 1.],
       [0., 0., 1., 0., 1.],
       [0., 0., 1., 0., 1.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 1.],
       [0., 0., 1., 1., 0.],
       [1., 0., 0., 1., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 0., 1.],
       [0., 0., 1., 1., 0.],
       [1., 0., 0., 1., 0.]])

Now that we know how to implement one-hot-encoding, let's do it in a general format and for all the categorical variables.

In [None]:
# One dict per training row, holding the selected categorical and numeric features.
train_dicts = df_train[categorical + numeric].to_dict(orient='records')

# Learn the feature mapping from the training records and encode them in one call.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

In [None]:
train_dicts[0]

{'gender': 'male',
 'seniorcitizen': 0,
 'partner': 'no',
 'dependents': 'no',
 'phoneservice': 'yes',
 'multiplelines': 'no',
 'internetservice': 'no',
 'onlinesecurity': 'no_internet_service',
 'onlinebackup': 'no_internet_service',
 'deviceprotection': 'no_internet_service',
 'techsupport': 'no_internet_service',
 'streamingtv': 'no_internet_service',
 'streamingmovies': 'no_internet_service',
 'contract': 'month-to-month',
 'paperlessbilling': 'no',
 'paymentmethod': 'mailed_check',
 'monthlycharges': 19.85,
 'tenure': 3,
 'totalcharges': 64.55}

In [None]:
# One dict per validation row, with the same feature columns as the training records.
val_dict = df_val[categorical + numeric].to_dict(orient='records')
# transform only (no fit): the validation rows are encoded with the mapping
# that dv already learned from the training records.
X_val = dv.transform(val_dict)

## Training Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_train, y_train)

In [None]:
model.coef_.round(3) # w or weights

array([[ 0.63 , -0.16 , -0.615, -0.054, -0.091,  0.027, -0.132, -0.04 ,
         0.015, -0.16 , -0.327,  0.314, -0.132,  0.003, -0.225,  0.124,
        -0.044,  0.076, -0.132, -0.089,  0.205, -0.132, -0.217, -0.241,
         0.096, -0.076, -0.069, -0.107, -0.186,  0.211, -0.064,  0.124,
        -0.269,  0.163, -0.139, -0.132,  0.126, -0.059, -0.132,  0.046,
         0.16 , -0.132, -0.173, -0.055,  0.   ]])

In [None]:
model.intercept_ # the bias

array([-0.14501424])

In [None]:
model.predict(X_train[:20])   # Hard predictions: class labels (0 or 1) rather than probabilities

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0])

In [None]:
# predict_proba returns one row per customer and one column per class:
#   column 0 -> probability of the negative class
#   column 1 -> probability of the positive class
val_proba = model.predict_proba(X_val)
print(val_proba.shape)

# Soft predictions: keep only the positive-class probability.
y_pred = val_proba[:, 1]
y_pred

(1409, 2)


array([0.16720183, 0.25313349, 0.4356594 , ..., 0.71346573, 0.05055761,
       0.03602381])

In [None]:
churn_dicision = (y_pred >= 0.5 ) # Boolean mask: True where the predicted churn probability is at least 0.5

In [None]:
df_val[churn_dicision].customerid  # These are the people who will receive the promotional email

Unnamed: 0,customerid
3,0337-cnpze
10,2038-oeqzh
11,9846-gkxas
12,8051-hjrlt
13,8990-zxlsu
...,...
1395,2809-zmyoq
1396,3536-iqctx
1400,5196-sgoak
1401,3050-gbush


Let's see how accurate the model is by measuring how many of predictions match the validation set

In [None]:
y_val

array([0, 0, 1, ..., 1, 0, 0])

In [None]:
churn_dicision.astype(int)

array([0, 0, 0, ..., 1, 0, 0])

In [None]:
(y_val == churn_dicision).mean()

0.8034066713981547

## Model Interpretation
We want to see what is the weight of each feature

In [None]:
dv.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'dependents=no', 'dependents=yes',
       'deviceprotection=no', 'deviceprotection=no_internet_service',
       'deviceprotection=yes', 'gender=female', 'gender=male',
       'internetservice=dsl', 'internetservice=fiber_optic',
       'internetservice=no', 'monthlycharges', 'multiplelines=no',
       'multiplelines=no_phone_service', 'multiplelines=yes',
       'onlinebackup=no', 'onlinebackup=no_internet_service',
       'onlinebackup=yes', 'onlinesecurity=no',
       'onlinesecurity=no_internet_service', 'onlinesecurity=yes',
       'paperlessbilling=no', 'paperlessbilling=yes', 'partner=no',
       'partner=yes', 'paymentmethod=bank_transfer_(automatic)',
       'paymentmethod=credit_card_(automatic)',
       'paymentmethod=electronic_check', 'paymentmethod=mailed_check',
       'phoneservice=no', 'phoneservice=yes', 'seniorcitizen',
       'streamingmovies=no', 'streamingmovies=no_internet_service',

In [None]:
model.coef_.round(3) # w or weights

array([[ 0.63 , -0.16 , -0.615, -0.054, -0.091,  0.027, -0.132, -0.04 ,
         0.015, -0.16 , -0.327,  0.314, -0.132,  0.003, -0.225,  0.124,
        -0.044,  0.076, -0.132, -0.089,  0.205, -0.132, -0.217, -0.241,
         0.096, -0.076, -0.069, -0.107, -0.186,  0.211, -0.064,  0.124,
        -0.269,  0.163, -0.139, -0.132,  0.126, -0.059, -0.132,  0.046,
         0.16 , -0.132, -0.173, -0.055,  0.   ]])

We need to join the above two variables

```dv.get_feature_names_out()```  and ```model.coef_```

using ```zip```


In [None]:
# Introducing zip function
a = [1, 2, 4, 5]
b = 'abcd'

print(list(zip(a, b)))
print(dict(zip(a, b)))

[(1, 'a'), (2, 'b'), (4, 'c'), (5, 'd')]
{1: 'a', 2: 'b', 4: 'c', 5: 'd'}


In [None]:
dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))

{'contract=month-to-month': 0.63,
 'contract=one_year': -0.16,
 'contract=two_year': -0.615,
 'dependents=no': -0.054,
 'dependents=yes': -0.091,
 'deviceprotection=no': 0.027,
 'deviceprotection=no_internet_service': -0.132,
 'deviceprotection=yes': -0.04,
 'gender=female': 0.015,
 'gender=male': -0.16,
 'internetservice=dsl': -0.327,
 'internetservice=fiber_optic': 0.314,
 'internetservice=no': -0.132,
 'monthlycharges': 0.003,
 'multiplelines=no': -0.225,
 'multiplelines=no_phone_service': 0.124,
 'multiplelines=yes': -0.044,
 'onlinebackup=no': 0.076,
 'onlinebackup=no_internet_service': -0.132,
 'onlinebackup=yes': -0.089,
 'onlinesecurity=no': 0.205,
 'onlinesecurity=no_internet_service': -0.132,
 'onlinesecurity=yes': -0.217,
 'paperlessbilling=no': -0.241,
 'paperlessbilling=yes': 0.096,
 'partner=no': -0.076,
 'partner=yes': -0.069,
 'paymentmethod=bank_transfer_(automatic)': -0.107,
 'paymentmethod=credit_card_(automatic)': -0.186,
 'paymentmethod=electronic_check': 0.211,
 '