In this project we have a telecom company that offers phone and internet services. The goal is to predict which customers are likely to churn.

Import packages that we need in our project .

In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 
pd.set_option('display.max_columns',None)

Read the Dataset .

In [2]:
df = pd.read_csv('Churn.csv')

Get the number of rows in our dataset .

In [3]:
len(df)

7043

In [4]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

Here we convert the TotalCharges column from object to a numeric type. pandas read it as object (its default type for non-numeric data) because missing values are stored as " ". Values that cannot be parsed become NaN, and we then replace those NaN values with 0.

In [6]:
# Coerce TotalCharges to a numeric dtype; entries that cannot be parsed become NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')


In [7]:
df[df['TotalCharges'].isnull()][['customerID', 'TotalCharges']]

Unnamed: 0,customerID,TotalCharges
488,4472-LVYGI,
753,3115-CZMZD,
936,5709-LVOEQ,
1082,4367-NUYAO,
1340,1371-DWPAZ,
3331,7644-OMVMY,
3826,3213-VVOLG,
4380,2520-SGTTA,
5218,2923-ARZLG,
6670,4075-WKNIU,


In [8]:
# Replace the NaN TotalCharges entries (produced by the coercion above) with 0.
df['TotalCharges']=df['TotalCharges'].fillna(0)

In [9]:
df['TotalCharges'].isnull().sum()

0

Here we normalise the text: we lowercase all column names, and we lowercase the string values in each column, replacing every space with _.

In [10]:
# Normalise every column label to lower case, then display the new labels.
df.columns = df.columns.map(str.lower)
df.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [11]:
# Collect the names of all object-dtype (string) columns.
string_columns = [
    name for name, dtype in df.dtypes.items() if dtype == 'object'
]
string_columns

['customerid',
 'gender',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod',
 'churn']

In [12]:
# Lowercase every string value and swap spaces for underscores, column by column.
for name in string_columns:
    lowered = df[name].str.lower()
    df[name] = lowered.str.replace(' ', '_')

Here we encode the target column: we map the value yes to 1 and the value no to 0.

In [13]:
# Encode the target as integers: churned customers become 1, retained customers 0.
churn_codes = {'yes': 1, 'no': 0}
df['churn'] = df['churn'].map(churn_codes)

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the split repeatable.
df_train_full ,df_test = train_test_split(df, test_size=0.2, random_state=42)

In [16]:
df_train_full.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
2142,4223-bkeor,female,0,no,yes,21,yes,no,dsl,yes,no,yes,no,no,yes,one_year,no,mailed_check,64.85,1336.8,0
1623,6035-riiom,female,0,no,no,54,yes,yes,fiber_optic,no,yes,no,no,yes,yes,two_year,yes,bank_transfer_(automatic),97.2,5129.45,0
6074,3797-vtidr,male,0,yes,no,1,no,no_phone_service,dsl,no,no,no,no,no,no,month-to-month,yes,electronic_check,23.45,23.45,1
1362,2568-brgyx,male,0,no,no,4,yes,no,fiber_optic,no,no,no,no,no,no,month-to-month,yes,electronic_check,70.2,237.95,1
6754,2775-sefee,male,0,no,yes,0,yes,yes,dsl,yes,yes,no,yes,no,no,two_year,yes,bank_transfer_(automatic),61.9,0.0,0


Here we split the full train data into train and validation data (33% of the full train data is used for validation).

In [17]:
# Carve the validation set (33% of df_train_full) out of the full training data.
df_train ,df_val = train_test_split(df_train_full, test_size=0.33, random_state=42)

In [18]:
# Report the (rows, columns) shape of each split.
for label, frame in (
    ('The shape of test data :', df_test),
    ('The shape of train data :', df_train),
    ('The shape of validation  data :', df_val),
):
    print(label, frame.shape)

The shape of test data : (1409, 21)
The shape of train data : (3774, 21)
The shape of validation  data : (1860, 21)


In [19]:
# Pull the target labels out as NumPy arrays before the column is removed from the feature frames.
y_train = df_train['churn'].to_numpy()
y_val = df_val['churn'].to_numpy()

Here we delete churn column from train and validation data 

In [20]:
# Remove the target from the feature frames so it cannot leak into the model inputs.
# `errors='ignore'` keeps this cell re-runnable: `del df_train['churn']` raised
# KeyError when the cell was executed a second time.
df_train = df_train.drop(columns='churn', errors='ignore')
df_val = df_val.drop(columns='churn', errors='ignore')

Exploratory data analysis .

Check whether we have null values in our dataset that we need to deal with (train and validation data).

As the result shows, we don't have any missing values to deal with.

In [21]:
df_train_full.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

To know the distribution of Target variable in the data set . Calculate the churn rate (sum of churned customers / number of all customers )

In [22]:
df_train_full.churn.value_counts()

churn
0    4138
1    1496
Name: count, dtype: int64

Here we have an imbalanced dataset because the positive and negative values are not equally distributed: far fewer customers churned than stayed.

In [23]:
# Overall churn rate: the mean of the 0/1 churn column.
churn_rate = df_train_full.churn.mean()
# Format after scaling to a percentage. `round(x, 3) * 100` can print binary
# floating-point artefacts (e.g. 26.700000000000003) for some rates.
print(f'{churn_rate * 100:.1f} %')

26.6 %


Split the columns into categorical and numerical.

In [24]:
# Column names treated as categorical features in the analysis below.
categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]
# Column names treated as numerical features.
numerical = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]

Find the number of unique values in each categorical column.

In [25]:
df_train_full[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

Risk ratio.

In [26]:
from IPython.display import display

# For every categorical feature, show the churn rate of each group, its
# difference from the global churn rate, and the ratio between the two (risk ratio).
for feature in categorical:
    risk_table = df_train_full.groupby(feature)['churn'].mean().to_frame(name='mean')
    risk_table = risk_table.assign(
        diff=risk_table['mean'] - churn_rate,
        rate=risk_table['mean'] / churn_rate,
    )
    display(risk_table)


Unnamed: 0_level_0,mean,diff,rate
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.270841,0.00531,1.019998
male,0.260478,-0.005053,0.980971


Unnamed: 0_level_0,mean,diff,rate
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.237098,-0.028433,0.892922
1,0.413907,0.148377,1.558793


Unnamed: 0_level_0,mean,diff,rate
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.326446,0.060916,1.229411
yes,0.200733,-0.064798,0.755968


Unnamed: 0_level_0,mean,diff,rate
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.312326,0.046795,1.176233
yes,0.155674,-0.109856,0.586276


Unnamed: 0_level_0,mean,diff,rate
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.253623,-0.011908,0.955156
yes,0.266824,0.001293,1.004871


Unnamed: 0_level_0,mean,diff,rate
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.251397,-0.014134,0.946771
no_phone_service,0.253623,-0.011908,0.955156
yes,0.284105,0.018574,1.069952


Unnamed: 0_level_0,mean,diff,rate
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dsl,0.191851,-0.073679,0.722521
fiber_optic,0.415558,0.150028,1.56501
no,0.076606,-0.188924,0.288502


Unnamed: 0_level_0,mean,diff,rate
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.416014,0.150484,1.566727
no_internet_service,0.076606,-0.188924,0.288502
yes,0.145342,-0.120189,0.547363


Unnamed: 0_level_0,mean,diff,rate
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.398693,0.133162,1.501494
no_internet_service,0.076606,-0.188924,0.288502
yes,0.216531,-0.048999,0.815467


Unnamed: 0_level_0,mean,diff,rate
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.387706,0.122175,1.460117
no_internet_service,0.076606,-0.188924,0.288502
yes,0.226825,-0.038705,0.854234


Unnamed: 0_level_0,mean,diff,rate
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.413472,0.147941,1.557153
no_internet_service,0.076606,-0.188924,0.288502
yes,0.152855,-0.112676,0.575657


Unnamed: 0_level_0,mean,diff,rate
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.335418,0.069887,1.263197
no_internet_service,0.076606,-0.188924,0.288502
yes,0.298945,0.033415,1.125841


Unnamed: 0_level_0,mean,diff,rate
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.333333,0.067803,1.255348
no_internet_service,0.076606,-0.188924,0.288502
yes,0.30132,0.035789,1.134784


Unnamed: 0_level_0,mean,diff,rate
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
month-to-month,0.426533,0.161002,1.60634
one_year,0.117987,-0.147544,0.444343
two_year,0.028379,-0.237151,0.106878


Unnamed: 0_level_0,mean,diff,rate
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.16414,-0.10139,0.618159
yes,0.33594,0.070409,1.265164


Unnamed: 0_level_0,mean,diff,rate
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bank_transfer_(automatic),0.174475,-0.091056,0.65708
credit_card_(automatic),0.152404,-0.113126,0.573961
electronic_check,0.449921,0.18439,1.69442
mailed_check,0.190328,-0.075203,0.716782


Mutual information (features with a higher MI have a stronger dependency with the target variable).

This function takes a series from the dataset (such as a column) and returns the mutual information between that column and the target variable. Mutual information measures the amount of information that knowing x gives about y.
**def_mi**: a DataFrame whose index holds the categorical feature names and whose `MI` column holds each feature's mutual information with the target, sorted from highest to lowest.

In [27]:
from sklearn.metrics import mutual_info_score


def calculate_mi(series):
    """Return the mutual information between `series` and the churn column of `df_train_full`."""
    target = df_train_full.churn
    return mutual_info_score(series, target)


# Score every categorical feature, then rank the scores from most to least informative.
mi_scores = df_train_full[categorical].apply(calculate_mi)
def_mi = mi_scores.sort_values(ascending=False).to_frame(name='MI')

In [28]:
def_mi

Unnamed: 0,MI
contract,0.096652
onlinesecurity,0.063393
techsupport,0.060935
internetservice,0.053313
onlinebackup,0.045424
paymentmethod,0.042861
deviceprotection,0.042007
streamingtv,0.030844
streamingmovies,0.030705
paperlessbilling,0.019077


Correlation coefficient (Pearson). As the result shows, monthly charges has a positive correlation with churn, so the more customers pay, the more likely they are to churn.
Tenure has a negative correlation, so as tenure grows, the churn rate goes down.

In [29]:
# Correlation of each numerical feature with the churn target, strongest positive first.
churn_corr = df_train_full[numerical].corrwith(df_train_full.churn)
churn_corr.to_frame(name='corr').sort_values(by='corr', ascending=False)

Unnamed: 0,corr
monthlycharges,0.188574
totalcharges,-0.19337
tenure,-0.344925


Feature Engineering (One hot encoding for categorical variables )

In [30]:
# One dict per training row, restricted to the selected feature columns.
feature_columns = [*categorical, *numerical]
train_dict = df_train[feature_columns].to_dict(orient='records')

DictVectorizer is a tool for one-hot encoding. It takes the data as a list of dictionaries and returns each one as a vector; the vectors are then put together as the rows of one matrix. It one-hot encodes the string-valued (categorical) features and leaves the numerical features as they are.

In [31]:
from sklearn.feature_extraction import DictVectorizer
# sparse=False makes transform() return a dense NumPy array instead of a sparse matrix.
dv=DictVectorizer(sparse=False)
# Learn the feature names / one-hot columns from the training records only.
dv.fit(train_dict)

Converting the dictionaries into a matrix using transform.

In [32]:
X_train = dv.transform(train_dict)
X_train.shape

(3774, 45)

In [33]:
dv.feature_names_

['contract=month-to-month',
 'contract=one_year',
 'contract=two_year',
 'dependents=no',
 'dependents=yes',
 'deviceprotection=no',
 'deviceprotection=no_internet_service',
 'deviceprotection=yes',
 'gender=female',
 'gender=male',
 'internetservice=dsl',
 'internetservice=fiber_optic',
 'internetservice=no',
 'monthlycharges',
 'multiplelines=no',
 'multiplelines=no_phone_service',
 'multiplelines=yes',
 'onlinebackup=no',
 'onlinebackup=no_internet_service',
 'onlinebackup=yes',
 'onlinesecurity=no',
 'onlinesecurity=no_internet_service',
 'onlinesecurity=yes',
 'paperlessbilling=no',
 'paperlessbilling=yes',
 'partner=no',
 'partner=yes',
 'paymentmethod=bank_transfer_(automatic)',
 'paymentmethod=credit_card_(automatic)',
 'paymentmethod=electronic_check',
 'paymentmethod=mailed_check',
 'phoneservice=no',
 'phoneservice=yes',
 'seniorcitizen',
 'streamingmovies=no',
 'streamingmovies=no_internet_service',
 'streamingmovies=yes',
 'streamingtv=no',
 'streamingtv=no_internet_servic

Logistic regression for classification 

In [34]:
from sklearn.linear_model import LogisticRegression
# Fixed random_state so repeated runs give the same fitted model.
model = LogisticRegression(solver='liblinear', random_state=42)
# Fit on the encoded training matrix and the 0/1 churn labels.
model.fit(X_train, y_train)

Applying one hot encoding for validation data to test our model 

In [35]:
# Encode the validation rows with the vectorizer fitted on the training data
# (transform only, no re-fitting), so both matrices share the same columns.
val_features = df_val[[*categorical, *numerical]]
val_dict = val_features.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [36]:
y_pred = model.predict_proba(X_val)

Here we have an array with two columns: the first column gives the negative probability (the customer will not churn) and the second column gives the positive probability (the customer will churn).

In [37]:
y_pred

array([[0.84642965, 0.15357035],
       [0.7256742 , 0.2743258 ],
       [0.56220278, 0.43779722],
       ...,
       [0.3339168 , 0.6660832 ],
       [0.9971192 , 0.0028808 ],
       [0.70289465, 0.29710535]])

Here we just need the second column (the positive, i.e. churn, probability).

We call this a **soft** prediction because it gives us the result as a probability, not as yes / no.

In [38]:
# Keep only the positive-class (churn) probability for each validation row.
# Recomputed from the model rather than slicing `y_pred` in place: the old
# `y_pred = y_pred[:, 1]` raised IndexError when the cell was run a second
# time, because `y_pred` was already one-dimensional.
y_pred = model.predict_proba(X_val)[:, 1]

In [39]:
y_pred

array([0.15357035, 0.2743258 , 0.43779722, ..., 0.6660832 , 0.0028808 ,
       0.29710535])

Having **Hard** prediction which is yes (churn and send promotional emails ) & no(not churn and not send ) , here we specify threshold as 0.5

In [40]:
# Hard prediction: True where the predicted churn probability reaches the 0.5 threshold.
churn = y_pred>=0.5

In [41]:
churn

array([False, False, False, ...,  True, False, False])

Accuracy

In [42]:
(y_val== churn).mean()

0.7973118279569893

Bias of our model

In [43]:
model.intercept_

array([-0.15395803])

Weights of the 45 features that we have in our model.

In [44]:
model.coef_[0]

array([ 7.15476942e-01, -8.56565494e-02, -7.83778424e-01, -5.28392317e-02,
       -1.01118800e-01, -1.83218775e-03, -1.30849767e-01, -2.12760776e-02,
        4.04450412e-02, -1.94403073e-01, -3.35640015e-01,  3.12531750e-01,
       -1.30849767e-01,  3.43401272e-03, -2.55407977e-01,  1.00405737e-01,
        1.04420863e-03,  3.71045103e-02, -1.30849767e-01, -6.02127757e-02,
        1.68954712e-01, -1.30849767e-01, -1.92062977e-01, -2.54908843e-01,
        1.00950811e-01, -6.11283661e-02, -9.28296658e-02, -9.44539054e-02,
       -1.81052346e-01,  1.98711275e-01, -7.71630553e-02,  1.00405737e-01,
       -2.54363768e-01,  9.31880544e-02, -1.69079556e-01, -1.30849767e-01,
        1.45971290e-01, -6.10954899e-02, -1.30849767e-01,  3.79872246e-02,
        1.67161010e-01, -1.30849767e-01, -1.90269275e-01, -5.42703518e-02,
        2.33430686e-04])

Zip each feature with its weight.

In [45]:
# Pair every encoded feature name with its learned weight, rounded to 3 decimals.
rounded_weights = model.coef_[0].round(3)
{name: weight for name, weight in zip(dv.feature_names_, rounded_weights)}

{'contract=month-to-month': 0.715,
 'contract=one_year': -0.086,
 'contract=two_year': -0.784,
 'dependents=no': -0.053,
 'dependents=yes': -0.101,
 'deviceprotection=no': -0.002,
 'deviceprotection=no_internet_service': -0.131,
 'deviceprotection=yes': -0.021,
 'gender=female': 0.04,
 'gender=male': -0.194,
 'internetservice=dsl': -0.336,
 'internetservice=fiber_optic': 0.313,
 'internetservice=no': -0.131,
 'monthlycharges': 0.003,
 'multiplelines=no': -0.255,
 'multiplelines=no_phone_service': 0.1,
 'multiplelines=yes': 0.001,
 'onlinebackup=no': 0.037,
 'onlinebackup=no_internet_service': -0.131,
 'onlinebackup=yes': -0.06,
 'onlinesecurity=no': 0.169,
 'onlinesecurity=no_internet_service': -0.131,
 'onlinesecurity=yes': -0.192,
 'paperlessbilling=no': -0.255,
 'paperlessbilling=yes': 0.101,
 'partner=no': -0.061,
 'partner=yes': -0.093,
 'paymentmethod=bank_transfer_(automatic)': -0.094,
 'paymentmethod=credit_card_(automatic)': -0.181,
 'paymentmethod=electronic_check': 0.199,
 '

Testing the model on individual customers

In [46]:
# Example customer record, using the same lowercase / underscore formatting as the training data.
# NOTE(review): 'customerid' is not in the `categorical` + `numerical` lists the
# vectorizer was fitted on, so it presumably has no effect on the encoded vector — confirm.
customer = {
    'customerid': '8879-zkjof',
    'gender': 'female',
    'seniorcitizen': 0,
    'partner': 'no',
    'dependents': 'no',
    'tenure': 41,
    'phoneservice': 'yes',
    'multiplelines': 'no',
    'internetservice': 'dsl',
    'onlinesecurity': 'yes',
    'onlinebackup': 'no',
    'deviceprotection': 'yes',
    'techsupport': 'yes',
    'streamingtv': 'yes',
    'streamingmovies': 'yes',
    'contract': 'one_year',
    'paperlessbilling': 'yes',
    'paymentmethod': 'bank_transfer_(automatic)',
    'monthlycharges': 79.85,
    'totalcharges': 3320.75,
}


In [47]:
X_test = dv.transform([customer])


In [49]:
X_test

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 0.00000e+00, 7.98500e+01, 1.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 1.00000e+00,
        1.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 4.10000e+01, 3.32075e+03]])

In [52]:
# Despite its name, `y_test` holds the model's predicted churn probability for the
# single customer encoded in X_test, not true labels of a test set.
y_test = model.predict_proba(X_test)[:,1]
y_test


array([0.0751385])

In [53]:
# Second example customer; this rebinds `customer`, replacing the earlier example record.
customer = {
    'gender': 'female',
    'seniorcitizen': 1,
    'partner': 'no',
    'dependents': 'no',
    'phoneservice': 'yes',
    'multiplelines': 'yes',
    'internetservice': 'fiber_optic',
    'onlinesecurity': 'no',
    'onlinebackup': 'no',
    'deviceprotection': 'no',
    'techsupport': 'no',
    'streamingtv': 'yes',
    'streamingmovies': 'no',
    'contract': 'month-to-month',
    'paperlessbilling': 'yes',
    'paymentmethod': 'electronic_check',
    'tenure': 1,
    'monthlycharges': 85.7,
    'totalcharges': 85.7
}


In [54]:
X_test = dv.transform([customer])


In [57]:
X_test

array([[ 1. ,  0. ,  0. ,  1. ,  0. ,  1. ,  0. ,  0. ,  1. ,  0. ,  0. ,
         1. ,  0. , 85.7,  0. ,  0. ,  1. ,  1. ,  0. ,  0. ,  1. ,  0. ,
         0. ,  0. ,  1. ,  1. ,  0. ,  0. ,  0. ,  1. ,  0. ,  0. ,  1. ,
         1. ,  1. ,  0. ,  0. ,  0. ,  0. ,  1. ,  1. ,  0. ,  0. ,  1. ,
        85.7]])

In [58]:
model.predict_proba(X_test)[:,1]

array([0.8085141])