In [1]:
#importing all libraries (pandas,numpy and scikit learn)
import numpy as np
import pandas as pd
import sklearn as sk

In [2]:
#read input file as dataframe
df = pd.read_csv("churn.csv")

In [3]:
#set data view options to see the entire dataset 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows',None)
df.head(4)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No


In [5]:
#check data types of all columns
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [7]:
#since column TotalCharge is object type as the numerical values are stored as string, convert the values to numeric values
df['TotalCharges']=pd.to_numeric(df['TotalCharges'], errors='coerce')

In [8]:
#check for null values in the data frame.
df.isnull().sum()
#There are 11 null values in column TotalCharge


customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [9]:
#replace nan with average Total charge 
df['TotalCharges']=df['TotalCharges'].fillna(df['TotalCharges'].mean()).round(3)

In [36]:
#Checking datatypes again, total charges type should change 
df.dtypes

gender                                     int64
SeniorCitizen                              int64
Partner                                    int64
Dependents                                 int64
tenure                                     int64
PhoneService                               int64
PaperlessBilling                           int64
MonthlyCharges                           float64
TotalCharges                             float64
Churn                                      int64
MultipleLines_No phone service             uint8
MultipleLines_Yes                          uint8
InternetService_Fiber optic                uint8
InternetService_No                         uint8
OnlineSecurity_No internet service         uint8
OnlineSecurity_Yes                         uint8
OnlineBackup_No internet service           uint8
OnlineBackup_Yes                           uint8
DeviceProtection_No internet service       uint8
DeviceProtection_Yes                       uint8
TechSupport_No inter

In [11]:
#Drop column customer id as it has no importance for our model
df.drop(columns=['customerID'],inplace=True)

In [12]:
#check the unique values in every column to find out which of those are nominal and which are ordinal categorical columns for encoding 
df.MultipleLines.unique()
df.	InternetService.unique()
df.OnlineSecurity.unique()
df.OnlineBackup.unique()
df.DeviceProtection.unique()
df.TechSupport.unique()
df.StreamingTV.unique()
df.StreamingMovies.unique()
df.Contract.unique()
df.PaperlessBilling.unique()
df.PaymentMethod.unique()
df.PhoneService.unique()

array(['No phone service', 'No', 'Yes'], dtype=object)

In [24]:
#convert nominal variable categorical columns with just 2 categories to numbers  
df['gender']=df.gender.apply(lambda x : 0 if x=='Male' else 1)
df['Partner']=df.Partner.apply(lambda x :0 if x=='No' else 1)
df['Dependents']=df.Dependents.apply(lambda x : 0 if x=='No' else 1)
df['PhoneService']=df.PhoneService.apply(lambda x: 0 if x=='No' else 1)
df['PaperlessBilling']=df.PaperlessBilling.apply(lambda x: 0 if x=='No' else 1)
df['Churn']=df.Churn.apply(lambda x:0 if x=='No' else 1)

In [26]:
#make dummy of nominal categorical variables with more than 2 categories
df=pd.get_dummies(df,columns=['MultipleLines','InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaymentMethod'],drop_first=True)

In [27]:
#separating independent and dependent variables
x = df.drop(labels = ["Churn"],axis = 1)
y = df['Churn'].values


array([0, 0, 1, ..., 0, 1, 0], dtype=int64)

In [28]:
#Train Test split for training model for prediction
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.25,random_state=0)

In [30]:
#fitting data with logistic regression
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [31]:
#predicting churn or no churn 
y_pred= model.predict(x_test)

In [32]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

In [35]:
# Print the prediction accuracy
from sklearn import metrics
print (metrics.accuracy_score(y_test, y_pred))

0.797274275979557
