In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import seaborn as sns
from sklearn import set_config
set_config(display='diagram')

%matplotlib inline

In [2]:
# Read the data
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


#### Peparing and cleaning data

In [3]:
# Standarize data formats
df.columns = df.columns.str.lower().str.replace(' ','_')
categCols = df.select_dtypes('object').columns.to_list()

for col in categCols:
    df[col] = df[col].str.lower().str.replace(' ','_')
df.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-vhveg,female,0,yes,no,1,no,no_phone_service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,no
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one_year,no,mailed_check,56.95,1889.5,no
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,yes
3,7795-cfocw,male,0,no,no,45,no,no_phone_service,dsl,yes,...,yes,yes,no,no,one_year,no,bank_transfer_(automatic),42.3,1840.75,no
4,9237-hqitu,female,0,no,no,2,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,70.7,151.65,yes


In [4]:
# Correct values and type of variable totalcharges

df.totalcharges = pd.to_numeric(df.totalcharges, errors='coerce')

# Characters that couldn't be converted to numerical values show as nan
print(f'Null values for column totalcharges: {df.totalcharges.isnull().sum()}')

# Filling missing values with zero
df.totalcharges.fillna(0, inplace=True)
print(f'Null values for column totalcharges after converting to numerical and fillin NAs: {df.totalcharges.isnull().sum()}')

Null values for column totalcharges: 11
Null values for column totalcharges after converting to numerical and fillin NAs: 0


In [6]:
# Make seniorcitizen an object type variable
df.seniorcitizen = df.seniorcitizen.astype(bool).astype(object)

In [7]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,False,False,False,False,False
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [8]:
rows = []
for col in df.columns:
    rows.append([col,df[col].dtype, df[col].unique()]) 
pd.DataFrame(rows, columns=['Feature', 'Type', 'Unique Values'])

Unnamed: 0,Feature,Type,Unique Values
0,customerid,object,"[7590-vhveg, 5575-gnvde, 3668-qpybk, 7795-cfoc..."
1,gender,object,"[female, male]"
2,seniorcitizen,object,"[False, True]"
3,partner,object,"[yes, no]"
4,dependents,object,"[no, yes]"
5,tenure,int64,"[1, 34, 2, 45, 8, 22, 10, 28, 62, 13, 16, 58, ..."
6,phoneservice,object,"[no, yes]"
7,multiplelines,object,"[no_phone_service, no, yes]"
8,internetservice,object,"[dsl, fiber_optic, no]"
9,onlinesecurity,object,"[no, yes, no_internet_service]"


In [9]:
targetCol = 'churn'
target = df[targetCol]
data = df.drop(columns=[targetCol])

In [10]:
# Getting numerical and categorical columns

from sklearn.compose import make_column_selector as selector

numColSelector = selector(dtype_exclude=object)
ctgColSelector = selector(dtype_include=object)

numericalCols = numColSelector(data)
categoricalCols = ctgColSelector(data)
#del numericalCols[0]
del categoricalCols[0]

In [11]:
print(numericalCols)
print(categoricalCols)

['tenure', 'monthlycharges', 'totalcharges']
['gender', 'seniorcitizen', 'partner', 'dependents', 'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod']


In [12]:
# creating preprocesors

from sklearn.preprocessing import OneHotEncoder, StandardScaler

catPreprocessor = OneHotEncoder(handle_unknown="ignore")
numPreprocessor = StandardScaler()

In [13]:
# Transforming the data

from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([
    ('one-hot-encoder', catPreprocessor, categoricalCols)],remainder="passthrough")
    #('one-hot-encoder', catPreprocessor, categoricalCols),
    #('standard_scaler', numPreprocessor, numericalCols)])

In [14]:
# creating the model

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

model = make_pipeline(preprocessor, LogisticRegression(max_iter=500))

In [15]:
# Splitting the data

from sklearn.model_selection import train_test_split

allColumns = numericalCols + categoricalCols
dataTrainFull, dataTest, targetTrainFull, targetTest = train_test_split(
    data[allColumns], target, test_size=0.2, random_state=1)

dataTrain, dataVal, targetTrain, targetVal = train_test_split(
    dataTrainFull, targetTrainFull, test_size=0.25, random_state=1)

print(len(dataTrain), len(dataVal), len(dataTest))

4225 1409 1409


In [16]:
_ = model.fit(dataTrain, targetTrain)
display(targetVal[:5].values)
display(model.score(dataVal, targetVal))

array(['no', 'no', 'no', 'yes', 'no'], dtype=object)

0.8034066713981547

In [17]:
# Let's use the train full dataset and calculate AUC
from sklearn.metrics import auc, roc_auc_score

_ = model.fit(dataTrainFull, targetTrainFull)
targetPred = model.predict_proba(dataTest)[:,1]

auc = roc_auc_score(targetTest, targetPred)
auc

0.8585819601980348

In [26]:
user = dataVal.iloc[100].to_dict()
user

{'tenure': 68,
 'monthlycharges': 70.9,
 'totalcharges': 4911.35,
 'gender': 'female',
 'seniorcitizen': False,
 'partner': 'yes',
 'dependents': 'no',
 'phoneservice': 'yes',
 'multiplelines': 'yes',
 'internetservice': 'dsl',
 'onlinesecurity': 'yes',
 'onlinebackup': 'yes',
 'deviceprotection': 'yes',
 'techsupport': 'yes',
 'streamingtv': 'no',
 'streamingmovies': 'no',
 'contract': 'two_year',
 'paperlessbilling': 'no',
 'paymentmethod': 'credit_card_(automatic)'}

In [34]:
# Transform user input data to be inline with model parameters

def transformInput(user):

    u = pd.DataFrame(user, index=[1])
    xu = u.dtypes[u.dtypes != dataTest.dtypes]
    for i in xu.index:
        u[i] = dataTrain[i].dtype
    return u

In [31]:
u = transformInput(user)
model.predict_proba(u)[0][1]

0.007721290001511296

In [45]:
user2 = dataTest.iloc[280].to_dict()
user2

{'tenure': 3,
 'monthlycharges': 80.35,
 'totalcharges': 253.8,
 'gender': 'male',
 'seniorcitizen': False,
 'partner': 'no',
 'dependents': 'no',
 'phoneservice': 'yes',
 'multiplelines': 'no',
 'internetservice': 'fiber_optic',
 'onlinesecurity': 'no',
 'onlinebackup': 'no',
 'deviceprotection': 'no',
 'techsupport': 'no',
 'streamingtv': 'no',
 'streamingmovies': 'yes',
 'contract': 'month-to-month',
 'paperlessbilling': 'yes',
 'paymentmethod': 'electronic_check'}

In [46]:
u = transformInput(user2)
model.predict_proba(u)[0][1]

0.7745051870464377