In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import seaborn as sns
from sklearn import set_config
# Ask scikit-learn to render estimators as diagrams when they are displayed in the notebook
set_config(display='diagram')

%matplotlib inline

In [None]:
# Load the raw customer-churn CSV from the working directory and preview the first rows
df = pd.read_csv(filepath_or_buffer='WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.head(n=5)

#### Preparing and cleaning data

In [None]:
# Standardize text formats: lower-case everything and use underscores instead of spaces,
# first in the column names and then in the values of every object-typed column.
df.columns = df.columns.str.lower().str.replace(' ', '_')
categCols = df.select_dtypes(include='object').columns.to_list()

for col in categCols:
    lowered = df[col].str.lower()
    df[col] = lowered.str.replace(' ', '_')
df.head()

In [None]:
# Correct values and type of variable totalcharges

# errors='coerce' turns every entry that cannot be parsed as a number into NaN
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce')

# Entries that couldn't be converted to numerical values show up as NaN
print(f'Null values for column totalcharges: {df.totalcharges.isnull().sum()}')

# Fill missing values with zero.
# The result is assigned back to the column instead of calling
# `df.totalcharges.fillna(0, inplace=True)`: an in-place call on a Series obtained
# through chained access emits a FutureWarning on pandas 2.x and does not modify
# `df` at all once Copy-on-Write is active, which would leave the NaNs in place.
df['totalcharges'] = df['totalcharges'].fillna(0)
print(f'Null values for column totalcharges after converting to numerical and fillin NAs: {df.totalcharges.isnull().sum()}')

In [None]:
# Cast seniorcitizen to bool and then to object dtype so that dtype-based column
# selection treats it as a categorical feature rather than a numerical one.
# NOTE(review): assumes the column is loaded as a 0/1 flag - confirm against the CSV.
df['seniorcitizen'] = df['seniorcitizen'].astype(bool).astype(object)

In [None]:
# Transposed preview: each column of the first rows is shown as a row
df.head().transpose()

In [None]:
# Build one summary record per column: its name, dtype and distinct values
rows = [[col, df[col].dtype, df[col].unique()] for col in df.columns]
pd.DataFrame(rows, columns=['Feature', 'Type', 'Unique Values'])

In [None]:
# Separate the label column from the feature columns
targetCol = 'churn'
data = df.drop(columns=[targetCol])
target = df[targetCol]

In [None]:
# Getting numerical and categorical columns

from sklearn.compose import make_column_selector as selector

# Selectors that pick columns by dtype: object columns are treated as categorical,
# everything else as numerical.
ctgColSelector = selector(dtype_include=object)
numColSelector = selector(dtype_exclude=object)

categoricalCols = ctgColSelector(data)
numericalCols = numColSelector(data)

# Discard the first object-typed column from the categorical features.
# NOTE(review): presumably this is the customer identifier; the removal is positional
# and therefore depends on column order - consider removing it by name instead.
del categoricalCols[0]

In [None]:
# Show which columns ended up in each group, one list per line
print(numericalCols, categoricalCols, sep='\n')

In [None]:
# Creating preprocessors

from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Scaler intended for the numerical columns
numPreprocessor = StandardScaler()
# Encoder for the categorical columns; categories not seen during fit are ignored
# at transform time instead of raising an error
catPreprocessor = OneHotEncoder(handle_unknown='ignore')

In [None]:
# Transforming the data

from sklearn.compose import ColumnTransformer

# One-hot encode the categorical columns and standardise the numerical ones.
# Previously the numerical columns were passed through untouched and the scaler
# created for them was never used; feeding unscaled features to a regularised
# logistic regression slows convergence and penalises coefficients unevenly.
# The two column lists together cover every column handed to the model, so no
# `remainder` handling is needed.
preprocessor = ColumnTransformer([
    ('one-hot-encoder', catPreprocessor, categoricalCols),
    ('standard_scaler', numPreprocessor, numericalCols)])

In [None]:
# Creating the model

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Chain the column preprocessing with a logistic-regression classifier so that
# fitting and predicting always apply the same transformations
model = make_pipeline(
    preprocessor,
    LogisticRegression(max_iter=500),
)

In [None]:
# Splitting the data: hold out 20% of the rows as the test set

from sklearn.model_selection import train_test_split

allColumns = [*numericalCols, *categoricalCols]
dataTrainFull, dataTest, targetTrainFull, targetTest = train_test_split(
    data[allColumns],
    target,
    random_state=1,  # fixed seed so the split is reproducible
    test_size=0.2,
)

print(f'{len(dataTrainFull)} {len(dataTest)}')

In [None]:
# Splitting the full training set again into train and validation parts
# (0.25 of the remaining 80% is 20% of all rows)
dataTrain, dataVal, targetTrain, targetVal = train_test_split(
    dataTrainFull,
    targetTrainFull,
    random_state=1,
    test_size=0.25,
)

print(f'{len(dataTrain)} {len(dataVal)} {len(dataTest)}')

In [None]:
# Fit on the training part (the fitted pipeline is bound to `_` so it is not echoed),
# then preview the first five predictions for the validation part
_ = model.fit(X=dataTrain, y=targetTrain)
model.predict(X=dataVal)[:5]

In [None]:
# Actual labels of the first five validation rows, for comparison with the predictions
targetVal.iloc[:5].values

In [None]:
# Score of the fitted pipeline on the validation part
model.score(X=dataVal, y=targetVal)

In [None]:
# Refit the same pipeline on the full training set (train + validation),
# then preview the first five predictions for the held-out test set
_ = model.fit(X=dataTrainFull, y=targetTrainFull)
model.predict(X=dataTest)[:5]

In [None]:
# Score of the refitted pipeline on the held-out test set
model.score(X=dataTest, y=targetTest)