In [213]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px 
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline

data_df = pd.read_csv("data/WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [214]:
def dataoveriew(df):
    print("Overview of data set:\n")
    print('Number of rows: ', df.shape[0])
    print("\nNumber of features:", df.shape[1])
    print("\nData Features:")
    print(df.columns.tolist())
    print("\nMissing values:", df.isnull().sum().values.sum())
    print("\nUnique values:")
    print(df.nunique())

In [215]:
#Necessary Preprocessing steps before visualization. Combining users that dont have internet service with customers that didnt subscribe to a particular service.
replace_dict = {'No internet service': 'No',
                'No phone service': 'No'}

# Replace values in all columns using the replace() method
data_df.replace(replace_dict, inplace=True)

data_df = data_df.drop(data_df[data_df['InternetService'] == 'No'].index)

dataoveriew(data_df)

Overview of data set:

Number of rows:  5517

Number of features: 21

Data Features:
['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']

Missing values: 0

Unique values:
customerID          5517
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                73
PhoneService           2
MultipleLines          2
InternetService        2
OnlineSecurity         2
OnlineBackup           2
DeviceProtection       2
TechSupport            2
StreamingTV            2
StreamingMovies        2
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1494
TotalCharges        5259
Churn                  2
dtype: int64


-----------------------------------------------------
<h1>Visually exploring the data set</h1>

In [216]:
def bar(feature):
    temp_df = data_df.groupby([feature, "Churn"]).size().reset_index()
    temp_df = temp_df.rename({0: "count"}, axis=1)
    fig = px.bar(temp_df, 
                 y="count", 
                 x=feature, 
                 color="Churn", 
                 barmode='group', 
                 color_discrete_sequence=['Green', 'Red'],
                 title=f"Analysis of {feature}",
                 height=400,
                 width=700)
    fig.show()


In [217]:
data_df.loc[data_df["SeniorCitizen"]==0, "SeniorCitizen"] = "No"
data_df.loc[data_df["SeniorCitizen"]==1, "SeniorCitizen"] = "Yes"



Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value 'No' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.



<h3>Demographic Visualization</h3>

In [218]:
target_instance = data_df["Churn"].value_counts().to_frame().reset_index()
print(target_instance)
fig = px.pie(target_instance, values='count', names='Churn', color_discrete_sequence=["green", "red"],
             title='Distribution of Churn')
fig.show()

  Churn  count
0    No   3761
1   Yes   1756


In [219]:
fig1 = bar("gender")
fig2 = bar("SeniorCitizen")
fig3 = bar("Partner")
fig4 = bar("Dependents")


<h3>Insight on Demographic charts: </h3>

For male and female customers, the proportion of churn is slightly higher in females, but the slight difference can be ignored.
<br>Our analysis also shows that a higher proportion of younger citizens are more likely to churn compared to seniors. The same case is true for citizens without a partner. <br>Citizens without dependents are also more likely to churn as seen in the chart above.
    
---------------------------------------------------------------------------------------------------------

<h3>Now for the analysis on customers based on the services they signed up for: </h3>

In [220]:
bar('PhoneService')
bar('MultipleLines')
bar('InternetService')
bar('OnlineSecurity')
bar('OnlineBackup')
bar('DeviceProtection')
bar('TechSupport')
bar('StreamingTV')
bar('StreamingMovies')

<h3>Insight on customer services charts: </h3>
        We can see from the charts that customers that subscribed to a phone service are more likely to churn. 
    While the customers that subscribed to multiple phone lines and those that didn't have similar rates of churning, so this might not be a good predictor of churn. <br><br>
    Customers that are subscribed to a fiber optic internet service are also more likely to churn; this could be a consequence of the higher price rates for fiber optic internet. <br><br>
    Customers that <b>didn't</b> subscribe to online security services, online backup services, device protection services, or tech support services are also more likely to churn. <br><br>

-----------------------------------------------------------------------------------------------------------------------------

<h3>Analysis on Payment methods:</h3>

In [221]:
bar("PaperlessBilling")
bar("PaymentMethod")
bar("Contract")

<h3>Insight on the payment information charts</h3>

The charts let us know that customers with paperless billing are more likely to churn than customers without. <br><br>
And as for the payment methods, the churn rate among all payment records is relatively low, <b>except</b> customers that use electronic checks to pay for services. <br><br>
And as for contract length, the chart shows higher churn rates with customers that opt for month-to-month payments, and very low churn rates for customers that have opted for more extended and lower payment frequency. Noting this, companies might want to consider encouraging customers to build longer term relationships with them.

------------------------------------------------------------------------------------------------------------

<h3>Now for analysis on numerical features</h3>

In [222]:
data_df.TotalCharges = pd.to_numeric(data_df.TotalCharges, errors="coerce")
data_df.TotalCharges = data_df.TotalCharges.fillna(data_df.TotalCharges.median())

In [223]:
def hist(feature):
    group_df = data_df.groupby([feature, 'Churn']).size().reset_index()
    group_df = group_df.rename(columns={0: 'Count'})
    fig = px.histogram(group_df, x=feature, y='Count', color='Churn', marginal='box', title=f'Churn rate frequency to {feature} distribution', color_discrete_sequence=["green", "red"])
    fig.show()

In [224]:
hist("tenure")
hist("MonthlyCharges")
hist("TotalCharges")

<h3>Insight on the numerical charts:</h3>
    For the tenure histogram, it is observed that the majority of customers that left did so in the first 0-9 months of being with the company.<br>
    And for the monthly charges histogram, the higher the charge, the more the customers that left the company. A better solution to retaining customers, therefore, would be customer discounts or loyalty benefits.<br>
    
-------------------------------------

<h3>Data preprocessing: </h3>

In [225]:
data_df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,No,Yes,No,1,No,No,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,No,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,No,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,No,No,No,45,No,No,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,No,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [226]:
data_df = data_df.drop("customerID", axis=1)

def binaryMap(feature):
    return feature.map({"Yes": 1, "No":0})

binary_features_list = ['SeniorCitizen', 'Partner', 'Dependents','PhoneService', 'MultipleLines','OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies','PaperlessBilling', 'Churn']
data_df[binary_features_list] = data_df[binary_features_list].apply(binaryMap)
data_df['gender'] = data_df['gender'].map({'Female':0, 'Male':1})


data_df = pd.get_dummies(data_df, dtype=int)

#Normalizing numerical features
sc = MinMaxScaler()
#sc2 = MinMaxScaler()
#sc3 = MinMaxScaler()

data_df[['tenure', 'MonthlyCharges', 'TotalCharges']] = sc.fit_transform(data_df[['tenure', 'MonthlyCharges', 'TotalCharges']])


In [227]:
corr = data_df.corr()
fig = px.imshow(corr, height=1000, width=1000)
fig.show()

Training with different models

In [228]:
# Import Machine learning algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

#Import metric for performance evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

#Split data into train and test sets
from sklearn.model_selection import train_test_split
X = data_df.drop('Churn', axis=1)
y = data_df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

#Defining the modelling function
def modeling(alg, alg_name, params={}):

    model = alg(**params) #Instantiating the algorithm class and unpacking parameters if any
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    #Performance evaluation
    def print_scores(alg, y_test, y_pred):
        print(alg_name)
        acc_score = accuracy_score(y_test, y_pred)
        print("accuracy: ",acc_score)
        pre_score = precision_score(y_test, y_pred)
        print("precision: ",pre_score)
        rec_score = recall_score(y_test, y_pred)
        print("recall: ",rec_score)
        f_score = f1_score(y_test, y_pred, average='weighted')
        print("f1_score: ",f_score, "\n\n")

    print_scores(alg, y_test, y_pred)
    return model

# Running logistic regression model
log_model = modeling(LogisticRegression, "Logistic Regression")

Logistic Regression
accuracy:  0.7608695652173914
precision:  0.6148936170212767
recall:  0.5734126984126984
f1_score:  0.7584345892372283 




Previous cell was a test run. <br>
Now it's time for feature selection using Recursive Feature Elimination

In [229]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
log = LogisticRegression()
rfe = RFECV(estimator=log, cv=StratifiedKFold(10, random_state=50, shuffle=True), scoring="accuracy")
rfe.fit(X, y)
print(f"Optimum number of features is : {rfe.n_features_}")

Optimum number of features is : 17


In [230]:
X_rfe = X.iloc[:, rfe.support_]
X_rfe.columns.tolist()

['SeniorCitizen',
 'Dependents',
 'tenure',
 'PhoneService',
 'MultipleLines',
 'OnlineSecurity',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'PaperlessBilling',
 'MonthlyCharges',
 'TotalCharges',
 'InternetService_DSL',
 'InternetService_Fiber optic',
 'Contract_Month-to-month',
 'Contract_Two year',
 'PaymentMethod_Electronic check']

In [231]:
X_train, X_test, y_train, y_test = train_test_split(X_rfe, y, test_size=0.3)

In [232]:
#Logistic Regression
modeling(LogisticRegression, "Logistic Regression")

Logistic Regression
accuracy:  0.7608695652173914
precision:  0.6399082568807339
recall:  0.5386100386100386
f1_score:  0.7547503793832595 




In [233]:
modeling(SVC, "Support Vector Classification")

Support Vector Classification
accuracy:  0.7475845410628019
precision:  0.6201923076923077
recall:  0.4980694980694981
f1_score:  0.7392151359079605 




In [234]:
modeling(RandomForestClassifier, "Random Forest")

Random Forest
accuracy:  0.7421497584541062
precision:  0.6096385542168675
recall:  0.48841698841698844
f1_score:  0.733498745172169 




In [235]:
modeling(DecisionTreeClassifier, "Decision Tree")

Decision Tree
accuracy:  0.6733091787439613
precision:  0.4772277227722772
recall:  0.46525096525096526
f1_score:  0.6721611240764064 




In [236]:
modeling(GaussianNB, "Gaussian Naive Bayes")

Gaussian Naive Bayes
accuracy:  0.7131642512077294
precision:  0.5276705276705277
recall:  0.7915057915057915
f1_score:  0.7234317480162579 




The best performing model: Logistic Regression model (According to my results, by the way). <br> But it could be better. So we are going to improve it by hyperparameter tuning.

In [237]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold, RandomizedSearchCV
from scipy.stats import loguniform

# List of solvers for logistic regression models
solvers = ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']

# Dictionary mapping each solver to its compatible penalties
solver_penalties = {
    'liblinear': ['l1', 'l2'],
    'newton-cg': ['l2'],
    'lbfgs': ['l2'],
    'sag': ['l2'],
    'saga': ['l1', 'l2', 'elasticnet']
}

# Get compatible penalties for all solvers
compatible_penalties = list(set.intersection(*[set(penalties) for penalties in solver_penalties.values()]))

# define evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# define search space
space = {
    'solver': solvers,
    'penalty': compatible_penalties,
    'C': loguniform(1e-5, 1000)
}

# define search
model = LogisticRegression()
search = RandomizedSearchCV(model, space, n_iter=500, scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)

# execute search
result = search.fit(X_rfe, y)

# summarize result
params = result.best_params_
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % params)


Best Score: 0.7718523413731729
Best Hyperparameters: {'C': 8.752228362092895, 'penalty': 'l2', 'solver': 'saga'}


In [238]:
print(params)

{'C': 8.752228362092895, 'penalty': 'l2', 'solver': 'saga'}


In [239]:
log_model = modeling(LogisticRegression, "Logistic Regression",params=params)

Logistic Regression
accuracy:  0.7614734299516909
precision:  0.6394557823129252
recall:  0.5444015444015444
f1_score:  0.7558003391852507 





The max_iter was reached which means the coef_ did not converge



In [240]:
import joblib

model = "pickle_files/Logmodel.save"
scaler = "pickle_files/scaler.save"
joblib.dump(log_model, model)
joblib.dump(sc, scaler)

['pickle_files/scaler.save']