In [56]:
# Import necessary libraries
import pickle

import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the customer churn data; the relative path is resolved against the working directory
customers = pd.read_csv(filepath_or_buffer='customers.csv')

In [58]:
# Preview the first five rows of the raw data
customers.head(n=5)

Unnamed: 0,customerID,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### What features are most likely associated with churn?

In [60]:
# Prepare the data for the correlation analysis: remove the identifier column and make
# every remaining column numeric.

# customerID is an identifier column (see the preview above), not a predictor.
# errors='ignore' keeps this cell re-runnable once the column has already been removed.
customers = customers.drop(columns=['customerID'], errors='ignore')

# Encode the target as 0/1. The guard stops a second run of this cell from mapping the
# already-numeric values to NaN (0 and 1 are not keys of the mapping).
if not pd.api.types.is_numeric_dtype(customers['churn']):
    customers['churn'] = customers['churn'].map({'No': 0, 'Yes': 1})

# .map() turns any label other than 'No'/'Yes' into NaN; fail here rather than downstream.
assert customers['churn'].notna().all(), "unexpected label in 'churn' (expected 'No' or 'Yes')"

# One-hot encode the remaining categorical columns, so no artificial ordering is imposed on
# nominal categories. drop_first=True drops the first level of each feature so that its
# dummy columns are not perfectly collinear.
customers_encoded = pd.get_dummies(customers, drop_first=True)

In [68]:
# Preview the first five rows of the one-hot encoded frame
customers_encoded.head(n=5)


Unnamed: 0,seniorcitizen,tenure,monthlycharges,totalcharges,churn,gender_Male,partner_Yes,dependents_Yes,phoneservice_Yes,multiplelines_No phone service,...,streamingtv_No internet service,streamingtv_Yes,streamingmovies_No internet service,streamingmovies_Yes,contract_One year,contract_Two year,paperlessbilling_Yes,paymentmethod_Credit card (automatic),paymentmethod_Electronic check,paymentmethod_Mailed check
0,0,1,29.85,29.85,0,False,True,False,False,True,...,False,False,False,False,False,False,True,False,True,False
1,0,34,56.95,1889.5,0,True,False,False,True,False,...,False,False,False,False,True,False,False,False,False,True
2,0,2,53.85,108.15,1,True,False,False,True,False,...,False,False,False,False,False,False,True,False,False,True
3,0,45,42.3,1840.75,0,True,False,False,False,True,...,False,False,False,False,True,False,False,False,False,False
4,0,2,70.7,151.65,1,False,False,False,True,False,...,False,False,False,False,False,False,True,False,True,False


In [74]:
# Pearson correlation of every encoded column with the churn flag,
# ranked from the most positive coefficient to the most negative
corr_matrix = customers_encoded.corr()
correlations = corr_matrix['churn'].sort_values(ascending=False)

# Show the ranked coefficients
print(correlations)

churn                                    1.000000
internetservice_Fiber optic              0.307463
paymentmethod_Electronic check           0.301455
monthlycharges                           0.192858
paperlessbilling_Yes                     0.191454
seniorcitizen                            0.150541
streamingtv_Yes                          0.063254
streamingmovies_Yes                      0.060860
multiplelines_Yes                        0.040033
phoneservice_Yes                         0.011691
gender_Male                             -0.008545
multiplelines_No phone service          -0.011691
deviceprotection_Yes                    -0.066193
onlinebackup_Yes                        -0.082307
paymentmethod_Mailed check              -0.090773
paymentmethod_Credit card (automatic)   -0.134687
partner_Yes                             -0.149982
dependents_Yes                          -0.163128
techsupport_Yes                         -0.164716
onlinesecurity_Yes                      -0.171270


So, based on this, fiber-optic internet service and paying by electronic check show the strongest positive association with churn (about 0.30 each), followed by monthly charges and paperless billing (about 0.19 each).
However, 0.3 is only a moderate correlation, not a strong one, so there are certainly other factors affecting churn, which we will explore later.


### Hypothesis Testing to validate monthly charges affecting churn.

In my SQL script, I found that customers with higher monthly charges churned more. To solidify this association,
I will perform a hypothesis test to determine whether the correlation between monthly charges and churn could be due to chance or whether it is statistically significant. We will use Pearson's correlation test for this. The test tells us whether there is a linear relationship between the two variables and whether that relationship is significant. The result is the correlation coefficient and the p-value. The p-value tells us whether the relationship is statistically significant; the coefficient tells us how strong it is.

Null Hypothesis: The null hypothesis will be a statement that states that there is no significance between the two variables or simply that the relationship is merely due to chance. Therefore, our null hypothesis is: There is no significant correlation between the two variables(monthly charges and churn).

Alternative Hypothesis: The alternative hypothesis basically does the opposite and states that there is a significance between the two variables and their relationship is not due to chance. Therefore, our alternative hypothesis is: There is a significant correlation between the two variables.

Now, as I said earlier, the p-value drives the decision. If the p-value is less than the significance level (0.05), we reject the null hypothesis. If the p-value is greater than the significance level, we fail to reject the null hypothesis; note that this does not prove the null hypothesis is true, only that the data do not provide enough evidence against it.

Let us see what the p value is.

In [15]:
# Series under test. `customers['churn']` must already be the 0/1 encoding
# produced by the encoding cell earlier in the notebook.
mont_charges, churn = customers['monthlycharges'], customers['churn']

# Pearson's r and its p-value
corr, p_value = pearsonr(mont_charges, churn)

# Report both statistics
print("Correlation coefficient:", corr)
print("P-value:", p_value)


Correlation coefficient: 0.19285821847007886
P-value: 6.760843118056653e-60


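This is a very small p-value. Since it is far below the significance level of 0.05, we can reject the null hypothesis: there is a statistically significant relationship between monthlycharges and churn, and it is very unlikely to be due to chance alone. Note, however, that the coefficient of 0.19 indicates a weak positive relationship, so statistical significance here does not imply a strong effect.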

Therefore, customers with higher monthly charges are at higher risk of churning. I would recommend reviewing the pricing of services to ensure it is reasonable for all customers, because customers may be leaving because of high monthly charges. Lowering prices may therefore retain some customers and reduce the churn rate. 
Also, conduct a customer survey to find out whether customers are churning because the price does not match the services being offered.

### Let's now predict customer churn using Logistic Regression.

In [76]:
# Build the model inputs.
# Three dummy columns are left out: gender_Male and phoneservice_Yes have near-zero
# correlation with churn in the correlation output above, and
# 'multiplelines_No phone service' mirrors phoneservice_Yes (same magnitude, opposite sign).
# customerID was already removed before encoding.
# NOTE(review): totalcharges is still kept as a feature. If it is as strongly correlated
# with tenure as expected, consider dropping it as well to limit multicollinearity — TODO confirm.
to_drop = ['phoneservice_Yes', 'multiplelines_No phone service', 'gender_Male']

# Derive the features from customers_encoded without reassigning it, so this cell can be
# re-run safely (dropping the same columns a second time would raise a KeyError).
features = customers_encoded.drop(columns=to_drop + ['churn'])
target = customers_encoded['churn']

In [90]:


# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=42,
)

# Baseline logistic regression, with max_iter raised above scikit-learn's default.
# fit() returns the fitted estimator itself, so construction and fitting are chained.
logreg = LogisticRegression(max_iter=2300).fit(X_train, y_train)

# Predict the held-out rows and report per-class precision / recall / F1
y_pred = logreg.predict(X_test)
print(classification_report(y_test, y_pred))

[2254]
              precision    recall  f1-score   support

           0       0.84      0.89      0.86      1549
           1       0.63      0.53      0.58       561

    accuracy                           0.79      2110
   macro avg       0.74      0.71      0.72      2110
weighted avg       0.79      0.79      0.79      2110



Nice! So after printing out the classification report, we are interested in the precision and recall mostly.

The accuracy for our model is 0.79. This means that our model correctly predicts whether a customer will churn or not and is correct about 79% of the time.

The precision for churners (class 1) is 0.63. This means that when the model predicts that a customer will churn, it is correct 63% of the time.
The precision for non-churners (class 0) is 0.84. This means that when the model predicts that a customer won't churn, it is correct 84% of the time.

The recall for YES is 0.53. This means that the model correctly identifies 53% of the customers who actually churned.
The recall for NO is 0.89. This means that the model correctly identifies 89% of the customers who did not churn.

79% accuracy sounds good. However, it is more important to focus on precision and recall and on improving these metrics. I'm more interested in identifying churners, and identifying them correctly, in order to prevent them from cancelling their contracts and to save the company money. Looking at the support, the dataset has more non-churners than churners, so the model might be predicting non-churners more often. Since there is a class imbalance, I'll refit the model and tell it to pay more attention to churners. 


### Optimizing the model to improve metrics

#### Balancing class weights

In [114]:
# Same 70/30 split as the baseline (identical seed, so identical rows)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=42,
)

# class_weight='balanced' weights each class inversely to its frequency, so errors on the
# minority class (churners) count more; max_iter is raised above scikit-learn's default.
# fit() returns the fitted estimator itself, so construction and fitting are chained.
logreg = LogisticRegression(max_iter=3000, class_weight='balanced').fit(X_train, y_train)

# Predict the held-out rows and report per-class precision / recall / F1
y_pred = logreg.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.72      0.80      1549
           1       0.51      0.81      0.62       561

    accuracy                           0.74      2110
   macro avg       0.71      0.76      0.71      2110
weighted avg       0.80      0.74      0.75      2110



This is exactly what I was looking for: the recall improved significantly, which means my model is now catching 81% of actual churners. Precision took a hit and has decreased. However, that's perfectly fine, because the business is better off predicting a non-churner as a churner than missing an actual churner and losing money when that customer leaves. Therefore, having 0.81 recall is great. Let's see how I can improve precision without killing recall, though. The default threshold is 0.5, so the model can be a little stricter about who is classified as a churner. Increasing the threshold should improve precision. 

#### Adjusting classification threshold

In [142]:

# Predicted probability of the positive class (churn = 1) for each held-out customer
y_probs = logreg.predict_proba(X_test)[:, 1]

# Decision cut-off applied instead of the default 0.5
threshold = 0.57

# Flag a customer as a churner (1) when the churn probability reaches the cut-off, else 0
y_pred_custom = np.greater_equal(y_probs, threshold).astype(int)

print(classification_report(y_test, y_pred_custom))


              precision    recall  f1-score   support

           0       0.90      0.77      0.83      1549
           1       0.55      0.76      0.64       561

    accuracy                           0.77      2110
   macro avg       0.72      0.77      0.73      2110
weighted avg       0.80      0.77      0.78      2110



Based on the new classification report, the model's precision did improve slightly after raising the classification threshold. I can accept a 0.55 precision and a 0.76 recall as a good trade-off. Of course, the previous model (default threshold) catches more actual churners, but it also flags more customers who didn't actually churn; that is acceptable, because we'd rather catch actual churners than let them slip by and have the company lose money. However, I wanted to see how much more precise the model could get at identifying churners: precision went from 51% to 55%, while recall dropped from 81% to 76%, which is still solid. This is a nice trade-off. This final model provides balance and it is quite interpretable.  

#### Examining Model Coefficients

In [145]:

# Label each fitted coefficient with the name of its training column
coef_ = pd.Series(data=logreg.coef_[0], index=X_train.columns)

# Order from the most negative (lowers churn odds) to the most positive (raises churn odds)
coef_sorted = coef_.sort_values(ascending=True)

print(coef_sorted)


contract_Two year                       -1.472436
contract_One year                       -0.844823
onlinesecurity_Yes                      -0.271532
dependents_Yes                          -0.247799
techsupport_Yes                         -0.234874
deviceprotection_No internet service    -0.203980
streamingmovies_No internet service     -0.203980
streamingtv_No internet service         -0.203980
techsupport_No internet service         -0.203980
internetservice_No                      -0.203980
onlinesecurity_No internet service      -0.203980
onlinebackup_No internet service        -0.203980
paymentmethod_Mailed check              -0.126880
paymentmethod_Credit card (automatic)   -0.116945
tenure                                  -0.058363
monthlycharges                          -0.025282
totalcharges                             0.000326
onlinebackup_Yes                         0.047259
deviceprotection_Yes                     0.061591
partner_Yes                              0.134994


Based on the model coefficients, having a two-year contract reduces the likelihood of churn the most: customers with longer contracts tend to stay longer. On the other hand, customers with fiber-optic internet service are the most likely to churn. 

#### Preparing for deployment

In [151]:
# Persist the fitted class-weighted model for deployment (pickle is imported in the
# top-of-notebook import cell).
# NOTE: only the estimator is saved. The custom 0.57 decision threshold chosen earlier is
# not part of the pickle, so a consumer calling `.predict()` gets the default 0.5 cut-off
# and must apply the threshold to `.predict_proba()` itself.
# Only unpickle this file from a trusted source: `pickle.load` can execute arbitrary code.
with open("churn_model.pkl", "wb") as f:
    pickle.dump(logreg, f)


#### Final Thoughts


A good next step is to try other classification models, especially tree based models such as RandomForest.

In [148]:
# Export 20 randomly drawn held-out feature rows (fixed seed), without the index column
sample_customers = X_test.sample(n=20, random_state=42)
sample_customers.to_csv("sample_customers.csv", index=False)