In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)           

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import classification_report, mean_squared_error, r2_score

/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv


--- 

# 📊 Customer Churn 🤔 Explainable AI 🤖

This notebook works with Kaggles [Telco Customer Churn dataset](https://www.kaggle.com/datasets/blastchar/telco-customer-churn/code) which focuses on customer retention programs. 

Note that I am working on this dataset as part of [Duke's Explainable AI Specialization](https://www.coursera.org/specializations/explainable-artificial-intelligence-xai?utm_medium=sem&utm_source=gg&utm_campaign=B2C_EMEA__coursera_FTCOF_career-academy_pmax-multiple-audiences-country-multi&campaignid=20858198824&adgroupid=&device=c&keyword=&matchtype=&network=x&devicemodel=&adposition=&creativeid=&hide_mobile_promo&gad_source=1&gclid=CjwKCAiA3Na5BhAZEiwAzrfagD0AmKcyNfdYYSiw0QGmJSw1OCpwp-8ftFFH1ScD1bI3hjLPyVTqixoCiqwQAvD_BwE) on Coursera. It is part of Course 2, Module 1. The tasks are described as: 

1. Exploratory Data Analysis to check Assumptions: Perform an exploratory analysis of the dataset to understand the relationships between different features and the target variable (churn). Use appropriate visualizations and statistical methods to determine whether assumptions about linear, logistic, and GAM models are met. 
2. Linear Regression: Treat the churn variable as a continuous variable (e.g., 0 for staying, 1 for churning) and build a linear regression model to predict churn. Interpret the coefficients and assess the model's performance.
3. Logistic Regression: Treat churn as a binary variable and build a logistic regression model to predict the probability of churn. Interpret the coefficients.
4. Generalized Additive Model (GAM): Build a GAM to model the non-linear relationships between customer features and churn. Interpret the GAM model. 
5. Model Comparison: Compare the performance and interpretability of the different models you built. Discuss the strengths and weaknesses of each approach and provide recommendations for which model(s) the telecommunications company should use to address their customer churn problem.

In [2]:
df = pd.read_csv('/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')
print(f"Rows: {df.shape[0]:,}")
print(f"Cols: {df.shape[1]:,}")
df.head(3)

Rows: 7,043
Cols: 21


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes


In [3]:
print(df.dtypes)

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object


# EDA

Let's start by getting an overview on how balanced the target of our dataset is. 

In [4]:
value_counts = df['Churn'].value_counts()
percentages = (value_counts / value_counts.sum()) * 100
pd.DataFrame({
    'Count': value_counts,
    'Percentage (%)': percentages
})

Unnamed: 0_level_0,Count,Percentage (%)
Churn,Unnamed: 1_level_1,Unnamed: 2_level_1
No,5174,73.463013
Yes,1869,26.536987


The dataset is fairly unbalanced with having only 1 out of four data points being churn and 3 our of 4 being not churn. 

# Preprocessing 

In [5]:
df['churn_binary'] = df['Churn'].map({'Yes': 1, 'No': 0})
df[['Churn', 'churn_binary']].head(5)

Unnamed: 0,Churn,churn_binary
0,No,0
1,No,0
2,Yes,1
3,No,0
4,Yes,1


In [6]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [7]:
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_columns = [x for x in numeric_columns if x != 'churn_binary']
numeric_columns

['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']

In [8]:
# -------------------------- #
# --- Apply LabelEncoder --- #
# -------------------------- #

object_columns = df.select_dtypes(include=['object']).columns
label_encoded_columns = [] 

encoders = {}
for column in object_columns:
    if column == 'customerID': continue
    le = LabelEncoder()

    new_name = column + '_encoded'
    df[new_name] = le.fit_transform(df[column])

    label_encoded_columns.append(new_name)
    
    # Store the label encoders. 
    encoders[column] = le

df[label_encoded_columns].head(5)

Unnamed: 0,gender_encoded,Partner_encoded,Dependents_encoded,PhoneService_encoded,MultipleLines_encoded,InternetService_encoded,OnlineSecurity_encoded,OnlineBackup_encoded,DeviceProtection_encoded,TechSupport_encoded,StreamingTV_encoded,StreamingMovies_encoded,Contract_encoded,PaperlessBilling_encoded,PaymentMethod_encoded,Churn_encoded
0,0,1,0,0,1,0,0,2,0,0,0,0,0,1,2,0
1,1,0,0,1,0,0,2,0,2,0,0,0,1,0,3,0
2,1,0,0,1,0,0,2,2,0,0,0,0,0,1,3,1
3,1,0,0,0,1,0,2,0,2,2,0,0,1,0,0,0
4,0,0,0,1,0,1,0,0,0,0,0,0,0,1,2,1


# Modeling 

In [9]:
df.isna().sum()

customerID                   0
gender                       0
SeniorCitizen                0
Partner                      0
Dependents                   0
tenure                       0
PhoneService                 0
MultipleLines                0
InternetService              0
OnlineSecurity               0
OnlineBackup                 0
DeviceProtection             0
TechSupport                  0
StreamingTV                  0
StreamingMovies              0
Contract                     0
PaperlessBilling             0
PaymentMethod                0
MonthlyCharges               0
TotalCharges                11
Churn                        0
churn_binary                 0
gender_encoded               0
Partner_encoded              0
Dependents_encoded           0
PhoneService_encoded         0
MultipleLines_encoded        0
InternetService_encoded      0
OnlineSecurity_encoded       0
OnlineBackup_encoded         0
DeviceProtection_encoded     0
TechSupport_encoded          0
Streamin

In [10]:
df_cleaned = df.dropna()

In [11]:
X = df_cleaned[label_encoded_columns + numeric_columns]
y = df_cleaned['churn_binary']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42)

## Linear Regression

In [13]:
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

y_pred = linear_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

coefficients = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': linear_model.coef_
})
performance_metrics = {
    "Mean Squared Error (MSE)": mse,
    "R-squared (R2)": r2
}

print(coefficients)
performance_metrics

                     Feature   Coefficient
0             gender_encoded -1.129123e-16
1            Partner_encoded -2.020953e-16
2         Dependents_encoded  1.132851e-16
3       PhoneService_encoded -5.447240e-16
4      MultipleLines_encoded  2.414677e-16
5    InternetService_encoded  2.724957e-16
6     OnlineSecurity_encoded  9.569728e-17
7       OnlineBackup_encoded -1.316091e-16
8   DeviceProtection_encoded  1.385020e-17
9        TechSupport_encoded  1.713039e-16
10       StreamingTV_encoded  4.190674e-17
11   StreamingMovies_encoded  4.067756e-17
12          Contract_encoded -3.715681e-16
13  PaperlessBilling_encoded -6.357753e-16
14     PaymentMethod_encoded -1.724993e-17
15             Churn_encoded  1.000000e+00
16             SeniorCitizen -1.715312e-16
17                    tenure  7.377996e-17
18            MonthlyCharges -6.552647e-17
19              TotalCharges -2.059984e-18


{'Mean Squared Error (MSE)': 2.64655185220035e-29, 'R-squared (R2)': 1.0}

## Logistic Regression 

In [14]:
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

report = classification_report(y_test, y_pred, output_dict=True)
report

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 1033},
 '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 374},
 'accuracy': 1.0,
 'macro avg': {'precision': 1.0,
  'recall': 1.0,
  'f1-score': 1.0,
  'support': 1407},
 'weighted avg': {'precision': 1.0,
  'recall': 1.0,
  'f1-score': 1.0,
  'support': 1407}}

## Generalized Additive Model