# Interpretable Machine Learning Models

Reference for dataset:
https://www.kaggle.com/datasets/blastchar/telco-customer-churn?resource=download

In [17]:
# Import packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.families import Gamma
from statsmodels.genmod.families.links import Log
from statsmodels.tools import add_constant
from pygam import LinearGAM, GammaGAM, s
import matplotlib.pyplot as plt

In [2]:
# Import datasets
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

## EDA

### Missing Values

In [4]:
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

### Data Transformation

In [5]:
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [6]:
# transromm the churn column to binary
df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})
df["Churn"].value_counts()

Churn
0    5174
1    1869
Name: count, dtype: int64

In [7]:
# Convert total charges to numeric
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
# Fill the null values with the 0
df["TotalCharges"].fillna(0, inplace=True)
df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["TotalCharges"].fillna(0, inplace=True)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


In [8]:
# get the unique value for each column
for col in df.columns:
    print(f"{col}: {df[col].nunique()}")

customerID: 7043
gender: 2
SeniorCitizen: 2
Partner: 2
Dependents: 2
tenure: 73
PhoneService: 2
MultipleLines: 3
InternetService: 3
OnlineSecurity: 3
OnlineBackup: 3
DeviceProtection: 3
TechSupport: 3
StreamingTV: 3
StreamingMovies: 3
Contract: 3
PaperlessBilling: 2
PaymentMethod: 4
MonthlyCharges: 1585
TotalCharges: 6531
Churn: 2


In [9]:
# Exclude the columns that are not needed for the analysis
Data = df.drop(columns=["customerID"])
# One-hot encoding for categorical variables
Data = pd.get_dummies(Data, drop_first=True)

In [10]:
Data.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,False,True,False,False,True,...,False,False,False,False,False,False,True,False,True,False
1,0,34,56.95,1889.5,0,True,False,False,True,False,...,False,False,False,False,True,False,False,False,False,True
2,0,2,53.85,108.15,1,True,False,False,True,False,...,False,False,False,False,False,False,True,False,False,True
3,0,45,42.3,1840.75,0,True,False,False,False,True,...,False,False,False,False,True,False,False,False,False,False
4,0,2,70.7,151.65,1,False,False,False,True,False,...,False,False,False,False,False,False,True,False,True,False


In [11]:
Data.isnull().sum()

SeniorCitizen                            0
tenure                                   0
MonthlyCharges                           0
TotalCharges                             0
Churn                                    0
gender_Male                              0
Partner_Yes                              0
Dependents_Yes                           0
PhoneService_Yes                         0
MultipleLines_No phone service           0
MultipleLines_Yes                        0
InternetService_Fiber optic              0
InternetService_No                       0
OnlineSecurity_No internet service       0
OnlineSecurity_Yes                       0
OnlineBackup_No internet service         0
OnlineBackup_Yes                         0
DeviceProtection_No internet service     0
DeviceProtection_Yes                     0
TechSupport_No internet service          0
TechSupport_Yes                          0
StreamingTV_No internet service          0
StreamingTV_Yes                          0
StreamingMo

In [12]:
# Split the data into training and testing data and features and target variable
X = Data.drop(columns=["Churn"])
y = Data["Churn"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [13]:
# Scale the parameters
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Data Exploratory

## Modeling

### Linear Regression Model

Reference for linear regression model: https://github.com/AIPI-590-XAI/Duke-AI-XAI/blob/main/interpretable-ml-example-notebooks/regression-interpretability.ipynb

In [14]:
# Build linear regression model
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"\n{model_name}:")
    print(f"MSE: {mse:.2f}")
    print(f"R2 Score: {r2:.2f}")

    for feature, coef in zip(X.columns, model.coef_):
        print(f"{feature}: {coef:.4f}")

    return model, y_pred

In [15]:
# Fit models
lr_model, lr_pred = evaluate_model(
    LinearRegression(),
    X_train_scaled,
    X_test_scaled,
    y_train,
    y_test,
    "Linear Regression",
)


Linear Regression:
MSE: 0.13
R2 Score: 0.31
SeniorCitizen: 0.0135
tenure: -0.0470
MonthlyCharges: -0.1476
TotalCharges: -0.0932
gender_Male: -0.0039
Partner_Yes: 0.0032
Dependents_Yes: -0.0110
PhoneService_Yes: -207619168016.1793
MultipleLines_No phone service: -207619168016.1974
MultipleLines_Yes: 0.0366
InternetService_Fiber optic: 0.1450
InternetService_No: -4437228403627.3809
OnlineSecurity_No internet service: 997875140624.5504
OnlineSecurity_Yes: -0.0146
OnlineBackup_No internet service: 614864191827.9484
OnlineBackup_Yes: -0.0005
DeviceProtection_No internet service: 605557398564.5636
DeviceProtection_Yes: 0.0104
TechSupport_No internet service: 770132565544.6312
TechSupport_Yes: -0.0116
StreamingTV_No internet service: 768379749610.6859
StreamingTV_Yes: 0.0433
StreamingMovies_No internet service: 680419357454.8885
StreamingMovies_Yes: 0.0526
Contract_One year: -0.0442
Contract_Two year: -0.0351
PaperlessBilling_Yes: 0.0225
PaymentMethod_Credit card (automatic): -0.0020
Payment

### Logistic Regression

In [22]:
# build a logistic model
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train_scaled, y_train)
y_pred_log = log_model.predict(X_test_scaled)

In [25]:
# logistic model results
print("Logistic Regression Results:")
print("Accuracy:", log_model.score(X_test_scaled, y_test))
# print MSE, R2 score, and coefficients
log_mse = mean_squared_error(y_test, y_pred_log)
log_r2 = r2_score(y_test, y_pred_log)
print(f"MSE: {log_mse:.2f}")
print(f"R2 Score: {log_r2:.2f}")
# print coefficients
for feature, coef in zip(X.columns, log_model.coef_[0]):
    print(f"{feature}: {coef:.4f}")

Logistic Regression Results:
Accuracy: 0.8211497515968772
MSE: 0.18
R2 Score: 0.08
SeniorCitizen: 0.0588
tenure: -1.3544
MonthlyCharges: -0.6375
TotalCharges: 0.6554
gender_Male: -0.0255
Partner_Yes: 0.0276
Dependents_Yes: -0.0730
PhoneService_Yes: -0.0350
MultipleLines_No phone service: 0.0350
MultipleLines_Yes: 0.1693
InternetService_Fiber optic: 0.6237
InternetService_No: -0.0759
OnlineSecurity_No internet service: -0.0759
OnlineSecurity_Yes: -0.1580
OnlineBackup_No internet service: -0.0759
OnlineBackup_Yes: -0.0443
DeviceProtection_No internet service: -0.0759
DeviceProtection_Yes: 0.0284
TechSupport_No internet service: -0.0759
TechSupport_Yes: -0.1211
StreamingTV_No internet service: -0.0759
StreamingTV_Yes: 0.1806
StreamingMovies_No internet service: -0.0759
StreamingMovies_Yes: 0.2302
Contract_One year: -0.2675
Contract_Two year: -0.6123
PaperlessBilling_Yes: 0.1640
PaymentMethod_Credit card (automatic): -0.0354
PaymentMethod_Electronic check: 0.1521
PaymentMethod_Mailed check

### GAM

Reference for GAM: https://github.com/AIPI-590-XAI/Duke-AI-XAI/blob/main/interpretable-ml-example-notebooks/generalized-models-interpretability.ipynb

In [19]:
# Fit a Gamma GLM with log link
X_train_const = add_constant(X_train_scaled)
X_test_const = add_constant(X_test_scaled)
glm = GLM(y_train, X_train_const, family=Gamma(link=Log()))
glm_results = glm.fit()

# Make predictions
y_pred_glm = glm_results.predict(X_test_const)

# Calculate MSE and R^2
mse_glm = mean_squared_error(y_test, y_pred_glm)
r2_glm = r2_score(y_test, y_pred_glm)

print(f"GLM MSE: {mse_glm:.4f}, R^2: {r2_glm:.4f}")

# Interpret GLM
print("\nGLM Coefficients:")
for feature, coef in zip(["Intercept"] + list(X.columns), glm_results.params):
    print(f"{feature}: {coef:.4f}")

GLM MSE: 0.1329, R^2: 0.3171

GLM Coefficients:
Intercept: -2.0124
SeniorCitizen: 0.0096
tenure: -0.7822
MonthlyCharges: -4.0370
TotalCharges: 0.5598
gender_Male: 0.0358
Partner_Yes: -0.0709
Dependents_Yes: -0.0762
PhoneService_Yes: 0.3777
MultipleLines_No phone service: -0.3777
MultipleLines_Yes: 0.3554
InternetService_Fiber optic: 1.8576
InternetService_No: -0.2361
OnlineSecurity_No internet service: -0.2361
OnlineSecurity_Yes: 0.1741
OnlineBackup_No internet service: -0.2361
OnlineBackup_Yes: 0.2446
DeviceProtection_No internet service: -0.2361
DeviceProtection_Yes: 0.2843
TechSupport_No internet service: -0.2361
TechSupport_Yes: 0.2216
StreamingTV_No internet service: -0.2361
StreamingTV_Yes: 0.6864
StreamingMovies_No internet service: -0.2361
StreamingMovies_Yes: 0.6951
Contract_One year: -0.2852
Contract_Two year: -0.7423
PaperlessBilling_Yes: 0.1250
PaymentMethod_Credit card (automatic): -0.0474
PaymentMethod_Electronic check: 0.0673
PaymentMethod_Mailed check: -0.0732


## Model Comparison

### Linear Model

### Logistic Regression

### GAM Model