In [1]:
# Import necessary libraries
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
                         summarize,
                         poly)
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.anova import anova_lm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# 1. Load the 'Default' dataset through ISLP's load_data helper; `data` is the
#    frame every later cell in this notebook reads from.
data = load_data('Default')
# Dataset dimensions as a (rows, columns) tuple
data.shape

(10000, 4)

In [3]:
# Column names and data types. Useful for spotting which columns are categorical
# and therefore need encoding before they can enter a model.
data.dtypes

default    category
student    category
balance     float64
income      float64
dtype: object

In [4]:
# Distribution of the 'default' variable (number of rows per class). This column is
# the response modelled below, so its class balance matters when reading accuracy.
data['default'].value_counts()

default
No     9667
Yes     333
Name: count, dtype: int64

In [5]:
# Logistic regression of default on income, balance and student status.
# Response: 1 when the customer defaulted ('Yes'), 0 otherwise.
y = (data['default'] == 'Yes').astype(int)

# Predictors: `student` is one-hot encoded with its first level dropped, and the
# resulting indicator column is emitted directly as integers.
X = pd.get_dummies(
    data[['income', 'balance', 'student']],
    drop_first=True,
    dtype=int,
)

# Add the intercept column, fit by maximum likelihood, and show the coefficient table.
design = sm.add_constant(X)
logit_result = sm.Logit(y, design).fit()
summarize(logit_result)

Optimization terminated successfully.
         Current function value: 0.078577
         Iterations 10


Unnamed: 0,coef,std err,z,P>|z|
const,-10.869,0.492,-22.079,0.0
income,3e-06,8e-06,0.37,0.712
balance,0.0057,0.0,24.737,0.0
student_Yes,-0.6468,0.236,-2.738,0.006


Report the coefficient for balance and interpret its meaning in terms of the log-odds of defaulting.

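The coefficient for balance in the logistic regression model is approximately 0.0057 (z ≈ 24.7, p < 0.001, so it is statistically significant). This means that for each additional dollar of balance, the log-odds of defaulting increase by 0.0057, holding income and student status constant. Equivalently, each additional dollar multiplies the odds of default by exp(0.0057) ≈ 1.0057, i.e. about a 0.57% increase in the odds. In other words, as the balance increases, the probability of defaulting also increases.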



In [7]:
# Split the Default dataset into training (70%) and testing (30%) sets.
# random_state is fixed so the same rows land in each set on every run.
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

# Only the two numeric predictors are used from here on, so no dummy encoding is
# required: pd.get_dummies leaves numeric columns unchanged.
predictors = ['income', 'balance']

X_train = train_data[predictors]
y_train = (train_data['default'] == 'Yes').astype(int)  # 1 = defaulted, 0 = did not

X_test = test_data[predictors]
y_test = (test_data['default'] == 'Yes').astype(int)

In [13]:
# Linear Discriminant Analysis (LDA)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

# Test-set class predictions. The original name `ida_predictions` was a misspelling
# of "lda"; it is kept as an alias because a later cell reads it.
lda_predictions = lda.predict(X_test)
ida_predictions = lda_predictions

# Build the display labels from lda.classes_ so the rows of means_ and the entries
# of priors_ are labelled by the class they actually belong to, not by an assumed order.
class_names = {0: 'No Default', 1: 'Default'}
class_labels = [class_names[c] for c in lda.classes_]

# Report class means for each predictor
print('Class means for each predictor:')
print(pd.DataFrame(lda.means_, columns=X_train.columns, index=class_labels))

# Report prior probabilities for each class
print('Prior probabilities for each class:')
print(dict(zip(class_labels, lda.priors_)))

Class means for each predictor:
                  income      balance
No Default  33681.793667   802.158374
Default     31570.357690  1768.165821
Prior probabilities for each class:
{'No Default': np.float64(0.9658571428571429), 'Default': np.float64(0.03414285714285714)}


In [14]:
# Quadratic Discriminant Analysis (QDA), followed by a test-set comparison with LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, accuracy_score

qda = QuadraticDiscriminantAnalysis().fit(X_train, y_train)
qda_predictions = qda.predict(X_test)

# Each entry: (confusion-matrix heading, accuracy heading, test-set predictions).
# LDA predictions were already computed in an earlier cell as ida_predictions.
reports = (
    ('LDA Confusion Matrix:', 'LDA Test Accuracy:', ida_predictions),
    ('QDA Confusion Matrix:', 'QDA Test Accuracy:', qda_predictions),
)

scores = []
for cm_heading, acc_heading, predicted in reports:
    matrix = confusion_matrix(y_test, predicted)  # rows: true class, columns: predicted class
    accuracy = accuracy_score(y_test, predicted)
    print(cm_heading)
    print(matrix)
    print(acc_heading, accuracy)
    scores.append((matrix, accuracy))

# Keep the per-model results available under their individual names.
(lda_cm, lda_acc), (qda_cm, qda_acc) = scores

LDA Confusion Matrix:
[[2900    6]
 [  75   19]]
LDA Test Accuracy: 0.973
QDA Confusion Matrix:
[[2898    8]
 [  70   24]]
QDA Test Accuracy: 0.974
