In [1]:
# Import necessary libraries
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
                         summarize,
                         poly)
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.anova import anova_lm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
#1
data = load_data('Default')
# Dataset dimensions
data.shape

(10000, 4)

In [3]:
# Column names and data types
data.dtypes

default    category
student    category
balance     float64
income      float64
dtype: object

In [4]:
# Distribution of the 'default' variable
data['default'].value_counts()

default
No     9667
Yes     333
Name: count, dtype: int64

In [5]:
# Fit logistic regression to predict default using income, balance, and student
X = pd.get_dummies(data[['income', 'balance', 'student']], drop_first=True)
X['student_Yes'] = X['student_Yes'].astype(int)  
y = (data['default'] == 'Yes').astype(int)
logit_result = sm.Logit(y, sm.add_constant(X)).fit()
summarize(logit_result)

Optimization terminated successfully.
         Current function value: 0.078577
         Iterations 10


Unnamed: 0,coef,std err,z,P>|z|
const,-10.869,0.492,-22.079,0.0
income,3e-06,8e-06,0.37,0.712
balance,0.0057,0.0,24.737,0.0
student_Yes,-0.6468,0.236,-2.738,0.006


Report the coefficient for balance and interpret its meaning in terms of the log-odds of defaulting.

The coefficient for balance in the logistic regression model is approximately 0.0057. This means that for each additional dollar of balance, the log-odds of defaulting increase by 0.0057, holding all other variables constant. In other words, as the balance increases, the likelihood of defaulting also increases.



In [7]:
# Split the Default dataset into training (70%) and testing (30%) sets


train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

X_train = pd.get_dummies(train_data[['income', 'balance']], drop_first=True)
y_train = (train_data['default'] == 'Yes').astype(int)

X_test = pd.get_dummies(test_data[['income', 'balance']], drop_first=True)
y_test = (test_data['default'] == 'Yes').astype(int)

In [13]:
# Linear Discriminant Analysis (LDA)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
ida_predictions = lda.predict(X_test)

# Report class means for each predictor
print('Class means for each predictor:')
print(pd.DataFrame(lda.means_, columns=X_train.columns, index=['No Default', 'Default']))

# Report prior probabilities for each class
print('Prior probabilities for each class:')
print(dict(zip(['No Default', 'Default'], lda.priors_)))

Class means for each predictor:
                  income      balance
No Default  33681.793667   802.158374
Default     31570.357690  1768.165821
Prior probabilities for each class:
{'No Default': np.float64(0.9658571428571429), 'Default': np.float64(0.03414285714285714)}


In [14]:
# Quadratic Discriminant Analysis (QDA)
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, accuracy_score

qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)
qda_predictions = qda.predict(X_test)

# LDA predictions (already computed as ida_predictions)
lda_cm = confusion_matrix(y_test, ida_predictions)
lda_acc = accuracy_score(y_test, ida_predictions)
print('LDA Confusion Matrix:')
print(lda_cm)
print('LDA Test Accuracy:', lda_acc)

# QDA predictions
qda_cm = confusion_matrix(y_test, qda_predictions)
qda_acc = accuracy_score(y_test, qda_predictions)
print('QDA Confusion Matrix:')
print(qda_cm)
print('QDA Test Accuracy:', qda_acc)

LDA Confusion Matrix:
[[2900    6]
 [  75   19]]
LDA Test Accuracy: 0.973
QDA Confusion Matrix:
[[2898    8]
 [  70   24]]
QDA Test Accuracy: 0.974


In [21]:
# Naive Bayes classifier (GaussianNB) 
from sklearn.naive_bayes import GaussianNB

NBC = GaussianNB()
NBC.fit(X_train, y_train)
NBC_predictions = NBC.predict(X_test)
NBC_cm = confusion_matrix(y_test, NBC_predictions)
print('NBC Confusion Matrix:')
print(NBC_cm)


NBC Confusion Matrix:
[[2893   13]
 [  75   19]]


In [22]:
# Compare the test accuracy of Naive Bayes with the LDA and QDA results.

NBC_acc = accuracy_score(y_test, NBC_predictions)
print('Naive Bayes Test Accuracy:', NBC_acc)

# For reference, print LDA and QDA accuracy if not already printed above
print('LDA Test Accuracy:', lda_acc)
print('QDA Test Accuracy:', qda_acc)

Naive Bayes Test Accuracy: 0.9706666666666667
LDA Test Accuracy: 0.973
QDA Test Accuracy: 0.974


In [26]:
# Predict probability of default for a customer with income = 40000 and balance = 2000

customer = pd.DataFrame({'income': [40000], 'balance': [2000]})

# Use LDA
lda_prob = lda.predict_proba(customer)[:, 1]
print(f"LDA predicted probability of default: {lda_prob[0]:.4f}")

# Use QDA
qda_prob = qda.predict_proba(customer)[:, 1]
print(f"QDA predicted probability of default: {qda_prob[0]:.4f}")

# Use Naive Bayes
nbc_prob = NBC.predict_proba(customer)[:, 1]
print(f"Naive Bayes predicted probability of default: {nbc_prob[0]:.4f}")

LDA predicted probability of default: 0.5398
QDA predicted probability of default: 0.5598
Naive Bayes predicted probability of default: 0.5108


In [27]:
# Apply feature scaling to income and balance with StandardScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [38]:
# KNN Models
from sklearn.neighbors import KNeighborsClassifier
knn1 = KNeighborsClassifier(n_neighbors=1)
knn1.fit(X_train, y_train)
knn1_predictions = knn1.predict(X_test)
knn1_acc = accuracy_score(y_test, knn1_predictions)

knn3 = KNeighborsClassifier(n_neighbors=3)
knn3.fit(X_train, y_train)
knn3_predictions = knn3.predict(X_test)
knn3_acc = accuracy_score(y_test, knn3_predictions)

knn5 = KNeighborsClassifier(n_neighbors=5)
knn5.fit(X_train, y_train)
knn5_predictions = knn5.predict(X_test)
knn5_acc = accuracy_score(y_test, knn5_predictions)

knn10 = KNeighborsClassifier(n_neighbors=10)
knn10.fit(X_train, y_train)
knn10_predictions = knn10.predict(X_test)
knn10_cm = confusion_matrix(y_test, knn10_predictions)
knn10_acc = accuracy_score(y_test, knn10_predictions)




In [29]:
# Create a table summarizing the test accuracy for each K value. 
knn_results = pd.DataFrame({
    'K': [1, 3, 5, 10],
    'Test Accuracy': [knn1_acc, knn3_acc, knn5_acc, knn10_acc]
})
knn_results

Unnamed: 0,K,Test Accuracy
0,1,0.952
1,3,0.965333
2,5,0.968667
3,10,0.969


In this case, the model with K=10 has the highest test accuracy of approximately 0.969. This suggests that using a larger value of K helps to smooth out the predictions and reduces the impact of noise in the data, leading to better generalization on unseen data.

The reason why very small values of K may not be optimal is that they can lead to overfitting, where the model captures noise in the training data rather than the underlying patterns. This can result in poor performance on new, unseen data. Larger values of K help to average out the predictions over more neighbors, which can improve robustness and reduce variance in the model's predictions.

In [37]:
# Refit Logistic Regression on the training set and report the test accuracy.

logit = sm.Logit(y_train, sm.add_constant(X_train)).fit()
logit_predictions = logit.predict(sm.add_constant(X_test))
logit_cm = confusion_matrix(y_test, (logit_predictions >= 0.5).astype(int))
logit_acc = accuracy_score(y_test, (logit_predictions >= 0.5).astype(int))

Optimization terminated successfully.
         Current function value: 0.078256
         Iterations 10


In [36]:
# Create a summary table comparing the test accuracy of all methods
summary_results = pd.DataFrame({
    'Model': [
        'Logistic Regression',
        'LDA',
        'QDA',
        'Naive Bayes',
        'KNN10'
    ],
    'Test Accuracy': [
        logit_acc,
        lda_acc,
        qda_acc,
        NBC_acc,
        knn10_acc
    ]
})

summary_results

Unnamed: 0,Model,Test Accuracy
0,Logistic Regression,0.973333
1,LDA,0.973
2,QDA,0.974
3,Naive Bayes,0.970667
4,KNN10,0.969


In [41]:
# Using the confusion matrices, identify which method has the lowest false negative rate

def false_negative_rate(cm):
    return cm[1,0] / (cm[1,0] + cm[1,1])

# Compute FNR for each method
fnr_logit = false_negative_rate(logit_cm)
fnr_lda = false_negative_rate(lda_cm)
fnr_qda = false_negative_rate(qda_cm)
fnr_nb = false_negative_rate(NBC_cm)
fnr_knn = false_negative_rate(knn10_cm)

fnr_results = pd.DataFrame({
    'Model': [
        'Logistic Regression',
        'LDA',
        'QDA',
        'Naive Bayes',
        'KNN10'
    ],
    'False Negative Rate': [
        fnr_logit,
        fnr_lda,
        fnr_qda,
        fnr_nb,
        fnr_knn
    ]
})

fnr_results

Unnamed: 0,Model,False Negative Rate
0,Logistic Regression,0.744681
1,LDA,0.797872
2,QDA,0.744681
3,Naive Bayes,0.797872
4,KNN10,0.957447


The QDA model and the logistic regression model both have a false negative rate of 0.745, which is both the lowest among all the methods.

In [44]:
# If the cost of missing a default is 10 times higher than a false alarm
print(f"Cost of QDA model: {qda_cm[1,0]*10 + qda_cm[0,1]}")

print(f"Cost of Logistic Regression model: {logit_cm[1,0]*10 + logit_cm[0,1]}")


[[2903    3]
 [  90    4]]
[[2896   10]
 [  70   24]]
[[2898    8]
 [  70   24]]


In [48]:
# Adjust the probability threshold from 0.5 to 0.3 in the QDA model and report how this change affects the false positive and false negative rates

qda_predictions_03 = (qda.predict_proba(X_test)[:, 1] >= 0.3).astype(int)
qda_cm_03 = confusion_matrix(y_test, qda_predictions_03)
qda_fnr_03 = false_negative_rate(qda_cm_03)
def false_positive_rate(cm):
    return cm[0,1] / (cm[0,0] + cm[0,1])
qda_fpr_03 = false_positive_rate(qda_cm_03)
qda_fpr = false_positive_rate(qda_cm)

qda_model_change = pd.DataFrame({
    'Metric': ['False Negative Rate', 'False Positive Rate'],
    '0.5': [fnr_qda, qda_fpr],
    '0.3': [qda_fnr_03, qda_fpr_03],
    'Changes': [qda_fnr_03 - fnr_qda, qda_fpr_03 - qda_fpr]
})
qda_model_change

Unnamed: 0,Metric,0.5,0.3,Changes
0,False Negative Rate,0.744681,0.56383,-0.180851
1,False Positive Rate,0.002753,0.016518,0.013765


Changing the threshold from 0.5 to 0.3 would likely decrease the false negative rate by 0.181, but it would also increase the false positive rate by 0.014.