In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Load the dataset
data = pd.read_csv("wdbc.csv")

# Separate features and target variable
X = data.iloc[:, 2:]  # X-variables (all columns from index 2 onward)
y = data.iloc[:, 1]   # Diagnosis column

# Encode target variable as binary (M = 1, B = 0)
y = y.map({"M": 1, "B": 0})

# Step 1
## Split data, standardize, fit the model, calculate accuracy, and confusion matrix
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

# Standardize X-variables
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit logistic regression model
log_reg = LogisticRegression(max_iter=5000)
log_reg.fit(X_train, y_train)

# Calculate accuracy
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Answer questions based on confusion matrix
b_b = conf_matrix[0, 0]  # True B predicted as B
b_m = conf_matrix[0, 1]  # True B predicted as M
m_b = conf_matrix[1, 0]  # True M predicted as B
m_m = conf_matrix[1, 1]  # True M predicted as M

print("Question 1")
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print(f"a) Truth B, Prediction B: {b_b}")
print(f"b) Truth B, Prediction M: {b_m}")
print(f"c) Truth M, Prediction B: {m_b}")
print(f"d) Truth M, Prediction M: {m_m}")
print()

# Step 2
## Repeat process 100 times to obtain accuracy estimates
accuracies = []
for _ in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    log_reg.fit(X_train, y_train)
    y_pred = log_reg.predict(X_test)
    accuracies.append(accuracy_score(y_test, y_pred))

# Calculate 95% bootstrap confidence interval
accuracies.sort()
ci_95 = (accuracies[1], accuracies[98])  # Positions 1 and 98 for 95% CI
print("Question 2")
print("95% Confidence Interval for 75% train-test split:", ci_95)
print()

# Step 3
## Repeat with a 10% training and 90% test split
accuracies_10_90 = []
for _ in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.90)
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    log_reg.fit(X_train, y_train)
    y_pred = log_reg.predict(X_test)
    accuracies_10_90.append(accuracy_score(y_test, y_pred))

accuracies_10_90.sort()
ci_95_10_90 = (accuracies_10_90[1], accuracies_10_90[98])
print("Question 3")
print("95% Confidence Interval for 10% train-test split:", ci_95_10_90)
print()

# Step 4
## Repeat Step 2 without standardizing the X-variables
accuracies_no_scaling = []
for _ in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    log_reg = LogisticRegression(max_iter=5000)  # Increase iterations
    log_reg.fit(X_train, y_train)
    y_pred = log_reg.predict(X_test)
    accuracies_no_scaling.append(accuracy_score(y_test, y_pred))

accuracies_no_scaling.sort()
ci_95_no_scaling = (accuracies_no_scaling[1], accuracies_no_scaling[98])
print("Question 4")
print("95% Confidence Interval without standardizing:", ci_95_no_scaling)
print()

Question 1
Accuracy: 0.993006993006993
Confusion Matrix:
 [[89  0]
 [ 1 53]]
a) Truth B, Prediction B: 89
b) Truth B, Prediction M: 0
c) Truth M, Prediction B: 1
d) Truth M, Prediction M: 53

Question 2
95% Confidence Interval for 75% train-test split: (0.951048951048951, 1.0)

Question 3
95% Confidence Interval for 10% train-test split: (0.9220272904483431, 0.9766081871345029)

Question 4
95% Confidence Interval without standardizing: (0.9230769230769231, 0.986013986013986)



Question 1:
(q) Calculate the fraction of times this model predicts the correct result on the test set. Compute the confusion matrix on the test set.

(a) 142/143 does the model predict the correct result. And the confusion matrix is listed above.


Question 2:
(q) Report the 95% bootstrap confidence interval for this estimate

(a) (0.951048951048951, 1.0)


Question 3:
(q) How does the confidence interval for #3 compare to the confidence interval for #2?

(a) The confidence interval for #3 appears to be shifted down by .03 relative to #2's CI. Furthermore, we see that there remains approximately a .05 difference between the range limits: (0.9220272904483431, 0.9766081871345029).


Question 4:
(q) How does the confidence interval for #4 compare to the confidence interval for #2?

(a) The difference between CI #2 & #4 is that the latter has a wider and lower CI range. Albeit, the CI's aren't signifcantly different, there still appears to be preference for 75% train-test split: (0.9230769230769231, 0.986013986013986).