In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Fetch MNIST from OpenML as plain arrays (as_frame=False) rather than a DataFrame.
print("Loading MNIST...")
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

# Cast the labels to integers and rescale pixel intensities from 0-255 to [0, 1].
y = mnist["target"].astype(int)
X = mnist["data"] / 255.0

# Hold out 10,000 samples for testing (leaving 60k for training);
# the fixed seed makes the shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=10000, random_state=42)

# PART A: Logistic Regression
# Multinomial (softmax) logistic regression baseline on the scaled pixel features.
print("\nTraining Logistic Regression...")
# `multi_class` is deliberately not passed: the parameter is deprecated since
# scikit-learn 1.5, and with the lbfgs solver a target with more than two
# classes is already fitted as a multinomial model, so the result is unchanged.
# max_iter is raised well above the library default (100) to give lbfgs room
# to converge on the 784-dimensional inputs.
log_reg = LogisticRegression(solver="lbfgs", max_iter=1000)
log_reg.fit(X_train, y_train)

# Mean accuracy on the data the model was fitted on and on the held-out split.
train_acc_log = log_reg.score(X_train, y_train)
test_acc_log = log_reg.score(X_test, y_test)

print(f"Logistic Regression - Train: {train_acc_log:.4f}, Test: {test_acc_log:.4f}")

# PART A & B: k-NN with different k values
# Sweep over several neighbourhood sizes and record train/test accuracy for each.
print("\nTesting k-NN with different k values...")
k_values = [1, 3, 5, 7, 10]
train_acc_knn = []  # train accuracy per k, in the same order as k_values
test_acc_knn = []   # test accuracy per k, in the same order as k_values

for k in k_values:
    print(f"Testing k={k}...")
    # n_jobs=-1 runs the neighbour search on all available CPU cores. It only
    # affects speed, not the predictions, and scoring tens of thousands of
    # samples against the full training set is by far the slowest step here.
    knn = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
    knn.fit(X_train, y_train)

    # NOTE: when scoring on the training set every sample is its own nearest
    # neighbour, so train accuracy at k=1 is expected to be (near) perfect.
    train_acc = knn.score(X_train, y_train)
    test_acc = knn.score(X_test, y_test)

    train_acc_knn.append(train_acc)
    test_acc_knn.append(test_acc)

    print(f"k={k}: Train={train_acc:.4f}, Test={test_acc:.4f}")

# Visualise the results: one 10x5-inch figure with two side-by-side panels.
fig, (ax_knn, ax_cmp) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: k-NN train and test accuracy for every k that was tried.
ax_knn.plot(k_values, train_acc_knn, 'o-', label="Train Accuracy")
ax_knn.plot(k_values, test_acc_knn, 's-', label="Test Accuracy")
ax_knn.set_xlabel("k (neighbors)")
ax_knn.set_ylabel("Accuracy")
ax_knn.set_title("k-NN Accuracy vs k")
ax_knn.legend()
ax_knn.grid(True)

# Right panel: logistic regression against the k-NN with the highest test accuracy.
best_k = k_values[np.argmax(test_acc_knn)]
models = ['Logistic Regression', f'Best k-NN (k={best_k})']
test_scores = [test_acc_log, max(test_acc_knn)]
ax_cmp.bar(models, test_scores)
ax_cmp.set_ylabel("Test Accuracy")
ax_cmp.set_title("Model Comparison")
# Tilt the model names by 45 degrees.
plt.setp(ax_cmp.get_xticklabels(), rotation=45)

# Write each score, rounded to three decimals, just above its bar.
for bar_pos, score in enumerate(test_scores):
    ax_cmp.text(bar_pos, score + 0.005, f'{score:.3f}', ha='center')

fig.tight_layout()
plt.show()

# Summary
# Index of the k with the highest test accuracy (first one wins on ties),
# computed once and reused for both the score and the matching k.
best_idx = int(np.argmax(test_acc_knn))
print("\nSUMMARY:")
print(f"Best Logistic Regression: {test_acc_log:.4f}")
print(f"Best k-NN: {test_acc_knn[best_idx]:.4f} (k={k_values[best_idx]})")

# Bias-Variance explanation
# Contrast the two ends of the sweep. The k labels are read from k_values so
# they always match the accuracies printed next to them, even if the list of
# k values is edited later.
print("\nBias-Variance Trade-off:")
print(f"k={k_values[0]}: Train={train_acc_knn[0]:.4f}, Test={test_acc_knn[0]:.4f} (High Variance)")
print(f"k={k_values[-1]}: Train={train_acc_knn[-1]:.4f}, Test={test_acc_knn[-1]:.4f} (High Bias)")

Loading MNIST...

Training Logistic Regression...


