In [1]:
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression  # Using LogisticRegression for L2
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Silence only solver convergence warnings. A blanket 'ignore' would also hide
# deprecation and numerical warnings that point at real problems.
warnings.filterwarnings('ignore', category=ConvergenceWarning)

# Load pre-computed PCA features and labels for the train / validation / test
# splits. The "pca95" suffix presumably means 95% explained variance -- confirm
# against the notebook that produced these files.
x_train_pca = np.load('x_train_pca95.npy')
x_val_pca = np.load('x_val_pca95.npy')
x_test_pca = np.load('x_test_pca95.npy')
y_train = np.load('y_train.npy')
y_val = np.load('y_val.npy')
y_test = np.load('y_test.npy')

# Fail fast if a stale or mismatched .npy file was picked up: every split needs
# one label per row, and all splits must share the same number of features.
for split_name, features, labels in (
    ("train", x_train_pca, y_train),
    ("validation", x_val_pca, y_val),
    ("test", x_test_pca, y_test),
):
    assert features.shape[0] == labels.shape[0], (
        f"{split_name}: {features.shape[0]} feature rows vs {labels.shape[0]} labels"
    )
assert x_train_pca.shape[1] == x_val_pca.shape[1] == x_test_pca.shape[1], (
    "train/validation/test splits have different numbers of PCA components"
)

print("PCA data loaded. Shapes:")
print("Train:", x_train_pca.shape)
print("Validation:", x_val_pca.shape)
print("Test:", x_test_pca.shape)

PCA data loaded. Shapes:
Train: (48998, 188)
Validation: (10502, 188)
Test: (10500, 188)


Scaling PCA Data

In [2]:
# Standardise the PCA features. The scaler's statistics are learned from the
# training split only and then reused unchanged for validation and test, so
# no information from the held-out splits leaks into preprocessing.
scaler = StandardScaler().fit(x_train_pca)
x_train_scaled, x_val_scaled, x_test_scaled = (
    scaler.transform(split) for split in (x_train_pca, x_val_pca, x_test_pca)
)

Hyperparameter Tuning (GridSearchCV) - Using Scaled PCA Data

In [3]:
# Hyperparameter Tuning (L2 Regularization)
# Search over C, the inverse of the regularization strength (smaller C means
# stronger L2 shrinkage), using 3-fold cross-validation on the training split.
param_grid_l2 = {
    'C': [1000, 10, 1]  # Inverse of regularization strength (lambda)
}

# NOTE: `multi_class='multinomial'` is intentionally not passed. The parameter
# is deprecated since scikit-learn 1.5, and with the lbfgs solver on a
# multiclass target the default (since 0.22) is already the multinomial
# (softmax) formulation, so the fitted model is unchanged on those versions.
grid_search_l2 = GridSearchCV(
    LogisticRegression(penalty='l2', solver='lbfgs', max_iter=10000,
                       random_state=42),
    param_grid_l2,
    cv=3,
    verbose=2,
    n_jobs=-1,  # use all available cores for the candidate fits
    scoring='accuracy'
)

# Wall-clock time for the whole search, including the final refit of the best
# candidate on the full training split.
start_time_grid_search_l2 = time.time()
grid_search_l2.fit(x_train_scaled, y_train)
grid_search_time_l2 = time.time() - start_time_grid_search_l2

print("Best C (L2):", grid_search_l2.best_params_['C'])
best_l2_model = grid_search_l2.best_estimator_

Fitting 3 folds for each of 3 candidates, totalling 9 fits
Best C (L2): 10


Evaluation - Using Scaled PCA Data

In [4]:
# Evaluation
# Time only the prediction call on the test split; perf_counter is a monotonic,
# high-resolution clock better suited to sub-second intervals than time.time().
start_time_eval = time.perf_counter()
y_pred_l2 = best_l2_model.predict(x_test_scaled)
eval_time = time.perf_counter() - start_time_eval

l2_accuracy = accuracy_score(y_test, y_pred_l2)
print("\nL2 Regularization with PCA:")
print("Test Accuracy:", l2_accuracy)
print("Evaluation Time:", eval_time)

print("\nClassification Report (L2 with PCA):")
print(classification_report(y_test, y_pred_l2))

print("\nConfusion Matrix (L2 with PCA):")
print(confusion_matrix(y_test, y_pred_l2))

# Overfitting Analysis
# Compare accuracy on the data the model was fit on against data it never saw.
y_train_pred_l2 = best_l2_model.predict(x_train_scaled)
train_accuracy_l2 = accuracy_score(y_train, y_train_pred_l2)

# Score the held-out validation split directly. The grid search's best_score_
# is the mean accuracy over the CV folds carved out of the training split, so
# it is reported separately under its own label rather than as "validation".
y_val_pred_l2 = best_l2_model.predict(x_val_scaled)
val_accuracy_l2 = accuracy_score(y_val, y_val_pred_l2)
cv_accuracy_l2 = grid_search_l2.best_score_

print("\nOverfitting Analysis (L2 with PCA):")
print("Training Accuracy:", train_accuracy_l2)
print("Mean CV Accuracy:", cv_accuracy_l2)
print("Validation Accuracy:", val_accuracy_l2)
print("Accuracy Gap (Train - Val):", train_accuracy_l2 - val_accuracy_l2)


L2 Regularization with PCA:
Test Accuracy: 0.8525714285714285
Evaluation Time: 0.03330492973327637

Classification Report (L2 with PCA):
              precision    recall  f1-score   support

           0       0.78      0.82      0.80      1040
           1       0.97      0.95      0.96      1061
           2       0.77      0.75      0.76      1036
           3       0.85      0.86      0.86      1103
           4       0.74      0.79      0.77       995
           5       0.94      0.95      0.94      1075
           6       0.64      0.58      0.61      1083
           7       0.91      0.94      0.93      1032
           8       0.94      0.95      0.94       993
           9       0.95      0.94      0.95      1082

    accuracy                           0.85     10500
   macro avg       0.85      0.85      0.85     10500
weighted avg       0.85      0.85      0.85     10500


Confusion Matrix (L2 with PCA):
[[ 848    2   19   61    2    0   87    1   18    2]
 [   6 1013    6 