In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split


# PART 1:

In [3]:
data_full = pd.read_csv('/media/mnist_dataset (1) (1).csv')

# Checking the first few rows to understand the structure
print(data_full.head())


   5  0  0.1  0.2  0.3  0.4  0.5  0.6  0.7  0.8  ...  0.608  0.609  0.610  \
0  0  0    0    0    0    0    0    0    0    0  ...      0      0      0   
1  4  0    0    0    0    0    0    0    0    0  ...      0      0      0   
2  1  0    0    0    0    0    0    0    0    0  ...      0      0      0   
3  9  0    0    0    0    0    0    0    0    0  ...      0      0      0   
4  2  0    0    0    0    0    0    0    0    0  ...      0      0      0   

   0.611  0.612  0.613  0.614  0.615  0.616  0.617  
0      0      0      0      0      0      0      0  
1      0      0      0      0      0      0      0  
2      0      0      0      0      0      0      0  
3      0      0      0      0      0      0      0  
4      0      0      0      0      0      0      0  

[5 rows x 785 columns]


In [4]:
# Rename the first column to 'label'
data_full = data_full.rename(columns={data_full.columns[0]: 'label'})

# Verify the column names
print("Column names:", data_full.columns.tolist())


Column names: ['label', '0', '0.1', '0.2', '0.3', '0.4', '0.5', '0.6', '0.7', '0.8', '0.9', '0.10', '0.11', '0.12', '0.13', '0.14', '0.15', '0.16', '0.17', '0.18', '0.19', '0.20', '0.21', '0.22', '0.23', '0.24', '0.25', '0.26', '0.27', '0.28', '0.29', '0.30', '0.31', '0.32', '0.33', '0.34', '0.35', '0.36', '0.37', '0.38', '0.39', '0.40', '0.41', '0.42', '0.43', '0.44', '0.45', '0.46', '0.47', '0.48', '0.49', '0.50', '0.51', '0.52', '0.53', '0.54', '0.55', '0.56', '0.57', '0.58', '0.59', '0.60', '0.61', '0.62', '0.63', '0.64', '0.65', '0.66', '0.67', '0.68', '0.69', '0.70', '0.71', '0.72', '0.73', '0.74', '0.75', '0.76', '0.77', '0.78', '0.79', '0.80', '0.81', '0.82', '0.83', '0.84', '0.85', '0.86', '0.87', '0.88', '0.89', '0.90', '0.91', '0.92', '0.93', '0.94', '0.95', '0.96', '0.97', '0.98', '0.99', '0.100', '0.101', '0.102', '0.103', '0.104', '0.105', '0.106', '0.107', '0.108', '0.109', '0.110', '0.111', '0.112', '0.113', '0.114', '0.115', '0.116', '0.117', '0.118', '0.119', '0.120',

In [5]:
# Convert column names to strings
data_full.columns = data_full.columns.astype(str)


In [6]:
# Check data types
print(data_full.dtypes)

# Convert all columns except 'label' to numeric
feature_columns = data_full.columns.drop('label')
data_full[feature_columns] = data_full[feature_columns].apply(pd.to_numeric)


label    int64
0        int64
0.1      int64
0.2      int64
0.3      int64
         ...  
0.613    int64
0.614    int64
0.615    int64
0.616    int64
0.617    int64
Length: 785, dtype: object


In [7]:
# Exclude the label column
predictor_columns = data_full.columns.drop('label')

# Calculate the proportion of non-zero entries for each column
non_zero_proportions = (data_full[predictor_columns] != 0).mean()

# Select columns with at least 30% non-zero entries
columns_with_enough_non_zero = non_zero_proportions[non_zero_proportions >= 0.30].index.tolist()

# Check if we have at least 50 columns to choose from
if len(columns_with_enough_non_zero) < 50:
    print(f"Only {len(columns_with_enough_non_zero)} columns have at least 30% non-zero values.")
    # Select the top 50 columns with the highest non-zero proportions
    columns_with_enough_non_zero = non_zero_proportions.sort_values(ascending=False).head(50).index.tolist()
else:
    # Randomly select 50 columns from these columns
    np.random.seed(42)  # For reproducibility
    columns_with_enough_non_zero = np.random.choice(columns_with_enough_non_zero, size=50, replace=False)

# Create a new DataFrame with the label and selected predictor columns
data_selected = data_full[['label'] + list(columns_with_enough_non_zero)]


In [8]:
# Separate features and labels
X = data_selected.drop('label', axis=1)
y = data_selected['label']

# Split into training (70%) and temporary (30%) sets
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

# Split the temporary set equally into validation (15%) and test (15%) sets
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

# Display the sizes of each set
print("Training set size:", X_train.shape[0])
print("Validation set size:", X_val.shape[0])
print("Test set size:", X_test.shape[0])


Training set size: 41999
Validation set size: 9000
Test set size: 9000


# **PART 2:**

In [9]:
# Normalize the features to [0, 1]
X_train_scaled = X_train / 255.0
X_val_scaled = X_val / 255.0
X_test_scaled = X_test / 255.0


In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Define the parameter grid
k_values = [3, 5, 7, 9]
weights_options = ['uniform', 'distance']
metric_options = ['euclidean', 'manhattan']

best_accuracy = 0
best_params = {}

for k in k_values:
    for weight in weights_options:
        for metric in metric_options:
            knn = KNeighborsClassifier(
                n_neighbors=k, weights=weight, metric=metric
            )
            knn.fit(X_train_scaled, y_train)
            y_val_pred = knn.predict(X_val_scaled)
            accuracy = accuracy_score(y_val, y_val_pred)
            print(
                f"k={k}, weights={weight}, metric={metric}, val_accuracy={accuracy:.4f}"
            )
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_params = {
                    'n_neighbors': k,
                    'weights': weight,
                    'metric': metric
                }

print("\nBest Validation Accuracy:", best_accuracy)
print("Best Hyperparameters:", best_params)


k=3, weights=uniform, metric=euclidean, val_accuracy=0.9170
k=3, weights=uniform, metric=manhattan, val_accuracy=0.9076
k=3, weights=distance, metric=euclidean, val_accuracy=0.9200
k=3, weights=distance, metric=manhattan, val_accuracy=0.9114
k=5, weights=uniform, metric=euclidean, val_accuracy=0.9192
k=5, weights=uniform, metric=manhattan, val_accuracy=0.9062
k=5, weights=distance, metric=euclidean, val_accuracy=0.9227
k=5, weights=distance, metric=manhattan, val_accuracy=0.9112
k=7, weights=uniform, metric=euclidean, val_accuracy=0.9164
k=7, weights=uniform, metric=manhattan, val_accuracy=0.9050
k=7, weights=distance, metric=euclidean, val_accuracy=0.9186
k=7, weights=distance, metric=manhattan, val_accuracy=0.9093
k=9, weights=uniform, metric=euclidean, val_accuracy=0.9160
k=9, weights=uniform, metric=manhattan, val_accuracy=0.9032
k=9, weights=distance, metric=euclidean, val_accuracy=0.9188
k=9, weights=distance, metric=manhattan, val_accuracy=0.9084

Best Validation Accuracy: 0.922

In [11]:
# Retrain KNN with the best hyperparameters on the training set
best_knn = KNeighborsClassifier(**best_params)
best_knn.fit(X_train_scaled, y_train)

# Test the model
y_test_pred_knn = best_knn.predict(X_test_scaled)

# Calculate metrics
from sklearn.metrics import classification_report

print("KNN Classification Report on Test Set:")
print(classification_report(y_test, y_test_pred_knn))


KNN Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.95      0.98      0.96       888
           1       0.92      0.99      0.95      1011
           2       0.95      0.93      0.94       893
           3       0.89      0.91      0.90       920
           4       0.94      0.91      0.92       877
           5       0.89      0.87      0.88       813
           6       0.96      0.96      0.96       888
           7       0.92      0.92      0.92       940
           8       0.96      0.87      0.91       877
           9       0.89      0.91      0.90       893

    accuracy                           0.93      9000
   macro avg       0.93      0.93      0.93      9000
weighted avg       0.93      0.93      0.93      9000



# PART 3:

In [12]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Fit on the training data
scaler.fit(X_train)

# Transform the data
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)


In [13]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Define the parameter grid
hidden_layer_sizes_options = [(50,), (100,), (50, 50)]
activation_options = ['tanh', 'relu']
solver_options = ['sgd', 'adam']
learning_rate_init_options = [0.001, 0.01]

best_accuracy = 0
best_params = {}

for hidden_layer_sizes in hidden_layer_sizes_options:
    for activation in activation_options:
        for solver in solver_options:
            for learning_rate_init in learning_rate_init_options:
                mlp = MLPClassifier(
                    hidden_layer_sizes=hidden_layer_sizes,
                    activation=activation,
                    solver=solver,
                    learning_rate_init=learning_rate_init,
                    max_iter=200,
                    random_state=42
                )
                mlp.fit(X_train_scaled, y_train)
                y_val_pred = mlp.predict(X_val_scaled)
                accuracy = accuracy_score(y_val, y_val_pred)
                print(
                    f"hidden_layer_sizes={hidden_layer_sizes}, activation={activation}, solver={solver}, learning_rate_init={learning_rate_init}, val_accuracy={accuracy:.4f}"
                )
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_params = {
                        'hidden_layer_sizes': hidden_layer_sizes,
                        'activation': activation,
                        'solver': solver,
                        'learning_rate_init': learning_rate_init
                    }

print("\nBest Validation Accuracy:", best_accuracy)
print("Best Hyperparameters:", best_params)




hidden_layer_sizes=(50,), activation=tanh, solver=sgd, learning_rate_init=0.001, val_accuracy=0.8924




hidden_layer_sizes=(50,), activation=tanh, solver=sgd, learning_rate_init=0.01, val_accuracy=0.9150




hidden_layer_sizes=(50,), activation=tanh, solver=adam, learning_rate_init=0.001, val_accuracy=0.9169




hidden_layer_sizes=(50,), activation=tanh, solver=adam, learning_rate_init=0.01, val_accuracy=0.8927




hidden_layer_sizes=(50,), activation=relu, solver=sgd, learning_rate_init=0.001, val_accuracy=0.9108




hidden_layer_sizes=(50,), activation=relu, solver=sgd, learning_rate_init=0.01, val_accuracy=0.9250




hidden_layer_sizes=(50,), activation=relu, solver=adam, learning_rate_init=0.001, val_accuracy=0.9243
hidden_layer_sizes=(50,), activation=relu, solver=adam, learning_rate_init=0.01, val_accuracy=0.9239




hidden_layer_sizes=(100,), activation=tanh, solver=sgd, learning_rate_init=0.001, val_accuracy=0.9057




hidden_layer_sizes=(100,), activation=tanh, solver=sgd, learning_rate_init=0.01, val_accuracy=0.9257




hidden_layer_sizes=(100,), activation=tanh, solver=adam, learning_rate_init=0.001, val_accuracy=0.9127
hidden_layer_sizes=(100,), activation=tanh, solver=adam, learning_rate_init=0.01, val_accuracy=0.8982




hidden_layer_sizes=(100,), activation=relu, solver=sgd, learning_rate_init=0.001, val_accuracy=0.9257




hidden_layer_sizes=(100,), activation=relu, solver=sgd, learning_rate_init=0.01, val_accuracy=0.9337




hidden_layer_sizes=(100,), activation=relu, solver=adam, learning_rate_init=0.001, val_accuracy=0.9278
hidden_layer_sizes=(100,), activation=relu, solver=adam, learning_rate_init=0.01, val_accuracy=0.9247




hidden_layer_sizes=(50, 50), activation=tanh, solver=sgd, learning_rate_init=0.001, val_accuracy=0.9138




hidden_layer_sizes=(50, 50), activation=tanh, solver=sgd, learning_rate_init=0.01, val_accuracy=0.9173




hidden_layer_sizes=(50, 50), activation=tanh, solver=adam, learning_rate_init=0.001, val_accuracy=0.9077
hidden_layer_sizes=(50, 50), activation=tanh, solver=adam, learning_rate_init=0.01, val_accuracy=0.8848




hidden_layer_sizes=(50, 50), activation=relu, solver=sgd, learning_rate_init=0.001, val_accuracy=0.9257




hidden_layer_sizes=(50, 50), activation=relu, solver=sgd, learning_rate_init=0.01, val_accuracy=0.9242




hidden_layer_sizes=(50, 50), activation=relu, solver=adam, learning_rate_init=0.001, val_accuracy=0.9210
hidden_layer_sizes=(50, 50), activation=relu, solver=adam, learning_rate_init=0.01, val_accuracy=0.9189

Best Validation Accuracy: 0.9336666666666666
Best Hyperparameters: {'hidden_layer_sizes': (100,), 'activation': 'relu', 'solver': 'sgd', 'learning_rate_init': 0.01}


In [14]:
# Retrain MLPClassifier with the best hyperparameters
best_mlp = MLPClassifier(
    **best_params, max_iter=200, random_state=42
)
best_mlp.fit(X_train_scaled, y_train)

# Test the model
y_test_pred_mlp = best_mlp.predict(X_test_scaled)

# Calculate metrics
print("Neural Network Classification Report on Test Set:")
print(classification_report(y_test, y_test_pred_mlp))


Neural Network Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       888
           1       0.96      0.98      0.97      1011
           2       0.93      0.93      0.93       893
           3       0.92      0.90      0.91       920
           4       0.93      0.94      0.94       877
           5       0.91      0.90      0.90       813
           6       0.95      0.95      0.95       888
           7       0.94      0.94      0.94       940
           8       0.92      0.92      0.92       877
           9       0.91      0.92      0.92       893

    accuracy                           0.94      9000
   macro avg       0.93      0.93      0.93      9000
weighted avg       0.94      0.94      0.94      9000





# PART 4:

In [15]:
from sklearn.metrics import classification_report
import pandas as pd

# For KNN model
knn_report = classification_report(y_test, y_test_pred_knn, output_dict=True)

# For Neural Network model
mlp_report = classification_report(y_test, y_test_pred_mlp, output_dict=True)

# Extract overall metrics
metrics = ['accuracy', 'precision', 'recall', 'f1-score']

# For accuracy, we can get it directly from the reports
knn_accuracy = knn_report['accuracy']
mlp_accuracy = mlp_report['accuracy']

# For precision, recall, and f1-score, we can use 'weighted avg'
knn_precision = knn_report['weighted avg']['precision']
knn_recall = knn_report['weighted avg']['recall']
knn_f1 = knn_report['weighted avg']['f1-score']

mlp_precision = mlp_report['weighted avg']['precision']
mlp_recall = mlp_report['weighted avg']['recall']
mlp_f1 = mlp_report['weighted avg']['f1-score']

# Create a DataFrame to display the metrics
data = {
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score'],
    'KNN Model': [
        f"{knn_accuracy * 100:.2f}%",
        f"{knn_precision * 100:.2f}%",
        f"{knn_recall * 100:.2f}%",
        f"{knn_f1 * 100:.2f}%"
    ],
    'Neural Network Model': [
        f"{mlp_accuracy * 100:.2f}%",
        f"{mlp_precision * 100:.2f}%",
        f"{mlp_recall * 100:.2f}%",
        f"{mlp_f1 * 100:.2f}%"
    ]
}

comparison_df = pd.DataFrame(data)
print("Comparison of KNN and Neural Network Models:")
print(comparison_df)



Comparison of KNN and Neural Network Models:
      Metric KNN Model Neural Network Model
0   Accuracy    92.64%               93.56%
1  Precision    92.68%               93.55%
2     Recall    92.64%               93.56%
3   F1-Score    92.62%               93.55%
