In [1]:
import numpy as np
import pandas as pd

mnist_data = np.load('mnist.npz')

# Loading data 
X_train = mnist_data['x_train']
Y_train = mnist_data['y_train']
X_test = mnist_data['x_test']
Y_test = mnist_data['y_test']

# Filtering data for classes 0 and 1
is_class_01 = (Y_train == 0) | (Y_train == 1)
X_train = X_train[is_class_01]
Y_train = Y_train[is_class_01]

is_class_02 = (Y_test == 0) | (Y_test == 1)
X_test = X_test[is_class_02]
Y_test = Y_test[is_class_02]

# Normalize 
X_train = X_train.reshape(-1, 28 * 28) / 255.0
X_test = X_test.reshape(-1, 28 * 28) / 255.0
print(X_train.shape)

# Relabelling
Y_train = np.where(Y_train==0,-1,1)
Y_test = np.where(Y_test == 0,-1,1)

# Spliting the dataset
num_samples_class_0 = 1000
num_samples_class_1 = 1000

X_train_class_0 = X_train[Y_train == -1]
Y_train_class_0 = Y_train[Y_train == -1]

X_train_class_1 = X_train[Y_train == 1]
Y_train_class_1 = Y_train[Y_train == 1]

# X_test_class_2 = X_test[Y_test == -1]
# Y_test_class_2 = Y_test[Y_test == -1]

# X_test_class_3 = X_test[Y_test == 1]
# Y_test_class_3 = Y_test[Y_test == 1]

print(X_train_class_0)
# Validation set
X_train_val = np.vstack((X_train_class_0[:num_samples_class_0], X_train_class_1[:num_samples_class_1]))
Y_train_val = np.hstack((Y_train_class_0[:num_samples_class_0], Y_train_class_1[:num_samples_class_1]))

# Training set
X_train = np.vstack((X_train_class_0[num_samples_class_0:], X_train_class_1[num_samples_class_1:]))
Y_train = np.hstack((Y_train_class_0[num_samples_class_0:], Y_train_class_1[num_samples_class_1:]))

print(X_train_class_0[num_samples_class_0:].shape)
print(X_train_val.shape)
print(X_train.shape)
print(Y_train_val.shape)

#Mean of x_train and X_test
mean_vector = np.mean(X_train, axis=0)
mean_vector1 = np.mean(X_test, axis=0)

cov_matrix = np.cov((X_train - mean_vector).T)
cov_matrix1 = np.cov((X_test - mean_vector1).T)

eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)
eigenvalues1, eigenvectors1 = np.linalg.eigh(cov_matrix1)

# Sort eigenvectors 
sorted_indices = np.argsort(eigenvalues)[::-1]
top_p_indices = sorted_indices[:5]
sorted_indices1 = np.argsort(eigenvalues1)[::-1]
top_p_indices1 = sorted_indices1[:5]

pca_matrix = eigenvectors[:, top_p_indices]
pca_matrix1 = eigenvectors1[:, top_p_indices1]

print(pca_matrix.shape)
X_train_pca = np.dot(X_train - mean_vector, pca_matrix)
X_val_pca = np.dot(X_train_val - mean_vector, pca_matrix)
X_test_pca = np.dot(X_test - mean_vector1, pca_matrix)

print("Shape of reduced training data:", X_train_pca.shape)
print("Shape of reduced validation data:", X_val_pca.shape)


(12665, 784)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(4923, 784)
(2000, 784)
(10665, 784)
(2000,)
(784, 5)
Shape of reduced training data: (10665, 5)
Shape of reduced validation data: (2000, 5)


In [2]:
# Finding unique values for each dimension and Sorting unique values in ascending order 
unique_values_train = [np.unique(X_train_pca[:, i]) for i in range(X_train_pca.shape[1])]
sorted_unique_values_train = [np.sort(vals) for vals in unique_values_train]

unique_values_val = [np.unique(X_val_pca[:, i]) for i in range(X_val_pca.shape[1])]
sorted_unique_values_val = [np.sort(vals) for vals in unique_values_val]

print("training data after PCA:")
for i, vals in enumerate(sorted_unique_values_train):
    print(f"Dimension {i+1}: {vals}")
print("\nvalidation data after PCA:")
for i, vals in enumerate(sorted_unique_values_val):
    print(f"Dimension {i+1}: {vals}")


training data after PCA:
Dimension 1: [-9.14050887 -9.12628801 -8.98098105 ...  4.42735384  4.46459577
  4.4702065 ]
Dimension 2: [-6.06722951 -5.9901054  -5.54732112 ...  3.79543166  3.85617239
  3.90544954]
Dimension 3: [-5.56645578 -5.49364676 -5.47483353 ...  4.95061807  5.00174571
  5.00814084]
Dimension 4: [-5.31601761 -5.31149254 -5.22410596 ...  4.88780727  4.9698394
  5.09538818]
Dimension 5: [-3.6491333  -3.59755316 -3.57806115 ...  4.64196635  4.65935191
  5.42137539]

validation data after PCA:
Dimension 1: [-8.96990919 -8.59099587 -8.47369653 ...  4.32894399  4.3445438
  4.3650654 ]
Dimension 2: [-5.18271121 -4.98273991 -4.97617406 ...  3.61741336  3.78122422
  3.84658567]
Dimension 3: [-5.72106446 -5.32494584 -5.29495264 ...  4.82954624  4.8421774
  4.93588225]
Dimension 4: [-5.33371114 -5.32356401 -5.19678641 ...  4.21573374  4.25409641
  4.28758238]
Dimension 5: [-3.53086225 -3.48735364 -3.44431312 ...  3.64003466  3.83711548
  4.03330041]


In [3]:
# Weight calculation and best stump parameters
weights = np.ones(len(Y_train)) / len(Y_train)
best_dim = None
min_error = float('inf')

best_midpoint = None
# For avoiding division by zero
epsilon = 1e-10

best_direction = None

# Midpoint calculation
for dimension in range(X_train_pca.shape[1]):
    unique_vals = sorted_unique_values_train[dimension]
    
    midpoints = (unique_vals[:-1] + unique_vals[1:]) / 2
    
    for midpoint in midpoints:
        predictions = np.where(X_train_pca[:, dimension] <= midpoint, -1, 1)
        error = np.sum(weights[Y_train != predictions])
        
        #update best parameters
        if error < min_error:
            min_error = error

            best_dim = dimension
            best_midpoint = midpoint
            best_direction = predictions.copy()

  
alpha = 0.5 * np.log((1 - min_error+ epsilon) / (min_error+epsilon))

# Update weights
weights *= np.exp(-alpha * Y_train * best_direction)
weights /= np.sum(weights)

# H1(x)
h1_x = best_direction

print("Alpha:", alpha)

print("Best dim:", best_dim + 1)
print("Best midpoint:", best_midpoint)


Alpha: 2.679000173739841
Best dim: 1
Best midpoint: 0.7445328110204548


In [4]:
predictions_train = []
accuracies_val = []

for iteration in range(30):
    best_dim = None
    best_direction = None
    min_error = float('inf')
    # for avoiding division by zero
    epsilon = 1e-10  

    best_midpoint = None
    for dim in range(X_train_pca.shape[1]):
        unique_vals = sorted_unique_values_train[dim]

        midpoints = (unique_vals[:-1] + unique_vals[1:]) / 2

        # Midpoint calculation
        for midpoint in midpoints:
            predictions = np.where(X_train_pca[:, dim] <= midpoint, -1, 1)
            error = np.sum(weights[Y_train != predictions])

            if error < min_error:
                best_dim = dim
                min_error = error
                best_midpoint = midpoint
                best_direction = predictions.copy()

    alpha = 0.5 * np.log((1 - min_error+ epsilon) / (min_error+ epsilon))


    print("alpha",alpha)


    # Update weights 
    weights *= np.exp(-alpha * Y_train * best_direction)
    weights /= np.sum(weights)

    # H(x)
    h_x = best_direction
    predictions_train.append(h_x)

    # accuracy on validation set
    val_predictions = np.where(X_val_pca[:, best_dim] <= best_midpoint, -1, 1)
    accuracy_val = np.mean(Y_train_val == val_predictions)
    accuracies_val.append(accuracy_val)

    print(f"Iteration {iteration + 1}: Validation Accuracy = {accuracy_val}")

# Ploting accuracy on validation set vs. number of trees
plt.figure(figsize=(10, 10))
plt.plot(range(1, 31), accuracies_val, marker='o', linestyle='-')
plt.title('Accuracy on Validation Set vs. Number of Trees')
plt.xlabel('Number of Trees')
plt.ylabel('Accuracy')
plt.grid(True)
plt.ylim(-5, 1.0)
plt.show()

# index of the best accuracy
best_accuracy_index = np.argmax(accuracies_val)

# Evaluate the best tree on the test set
best_predictions_train = predictions_train[best_accuracy_index]

best_val_predictions = np.where(X_val_pca[:, best_dim] <= best_midpoint, -1, 1)

best_test_predictions = np.where(X_test_pca[:, best_dim] <= best_midpoint, -1, 1)

accuracy_test = np.mean(Y_test== best_test_predictions)

print(f"Best Test Acc = {accuracy_test}")

alpha 0.5870650045280318
Iteration 1: Validation Accuracy = 0.974
alpha 0.7206308952002222
Iteration 2: Validation Accuracy = 0.9725
alpha 0.6933211182847129
Iteration 3: Validation Accuracy = 0.5015
alpha 0.4672848060536943
Iteration 4: Validation Accuracy = 0.6875
alpha 0.47285593002262205
Iteration 5: Validation Accuracy = 0.9725
alpha 0.43952341640152087
Iteration 6: Validation Accuracy = 0.974
alpha 0.4019655511148384
Iteration 7: Validation Accuracy = 0.408
alpha 0.23743042793367233
Iteration 8: Validation Accuracy = 0.983
alpha 0.3618974560353523
Iteration 9: Validation Accuracy = 0.984
alpha 0.2576931419846353
Iteration 10: Validation Accuracy = 0.9725
alpha 0.31616488358467215
Iteration 11: Validation Accuracy = 0.4605
alpha 0.2669646503727792
Iteration 12: Validation Accuracy = 0.678
alpha 0.2508965149374222
Iteration 13: Validation Accuracy = 0.9725
alpha 0.2653905003493111
Iteration 14: Validation Accuracy = 0.6005
alpha 0.2056844187882642
Iteration 15: Validation Accuracy 

NameError: name 'plt' is not defined