In [1]:
import sys

# Fail fast when the interpreter is older than Python 3.7.
assert sys.version_info >= (3, 7)

from packaging import version
import sklearn

# Fail fast when the installed Scikit-Learn is older than 1.0.1.
assert version.parse(sklearn.__version__) >= version.parse("1.0.1")

import matplotlib.pyplot as plt

# Default font sizes applied to every figure drawn in this notebook.
plt.rcParams.update({
    'font.size': 14,
    'axes.labelsize': 14,
    'axes.titlesize': 14,
    'legend.fontsize': 14,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
})

In [24]:
from sklearn.datasets import fetch_openml

# Fetch the 'mnist_784' dataset from OpenML. as_frame=False asks for NumPy
# arrays instead of pandas objects (see the array output of `mnist.data`).
mnist = fetch_openml('mnist_784', as_frame=False)

In [26]:
# Peek at the raw feature array (NumPy abbreviates the display).
mnist.data

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [6]:
# Features (one flattened image per row) and their labels.
X = mnist.data
y = mnist.target
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [8]:
# (number of images, number of pixel features per image)
X.shape

(70000, 784)

In [16]:
# Value of one pixel: entry 149 of the flattened image at index 7.
X[7][149]

np.int64(0)

In [17]:
# The first 60,000 rows form the training set; the remaining rows are the test set.
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]

In [18]:
# Boolean targets for a binary "3 vs. not-3" detector. The comparison is
# against the string '3', so the labels are expected to be strings.
y_train_3 = (y_train == '3')  # True for all 3s, False for all other digits
y_test_3 = (y_test == '3')

In [19]:
from sklearn.linear_model import SGDClassifier

# Train a binary "is this digit a 3?" detector on the (un-augmented)
# training set. random_state is fixed to make the run reproducible.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_3)

In [20]:
# Sanity-check the detector on a single image. Note that X[7] lies inside
# the training slice (index < 60000), so this is not a held-out example.
some_digit = X[7]
sgd_clf.predict([some_digit])

array([False])

In [35]:
import numpy as np

def shift_image(image, direction):
    """
    Shift a flattened 28x28 MNIST image by one pixel in the given direction.

    The row/column that scrolls out of the frame is discarded and the
    row/column that scrolls in is filled with zeros. The input is not
    modified, and the result keeps the dtype of the input (previously the
    result was always float64, which silently changed the dtype of integer
    pixel data).

    Args:
        image (array-like): A flattened image with exactly 784 values.
        direction (str): The direction to shift ('left', 'right', 'up', 'down').

    Returns:
        np.ndarray: The shifted image, flattened to shape (784,), with the
        same dtype as ``image``.

    Raises:
        ValueError: If ``direction`` is not one of the four supported values,
            or if ``image`` cannot be reshaped to 28x28.
    """
    grid = np.asarray(image).reshape(28, 28)  # view the pixels as a 28x28 grid
    # zeros_like keeps the input dtype, so integer images stay integer.
    shifted_image = np.zeros_like(grid)

    if direction == 'left':
        shifted_image[:, :-1] = grid[:, 1:]
    elif direction == 'right':
        shifted_image[:, 1:] = grid[:, :-1]
    elif direction == 'up':
        shifted_image[:-1, :] = grid[1:, :]
    elif direction == 'down':
        shifted_image[1:, :] = grid[:-1, :]
    else:
        raise ValueError("Direction must be 'left', 'right', 'up', or 'down'")

    return shifted_image.reshape(-1)  # flatten back to 784 pixels

# Task
Augment the `X_train` and `y_train` datasets by creating four shifted copies (left, right, up, down) for each image in `X_train` using the `shift_image` function and concatenating them with the original datasets. Finally, confirm the successful augmentation of `X_train` and `y_train`.

## Create Shifted Copies

### Subtask:
Iterate through each image in the `X_train` dataset. For each image, create four shifted copies (left, right, up, down) using the `shift_image` function. Collect these new images and their corresponding labels.


**Reasoning**:
The subtask requires iterating through the training data, creating shifted copies of each image using the `shift_image` function, and collecting both the original and augmented images along with their labels into new lists. This code block will perform all these operations as per the instructions.



In [36]:
X_train_augmented = []
y_train_augmented = []

# For every training image keep the original, then add one copy shifted by a
# single pixel in each direction. Each of the five images shares the label.
for img, label in zip(X_train, y_train):
    X_train_augmented.append(img)
    X_train_augmented.extend(
        shift_image(img, direction)
        for direction in ('left', 'right', 'up', 'down')
    )
    y_train_augmented.extend([label] * 5)

print("Augmentation complete. The new datasets are stored in X_train_augmented and y_train_augmented.")

Augmentation complete. The new datasets are stored in X_train_augmented and y_train_augmented.


**Reasoning**:
The subtask is to confirm the successful augmentation of `X_train` and `y_train`. The previous step has already generated the augmented lists. Now, I will convert the augmented lists `X_train_augmented` and `y_train_augmented` into NumPy arrays for consistency and then check their shapes and lengths to confirm the augmentation.



In [38]:
# Turn the Python lists built during augmentation into NumPy arrays.
X_train_augmented = np.array(X_train_augmented)
y_train_augmented = np.array(y_train_augmented)

# Report the sizes before and after augmentation.
print(f"Original X_train shape: {X_train.shape}")
print(f"Augmented X_train shape: {X_train_augmented.shape}")
print(f"Original y_train shape: {y_train.shape}")
print(f"Augmented y_train shape: {y_train_augmented.shape}")

# One original plus four shifted copies per image: both arrays must be
# exactly five times as long as their un-augmented counterparts.
if (len(X_train_augmented), len(y_train_augmented)) != (5 * len(X_train), 5 * len(y_train)):
    print("Augmentation check failed: The shapes of the augmented datasets are not as expected.")
else:
    print("Augmentation confirmed: The datasets have been successfully augmented by a factor of 5.")

Original X_train shape: (60000, 784)
Augmented X_train shape: (300000, 784)
Original y_train shape: (60000,)
Augmented y_train shape: (300000,)
Augmentation confirmed: The datasets have been successfully augmented by a factor of 5.


## Summary:

### Data Analysis Key Findings
*   The `X_train` and `y_train` datasets were successfully augmented by creating four shifted copies (left, right, up, down) for each original image and its label, in addition to the original data.
*   The original `X_train` dataset had a shape of (60000, 784), and after augmentation, `X_train_augmented` grew to a shape of (300000, 784). This represents a 5x increase in the number of samples.
*   Similarly, the `y_train` dataset, with an original shape of (60000,), was augmented to `y_train_augmented` with a shape of (300000,), also indicating a 5x increase in the number of labels corresponding to the augmented images.
*   The augmentation process was confirmed to have successfully increased the dataset size by a factor of 5, as intended.

### Insights or Next Steps
*   This augmented dataset can now be used for training a machine learning model, which is expected to improve the model's generalization capabilities and robustness to minor shifts or translations in input images.
*   Consider exploring other augmentation techniques, such as rotations, scaling, or noise injection, to further enhance the diversity of the training data.


# Task
Initialize, train, and evaluate a K-Nearest Neighbors Classifier model using the augmented training data (`X_train_augmented`, `y_train_augmented`) and the test data (`X_test`, `y_test`), then summarize the model's performance by reporting its accuracy.

## Initialize KNN Model

### Subtask:
Initialize a K-Nearest Neighbors Classifier model from `sklearn.neighbors`.


**Reasoning**:
`KNeighborsClassifier` is not imported anywhere earlier in this notebook, so using it without an import would raise a `NameError`. This code block therefore imports it from `sklearn.neighbors` before initializing the classifier.



In [41]:
from sklearn.neighbors import KNeighborsClassifier

# No arguments are passed, so the classifier uses scikit-learn's defaults.
knn_clf = KNeighborsClassifier()
print("KNeighborsClassifier model initialized.")

KNeighborsClassifier model initialized.


**Reasoning**:
With the `KNeighborsClassifier` successfully initialized, the next step is to train the model using the augmented training data, `X_train_augmented` and `y_train_augmented`.



In [42]:
# Fit on the augmented training set. The targets are the original digit
# labels (not the binary "3" targets), so this is a multiclass model.
knn_clf.fit(X_train_augmented, y_train_augmented)
print("KNeighborsClassifier model trained successfully using augmented data.")

KNeighborsClassifier model trained successfully using augmented data.


## Evaluate KNN Model

### Subtask:
Make predictions on the test set (`X_test`) using the trained `knn_clf` model. These predictions will be used to calculate the model's accuracy.

#### Instructions:
1. Use the `predict` method of the `knn_clf` model to get predictions for `X_test`.

**Reasoning**:
As outlined in the subtask, the next step is to use the trained `knn_clf` model to make predictions on the `X_test` dataset. This will generate the predicted labels which are necessary for evaluating the model's performance.



In [43]:
# Predict a digit label for every (un-augmented) test image.
y_pred = knn_clf.predict(X_test)
print("Predictions on X_test generated successfully.")

Predictions on X_test generated successfully.


**Reasoning**:
The next step is to evaluate the model's performance by calculating its accuracy. I will use `sklearn.metrics.accuracy_score` to compare the predicted labels (`y_pred`) with the true labels (`y_test`).



In [44]:
from sklearn.metrics import accuracy_score

# Score against y_test (the digit labels): the model was trained on digit
# labels, so the binary y_test_3 targets would not be comparable to y_pred.
accuracy = accuracy_score(y_test, y_pred)
print(f"KNN Model Accuracy on the test set: {accuracy:.4f}")

KNN Model Accuracy on the test set: 0.9754


## Final Task

### Subtask:
Provide a summary of the KNN model's training and evaluation, including the measured accuracy on the test set.


## Summary:

### Q&A
The K-Nearest Neighbors Classifier model achieved an accuracy of 0.9754 on the test set.

### Data Analysis Key Findings
*   A K-Nearest Neighbors Classifier model was successfully initialized and trained using augmented training data.
*   Predictions were generated on the test set using the trained model.
*   The model's accuracy on the test set was calculated to be 0.9754.

### Insights or Next Steps
*   The high accuracy of 0.9754 suggests that the KNN model, trained with augmented data, performs very well in classifying the test set.
*   Further analysis could involve exploring other performance metrics (e.g., precision, recall, F1-score) or cross-validation to ensure the model's robustness and generalizability.
