In [2]:
import sys

assert sys.version_info >= (3, 7)

It also requires Scikit-Learn ≥ 1.0.1:

In [3]:
from packaging import version
import sklearn

assert version.parse(sklearn.__version__) >= version.parse("1.0.1")

Just like in the previous chapter, let's define the default font sizes to make the figures prettier:

In [4]:
import matplotlib.pyplot as plt

# Base font size for all figure text.
plt.rc('font', size=14)
# Axis labels and axis titles use the same size as the base font.
plt.rc('axes', labelsize=14, titlesize=14)
plt.rc('legend', fontsize=14)
# Tick labels are set smaller than the rest of the text.
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)

And let's create the `images/classification` folder (if it doesn't already exist), and define the `save_fig()` function, which is used throughout this notebook to save the figures in high resolution for the book:

In [5]:
from pathlib import Path

IMAGES_PATH = Path() / "images" / "classification"
IMAGES_PATH.mkdir(parents=True, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure inside ``IMAGES_PATH``.

    Parameters
    ----------
    fig_id : str
        File name without extension.
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is called before saving.
    fig_extension : str, default "png"
        Used both as the file suffix and as the ``format`` passed to
        ``plt.savefig``.
    resolution : int, default 300
        Passed to ``plt.savefig`` as ``dpi``.
    """
    if tight_layout:
        plt.tight_layout()
    destination = IMAGES_PATH / f"{fig_id}.{fig_extension}"
    plt.savefig(destination, format=fig_extension, dpi=resolution)

# MNIST

The website hosting the MNIST data seems to be down at the moment. You can load the data from a CSV file that I've put on GitHub.

In [6]:
# from sklearn.datasets import fetch_openml
# mnist = fetch_openml('mnist_784', as_frame=False)

# # extra code – it's a bit too long
# print(mnist.DESCR)

# mnist.keys()  # extra code – we only use data and target in this notebook

# X, y = mnist.data, mnist.target

In [None]:
import os

import pandas as pd

# Path to the MNIST CSV file. The default is the original author's local copy;
# set the MNIST_CSV_PATH environment variable to point at your own download
# instead of editing this cell.
MNIST_CSV_PATH = os.environ.get("MNIST_CSV_PATH", "C:/Users/heath/Documents/mnist_784.csv")
df = pd.read_csv(MNIST_CSV_PATH)
# Preview only the first few rows rather than requesting 1000 of them.
df.head()

In [None]:
df.describe()


In [None]:
df.info()

In [None]:
df.shape[0]

### Q1:
What does `head(2)` do?

### A1:
head(2) shows the first 2 entries of a data frame. 

In [11]:
# Features: the first 784 columns (one per pixel). Labels: the `class` column,
# converted to strings. `to_numpy()` is used instead of `.values` so that both
# X and y are guaranteed to be plain NumPy arrays regardless of the pandas
# dtype backing the columns.
X = df.iloc[:, :784].to_numpy()
y = df['class'].astype(str).to_numpy()

### Q2:
 What is the `iloc` operator?  How is it different from the `loc` operator?

### A2:
Both `iloc` and `loc` are indexing operators that pull out the rows and columns specified in the square brackets. `iloc` selects by integer position (hence the "i"), while `loc` selects by label.

In [None]:
X

In [None]:
X.shape

In [None]:
y

In [None]:
y.shape

In [None]:
28 * 28

In [None]:
X[0].shape

In [None]:
# Inspect the first row of the DataFrame next to the first row of X.
# `df[0]` would look up a *column* labelled 0 (KeyError here), and `test[-1]`
# would look up a *label* -1, so both selections are done by position instead.
test = df.iloc[0]
# Last entry of the row (expected to be the `class` label — confirm against the CSV layout).
final_el = test.iloc[-1]
print(final_el)
print(X[0])

In [None]:
import matplotlib.pyplot as plt

def plot_digit(image_data):
    """Draw one digit on the current axes.

    ``image_data`` is reshaped to 28x28, shown with the "binary" colormap,
    and the axes are hidden.
    """
    plt.imshow(image_data.reshape(28, 28), cmap="binary")
    plt.axis("off")

# Display the first image in X and save the figure through save_fig.
some_digit = X[0]
plot_digit(some_digit)
save_fig("some_digit_plot")  # extra code
plt.show()

# **Modified Cell**

In [37]:
import numpy as np

def shift_image(image):
    """Return a digit image together with its four one-pixel shifts.

    Parameters
    ----------
    image : array-like
        A single digit given as 784 pixel values, either flat or already
        arranged as a 28x28 grid. The label must NOT be included.

    Returns
    -------
    numpy.ndarray
        Array of shape (5, 784). The rows are, in order: the original image,
        then the image shifted one pixel up, down, left and right. Pixels
        vacated by a shift are filled with 0.

    Raises
    ------
    ValueError
        If ``image`` does not contain exactly 784 values.
    """
    side = 28
    pixels = np.asarray(image)
    if pixels.size != side * side:
        raise ValueError(
            f"Invalid pixel dimensions on image: expected {side * side} values, got {pixels.size}"
        )
    grid = pixels.reshape(side, side)

    # Start from all-zero canvases so the row/column vacated by each shift is blank.
    up_image = np.zeros_like(grid)
    down_image = np.zeros_like(grid)
    left_image = np.zeros_like(grid)
    right_image = np.zeros_like(grid)

    up_image[:-1, :] = grid[1:, :]      # every row moves one step towards the top
    down_image[1:, :] = grid[:-1, :]    # every row moves one step towards the bottom
    left_image[:, :-1] = grid[:, 1:]    # every column moves one step to the left
    right_image[:, 1:] = grid[:, :-1]   # every column moves one step to the right

    # Flatten each 28x28 grid back to a single 784-value row and stack the rows.
    variants = (grid, up_image, down_image, left_image, right_image)
    return np.concatenate([variant.reshape(1, side * side) for variant in variants])

    

# shift_image(X[0])
# plt.show()

In [21]:
# row = df.iloc[0].values
# up, down, left, right = shift_image(row)
# print(up)
# print(down)
# # concatonate 4 copies to original dataframe
# df = pd.concat([df, up])
# df = pd.concat([df, down])
# df = pd.concat([df, left])
# df = pd.concat([df, right])

# df.info()

In [None]:
# Augment the data set with four shifted copies (up, down, left, right) of
# every digit. All images are shifted at once with NumPy slicing and appended
# with a single pd.concat, instead of growing the DataFrame inside a Python
# loop (which copies the whole frame on every iteration).
# NOTE: this cell rebinds `df`, so running it twice augments the data twice.
# It also needs several GB of memory for the full MNIST frame.
pixel_columns = df.columns[:784]
grids = df.iloc[:, :784].to_numpy().reshape(-1, 28, 28)
labels = df['class'].to_numpy()

# shifted[i, k] is the k-th shifted copy of image i; untouched cells stay 0.
shifted = np.zeros((len(grids), 4, 28, 28), dtype=grids.dtype)
shifted[:, 0, :-1, :] = grids[:, 1:, :]   # up
shifted[:, 1, 1:, :] = grids[:, :-1, :]   # down
shifted[:, 2, :, :-1] = grids[:, :, 1:]   # left
shifted[:, 3, :, 1:] = grids[:, :, :-1]   # right

# One row per shifted image; the four copies of image i are consecutive rows,
# so each label is repeated four times in place.
df_shifted = pd.DataFrame(shifted.reshape(-1, 784), columns=pixel_columns)
df_shifted['class'] = np.repeat(labels, 4)

df = pd.concat([df, df_shifted], ignore_index=True)

df.info()

In [None]:
df.info()

In [None]:
# extra code – this cell generates and saves Figure 3–2
# Draw the first 100 images of X on a 10x10 grid of subplots.
plt.figure(figsize=(9, 9))
for idx, image_data in enumerate(X[:100]):
    # subplot positions are 1-based, while enumerate counts from 0
    plt.subplot(10, 10, idx + 1)
    plot_digit(image_data)
# Remove the spacing between subplots so the digits tile the figure.
plt.subplots_adjust(wspace=0, hspace=0)
# tight_layout is turned off here — presumably so it does not undo the spacing set above.
save_fig("more_digits_plot", tight_layout=False)
plt.show()

### Q3: 
What does the Python function `enumerate` do?

### A3:
Enumerate is a built-in function that makes it simpler to keep track of the indices when iterating over an iterable object. 

### Q4: 
What is an SGD Classifier?  (Hint:  go to the documentation on the sklearn website).  


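### A4: An SGD classifier is a linear classifier (by default a linear SVM) trained with stochastic gradient descent: it updates the model using randomly chosen training instances one at a time, which lets it process large datasets efficiently. It classifies by finding dividing lines (hyperplanes) suitable for the dataset. The three gradient descent variants are batch gradient descent, stochastic gradient descent, and mini-batch gradient descent.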

### Q5:
Describe what is going on inside the `for` loop 



### A5: The for loop creates a classifier using the clone and validates predictions using training sets and validation sets

### Q6:
Give an example of how you would use the confusion matrix in a report on your model, i.e. write a sentence that gives the reader an idea of performance on the task of predicting 5s

The confusion matrix tells us that the model is imperfect, because it produces both false negatives and false positives. We can also calculate the precision by dividing the number of true positives by the total number of predicted positives (true positives + false positives).

### A6:

### Q7:
What's going on with the funky blue squiggles in the upper right hand corner?

### A7:

In general, precision will increase with the threshold, but this isn't guaranteed. The blue squiggles are indicative of this imperfect relationship. As the threshold increases, false positives become true negatives (which raises precision), but some true positives also become false negatives (which can lower it). At very high thresholds only a few predicted positives remain, so each such change moves the precision noticeably.

### Q8:
Explain in simple words why the precision does *not* go to zero as the threshold moves lower, but the recall *does* always go to one

### A8:
Even with a very low threshold, the model still correctly identifies the positive cases, so precision never falls below the fraction of actual positives in the data and cannot reach zero. Recall always reaches one at a low enough threshold because, when everything is classified as positive, the model captures all true positive cases; this leads to perfect recall even though it also includes many false positives.

### Q9: 
Which do you find more informative, the precision recall curve or the ROC curve?  Why?

### A9:

Since the positive class is not rare in our dataset, the ROC curve is more informative for us. We also don't particularly care about false positives more than false negatives.

In [27]:
# The first 60,000 rows form the training set; the remaining rows form the test set.
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
# Shuffle the training set with a fixed seed so that Restart & Run All yields
# the same ordering (and therefore the same cross-validation folds) every time.
shuffle_index = np.random.default_rng(42).permutation(len(X_train))
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

In [None]:
# Build the augmented training set by stacking, along axis 0, whatever
# shift_image returns for each training image.
# NOTE(review): this needs shift_image to return a 2-D array of rows per image — confirm.
X_train_shifted = np.concatenate([shift_image(image) for image in X_train])

In [None]:
X_train_shifted.shape 

In [None]:
# Toy example of repeating labels: each row of the (2, 1) array is repeated
# twice along axis 1, then the result is reshaped back into a single column.
a = np.asarray([[0], [1]])
a.repeat(2, axis=1).reshape(-1, 1)

In [None]:
# Repeat every label 5 times in place so the labels line up with the augmented
# images (assumes X_train_shifted holds 5 consecutive rows per training image:
# the original plus 4 shifts — confirm against shift_image).
# y_train is 1-D, so `repeat(5, axis=1)` raises an AxisError; np.repeat without
# an axis repeats each element and keeps the result 1-D, which is the shape
# scikit-learn estimators expect for y.
y_train_shifed = np.repeat(y_train, 5)
y_train_shifed.shape

In [44]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

forest_clf = RandomForestClassifier(random_state=42)

In [None]:
cross_val_score(forest_clf, X_train, y_train, cv=3, scoring="accuracy")

In [None]:
cross_val_score(forest_clf, X_train_shifted, y_train_shifed, cv=3, scoring="accuracy")