### Classification on MNIST

In [19]:
# loading in packages
import pandas as pd
import os

import numpy as np
from sklearn.decomposition import PCA

### Loading in data

In [26]:
# Build the paths to the train/test CSV files under the local data/ directory.
file_path_train = os.path.join("data", "mnist_train.csv")
file_path_test = os.path.join("data", "mnist_test.csv")

# Read each split into its own DataFrame.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (file_path_train, file_path_test)
)

# Separate every frame into predictors (all columns except 'label') and the
# 'label' column itself; train_df and test_df are left untouched.
x_train, y_train = train_df.drop(columns='label'), train_df['label']
x_test, y_test = test_df.drop(columns='label'), test_df['label']

### Data Preprocessing
We normalize the predictors (every column except the label) by dividing them by 255. Each predictor is an 8-bit grayscale pixel intensity in the range 0–255, so this rescales all features to the interval [0, 1].

In [28]:
# Divide all of the predictors by 255 (assumed to be the maximum raw pixel
# intensity) so the features are rescaled to [0, 1].
#
# The scaled frames are rebuilt from train_df/test_df instead of being computed
# from x_train/x_test themselves: a self-referential `x = x / 255` would divide
# the data again every time this cell is re-run, silently shrinking the values.
PIXEL_MAX = 255
x_train = train_df.drop(columns='label') / PIXEL_MAX
x_test = test_df.drop(columns='label') / PIXEL_MAX

Because the dataset is high-dimensional (one predictor column per pixel), we use PCA for dimensionality reduction. We keep just enough principal components to explain at least 90% of the variance in the training data.

In [29]:
# Fit a PCA that retains every component (n_components=None is the default);
# how many components to keep is decided later from the explained-variance ratios.
pca = PCA(n_components=None)
pca.fit(x_train)

In [35]:
# Keep the smallest number of leading principal components whose cumulative
# explained variance reaches the target share of the training-data variance.
VARIANCE_TARGET = 0.9
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)

# np.argmax on an all-False mask returns 0, which would silently keep a single
# component; fail loudly instead if the target can never be reached.
assert cumulative_variance_ratio[-1] >= VARIANCE_TARGET, (
    "fitted PCA does not explain the requested share of variance"
)

# argmax on a boolean array gives the index of the first True; +1 converts that
# zero-based index into a component count. int() yields a plain Python integer.
n_components = int(np.argmax(cumulative_variance_ratio >= VARIANCE_TARGET)) + 1

# pca.transform returns a numpy array with one column per fitted component;
# keep only the leading n_components columns.
pca_train = pca.transform(x_train)[:, :n_components]

# Turn the numpy array into a pandas DataFrame with columns PC1..PCn, reusing
# x_train's index so the rows stay aligned with y_train.
pca_train_df = pd.DataFrame(
    pca_train,
    index=x_train.index,
    columns=[f"PC{i}" for i in range(1, n_components + 1)],
)

print('We keep', n_components, 'principal components')

We keep 87 principal components


### Logistic Regression