# PCA + Logistic Regression (MNIST)

In [1]:
from sklearn.datasets import fetch_mldata
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
from mlxtend.data import loadlocal_mnist

In [3]:
mnist = fetch_mldata('MNIST original')



In [4]:
mnist

{'DESCR': 'mldata.org dataset: mnist-original',
 'COL_NAMES': ['label', 'data'],
 'target': array([0., 0., 0., ..., 9., 9., 9.]),
 'data': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)}

In [5]:
# images
mnist.data.shape

(70000, 784)

In [6]:
# labels
mnist.target.shape

(70000,)

In [7]:
#splitting data into training and test sets
train_img, test_img, train_lbl, test_lbl = train_test_split(
    mnist.data, mnist.target, test_size=1/7.0, random_state=0)

In [8]:
print(train_img.shape)

(60000, 784)


In [9]:
print(train_lbl.shape)

(60000,)


In [10]:
print(test_img.shape)

(10000, 784)


In [11]:
print(test_lbl.shape)

(10000,)


In [12]:
#standardize the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Fit on training set only.
scaler.fit(train_img)

# Apply transform to both the training set and the test set.
train_img = scaler.transform(train_img)
test_img = scaler.transform(test_img)



In [13]:
from sklearn.decomposition import PCA

In [14]:
# instance of model
pca = PCA(.95)

In [15]:
# fit pca on training set
pca.fit(train_img)

PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [16]:
# no. of components when 95% variance retained
pca.n_components_

330

In [17]:
# apply mapping to both training and test data
train_img = pca.transform(train_img)
test_img = pca.transform(test_img)

In [18]:
# import model to train on dataset after applying pca
from sklearn.linear_model import LogisticRegression

In [19]:
# instance of model
logisticRegr = LogisticRegression(solver = 'lbfgs')

In [20]:
# training
logisticRegr.fit(train_img, train_lbl)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [21]:
# prediction
logisticRegr.predict(test_img[0].reshape(1,-1))

array([1.])

In [22]:
logisticRegr.predict(test_img[0:10])

array([1., 9., 2., 2., 7., 1., 8., 3., 3., 7.])

In [23]:
# measuring performance using accuracy
score = logisticRegr.score(test_img, test_lbl)
print(score)

0.92


In [24]:
# number of components and correponding time taken for training
pd.DataFrame(data = [[1.00, 784, 48.94, .9158],
                     [.99, 541, 34.69, .9169],
                     [.95, 330, 13.89, .92],
                     [.90, 236, 10.56, .9168],
                     [.85, 184, 8.85, .9156]], 
             columns = ['Variance Retained',
                      'Number of Components', 
                      'Time (seconds)',
                      'Accuracy'])

Unnamed: 0,Variance Retained,Number of Components,Time (seconds),Accuracy
0,1.0,784,48.94,0.9158
1,0.99,541,34.69,0.9169
2,0.95,330,13.89,0.92
3,0.9,236,10.56,0.9168
4,0.85,184,8.85,0.9156
