In [104]:
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# Core packages used throughout this notebook:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # statistical plotting
import matplotlib.pyplot as plt # figure rendering
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# deprecation warnings raised by the libraries used in later cells.
warnings.filterwarnings('ignore')

# Input data files are available in the "../input/" directory.
# Listing that directory shows which datasets are attached to this session.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

## Importing Libraries and Dataset

In [105]:
# Locations of the digit-recognizer CSVs, relative to the notebook's working directory.
train_csv_path = "../input/digit-recognizer/train.csv"
test_csv_path = "../input/digit-recognizer/test.csv"

# Load both splits into DataFrames.
trainPCA = pd.read_csv(train_csv_path)
testPCA = pd.read_csv(test_csv_path)

# Report (rows, columns) of each split as a quick sanity check.
print(f"Shape of train dataframe: {trainPCA.shape}")
print(f"Shape of test dataframe: {testPCA.shape}")

In [106]:

import pandas as pd
import numpy as np

import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras import metrics
from tensorflow.keras import backend as K

## Dataset Overview

In [107]:
# Preview the first rows of the training frame.
trainPCA.head()

In [108]:
# Per-column summary statistics of the training frame.
trainPCA.describe()

## Preprocessing

### Data Cleaning and Normalization

In [109]:
# True if at least one value anywhere in the training frame is missing (NaN).
# Only the training frame is checked here; testPCA is not inspected.
trainPCA.isna().any().any()

NOTE:
The check above returns False, which means the training data contains no missing values.
No cleaning is required: there are no empty fields, so the data is already clean.

### Data Normalization

In [110]:
# dividing the data into the input and output features to train make the model learn based on what to take in and what to throw out.
# Separate the target column from the model inputs: the label is what the model
# predicts, every column from "pixel0" onward is an input feature.
mnist_train_label = trainPCA["label"]
mnist_train_data = trainPCA.loc[:, "pixel0":]

# Rescale pixel intensities by 255.0 (taken here as the maximum pixel value) so the
# features are intended to fall in the 0-1 range. The test frame gets the same scaling.
mnist_train_data = mnist_train_data.div(255.0)
mnist_test = testPCA.div(255.0)

In [111]:
# Display the scaled training pixels (values were divided by 255.0 in the previous cell).
mnist_train_data

In [112]:
# Display the scaled test pixels (testPCA divided by 255.0).
mnist_test

### Visualize a single digit from its pixel array

In [113]:
# Row of the training frame to render.
sample_index = 3

# Pull that row's pixel columns and lay the flat vector out as a 28x28 grid.
digit_array = trainPCA.loc[sample_index, "pixel0":]
arr = digit_array.to_numpy()
image_array = arr.reshape(28, 28)

# Draw the grid with the binary colormap and attach a colorbar for the intensity scale.
digit_img = plt.imshow(image_array, cmap=plt.cm.binary)
plt.colorbar(digit_img)

# Print the ground-truth label of the same row for comparison with the picture.
print("IMAGE LABEL: {}".format(trainPCA.loc[sample_index, "label"]))

# Part I - PCA Implementation on MNIST Digits


### Manual :

a) Compute standardization of data

In [114]:
from sklearn.preprocessing import StandardScaler

# Learn each pixel column's mean and standard deviation from the training pixels,
# then apply that standardization to the same data.
standardized_scalar = StandardScaler()
standardized_scalar.fit(mnist_train_data)
standardized_data_train = standardized_scalar.transform(mnist_train_data)

# (n_samples, n_features) of the standardized matrix.
standardized_data_train.shape

b) Calculate covariance matrix S(dxd)

In [115]:

# Sample covariance matrix S (d x d) of the standardized features.
# StandardScaler (default settings) has already centred every column, so
# X^T X / (n - 1) is the covariance; the bare product X^T X is only the
# unnormalized scatter matrix. The eigenvectors are the same either way, but the
# eigenvalues are only true variances once the normalization is applied.
n_samples = standardized_data_train.shape[0]
cov_matrix = (standardized_data_train.T @ standardized_data_train) / (n_samples - 1)
cov_matrix.shape

c) Calculate Eigen values and eigen vectors

In [116]:
from scipy.linalg import eigh

# Request only the two largest eigenpairs of the symmetric covariance matrix.
# `subset_by_index` replaces the deprecated `eigvals=` keyword (removed in recent
# SciPy releases), and the indices are derived from the matrix size instead of being
# hard-coded to 782/783.
# eigh returns eigenvalues in ascending order, so column 0 of `vectors` belongs to the
# second-largest eigenvalue and column 1 to the largest.
n_features = cov_matrix.shape[0]
lambdas, vectors = eigh(cov_matrix, subset_by_index=[n_features - 2, n_features - 1])
vectors.shape

In [117]:
# Put the selected eigenvectors in rows (one direction per row) so they can
# left-multiply the transposed data matrix in the projection step.
# Caution: this reassigns `vectors`, so re-running this cell on its own flips the
# orientation back again.
vectors = np.transpose(vectors)
vectors.shape

d) Calculate unit vectors U1=V1 and new coordinates

In [118]:
# Project every standardized sample onto the two eigenvector directions:
# (2 x d) @ (d x n) -> (2 x n).
projected = vectors @ standardized_data_train.T
print(projected.shape)

# Append the labels as a third row, then transpose so that each row is one sample:
# [coordinate 1, coordinate 2, label].
new_coordinates = np.vstack((projected, mnist_train_label)).T

In [119]:
# Display the scaled training pixel frame (the same object shown after normalization).
mnist_train_data

In [120]:
# Wrap the projected coordinates and their labels in a DataFrame for plotting.
pca_columns = ["f1", "f2", "labels"]
df_new = pd.DataFrame(data=new_coordinates, columns=pca_columns)
df_new.head()

e) Plot FacetGrid using seaborn

In [121]:
# Scatter the two manual PCA coordinates, one colour per digit label.
# `height` is the current name of FacetGrid's facet-size argument; the older `size`
# keyword was deprecated and is rejected by recent seaborn releases.
sns.FacetGrid(df_new, hue="labels", height=6).map(plt.scatter, "f1", "f2").add_legend()
plt.show()

### SKlearn :

In [122]:
from sklearn import decomposition

# Reduce the standardized training data to 150 principal components with scikit-learn.
pca = decomposition.PCA(n_components=150)
pca_data = pca.fit_transform(standardized_data_train)

# (n_samples, 150)
pca_data.shape

In [123]:

# Keep the label-free component matrix and the labels as the model inputs/targets.
X, y = pca_data, mnist_train_label

# Rebind `pca_data` to a copy with the labels appended as an extra last column.
# Caution: because `X` is read from `pca_data`, re-running this cell without first
# re-running the PCA cell would put the label column into X.
pca_data = np.vstack((X.T, y)).T

In [124]:
# Display X, the component matrix captured before the labels were appended to pca_data.
X

In [125]:
# Display pca_data, which now carries the label values as an extra last column.
pca_data

In [126]:
# Build the plotting frame for the scikit-learn section from the scikit-learn output.
# The original cell reused `new_coordinates` from the manual section, so this frame
# was just a copy of `df_new`. At this point `pca_data` holds the 150 components with
# the labels stacked on as the last column, so columns 0 and 1 are the first two
# components and column -1 is the label.
df_PCA = pd.DataFrame(pca_data[:, [0, 1, -1]], columns=["f1", "f2", "labels"])
df_PCA.head()

In [127]:
# Plot the `df_PCA` frame built for this section; the original cell re-plotted
# `df_new` from the manual section, leaving `df_PCA` unused.
# `height` is the current name of FacetGrid's facet-size argument (formerly `size`).
sns.FacetGrid(df_PCA, hue="labels", height=12).map(plt.scatter, "f1", "f2").add_legend()
# Save before show() so the written PNG contains the rendered figure.
plt.savefig("PCA_FacetGrid.png")
plt.show()

### PCA Dimension Reduction

In [128]:
# Refit the same PCA object keeping 784 components so the variance carried by each
# one can be inspected. This rebinds `pca_data` to the 784-component projection.
pca.set_params(n_components=784)
pca_data = pca.fit_transform(standardized_data_train)

# Share of the total variance explained by each component.
explained = pca.explained_variance_
percent_variance_retained = explained / explained.sum()

# Running total: variance retained when keeping the leading components.
cum_variance_retained = percent_variance_retained.cumsum()

In [129]:
# Cumulative explained-variance curve.
# Element i of `cum_variance_retained` is the variance kept by the first i + 1
# components, so the x-axis must start at 1; plotting against the bare array index
# shifted every point one component to the left.
component_counts = np.arange(1, len(cum_variance_retained) + 1)

plt.figure(1, figsize=(10, 6))
plt.clf()
plt.plot(component_counts, cum_variance_retained, linewidth=2)
plt.axis("tight")
plt.grid()
plt.xlabel("number of components")  # fixed typo in the axis label ("compoments")
plt.ylabel("cumulative variance retained")
# Save before show() so the written PNG contains the rendered figure.
plt.savefig("pca_cumulative_variance.png")
plt.show()

In [130]:
# Display the projection of the training data produced by the 784-component refit.
pca_data

## Models

In [131]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn import svm, metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier, plot_importance 
from sklearn.naive_bayes import GaussianNB

In [132]:
# Fixed seed so the hold-out split and the randomized estimators give the same
# results on every Restart & Run All.
SEED = 42

# Hold out 15% of the PCA-reduced samples for final evaluation.
# NOTE(review): X comes from a scaler and a PCA that were fit on all rows before this
# split, so the held-out rows already influenced the preprocessing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=SEED
)

# Candidate classifiers, all with library-default hyperparameters.
clf_rf = RandomForestClassifier(random_state=SEED)
clf_dt = DecisionTreeClassifier(random_state=SEED)
clf_knn = KNeighborsClassifier()
clf_nb = GaussianNB()
clf_lr = LogisticRegression()
clf_svm = SVC() #RBF

In [133]:
# Display names, aligned index-for-index with the estimators in `models`.
Classifiers = ['RandomForest','DecisionTree','KNN','Naive Bayes','LogisticRegression','SVM']
models = [clf_rf, clf_dt, clf_knn, clf_nb, clf_lr,clf_svm]

# Mean 10-fold cross-validated accuracy of every estimator on the training split,
# collected in the same order as `models`.
scores = []
for model in models:
    fold_accuracies = cross_val_score(
        model, X_train, y_train, scoring='accuracy', cv=10, n_jobs=-1
    )
    score = fold_accuracies.mean()
    scores.append(score)

In [134]:
# One row per classifier holding its mean CV accuracy, sorted best-first.
mode = (
    pd.DataFrame({'score': scores}, index=Classifiers)
    .sort_values(by='score', ascending=False)
)

In [135]:
# Display the classifiers ranked by mean cross-validated accuracy (best first).
mode 

In [136]:
#the accuracy score of the model
clf_svm = SVC()
clf_svm.fit(X_train,y_train)
y_train
