In [2]:
from PIL import Image
import numpy as np
import pandas as pd
from random import *
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import datetime

# 0.a Read Dataset

In [3]:
# Check time needed
start_time = datetime.datetime.now()

# Import data
df_train = pd.read_csv("mnist_train.csv", header = None)
df_test = pd.read_csv("mnist_test.csv", header = None)

# Display time needed
end_time = datetime.datetime.now() - start_time
print("time needed to load data : {}".format(end_time))

time needed to load data : 0:00:03.961297


In [35]:
print(len(df_train))
print(len(df_test))

60000
10000


In [8]:
# Valeurs des pixels
df_train.sample(n=3)[range(200, 210)]

Unnamed: 0,200,201,202,203,204,205,206,207,208,209
39710,0,0,0,0,0,0,63,175,252,252
11684,0,0,0,0,0,0,0,84,254,254
49141,0,0,0,0,0,0,0,0,0,0


In [9]:
# Valeurs du chiffre sur la photo
df_train.sample(n=3)[range(200, 210)]

Unnamed: 0,200,201,202,203,204,205,206,207,208,209
580,0,0,0,0,0,0,0,0,0,6
43314,0,0,0,59,253,184,11,0,0,0
49293,0,0,0,0,0,0,0,0,131,222


# 0.b Check one image

In [18]:
# np.array(l).reshape(shape, shape)

In [16]:
# Row number in test
N = 165

# All columns for this row
l = df_test[[x for x in range(1,785)]].iloc[N].tolist()

# Shape of image (sqrt of #columns because square image)
shape = int(np.sqrt(784))

# Create array
data1 = np.array(l).reshape(shape, shape)

# Image from PIL
img = Image.fromarray(np.uint8(data1 * 255) , 'L')

# Display
img.show()

# 0.c Reformat train and test

In [20]:
# Concat
df = pd.concat([df_train,df_test], axis=0)

# Columns
cols = [x for x in range(1,785)]

# Data X
mnist_data = df[cols]

# Data Y (target)
mnist_target = df[0]

# New trains/test
x_train, x_test, y_train, y_test = train_test_split(
    mnist_data, mnist_target, test_size=1/7.0, random_state=0
)

# 1.a Naïves Bayes

In [22]:
# NB
clf = MultinomialNB()

# Check time
start_time = datetime.datetime.now()

# Fit
clf.fit(x_train,y_train)

# End time
end_time = datetime.datetime.now() - start_time

# Display time
print("time needed to fit model date : {}".format(end_time))

# Perform the predictions
y_predicted = clf.predict(x_test)

# Calculate the accuracy of the prediction
print("Accuracy = {}".format(accuracy_score(y_test, y_predicted)*100))

# Confusion matrix
print("Classification Report \n {}".format(
    classification_report(y_test, y_predicted, labels=range(0,10)))
     )
confusion_matrix(y_test, y_predicted)

time needed to fit model date : 0:00:01.331956
Accuracy = 82.02000000000001
Classification Report 
               precision    recall  f1-score   support

           0       0.92      0.91      0.92       996
           1       0.89      0.94      0.91      1141
           2       0.86      0.82      0.84      1040
           3       0.78      0.79      0.79      1013
           4       0.83      0.75      0.79       962
           5       0.86      0.64      0.73       863
           6       0.88      0.91      0.90       989
           7       0.94      0.81      0.87      1064
           8       0.63      0.76      0.69       963
           9       0.68      0.82      0.74       969

    accuracy                           0.82     10000
   macro avg       0.83      0.82      0.82     10000
weighted avg       0.83      0.82      0.82     10000



array([[ 906,    0,    5,    5,    1,   11,   15,    0,   53,    0],
       [   0, 1075,    8,    9,    2,    2,    5,    0,   38,    2],
       [  15,   11,  854,   22,    7,    0,   53,   10,   65,    3],
       [   3,   21,   57,  805,    0,   20,    6,   12,   49,   40],
       [   0,    1,    4,    0,  723,    5,   17,    2,   42,  168],
       [  27,    6,   14,  113,   21,  549,   21,    3,   84,   25],
       [  12,   18,   15,    0,    6,   19,  904,    0,   15,    0],
       [   7,   14,   12,    7,   35,    0,    1,  861,   46,   81],
       [   3,   53,   14,   62,   15,   26,    8,    2,  732,   48],
       [   8,   14,    6,   13,   61,    3,    0,   25,   46,  793]])

# 1.b Regression logistique

In [23]:
len(x_test)

10000

In [24]:
len(x_train)

60000

In [26]:
# LR
clf = LogisticRegression(solver='lbfgs')

# Check time
start_time = datetime.datetime.now()

# Fit
clf.fit(x_train,y_train)

# End time
end_time = datetime.datetime.now() - start_time

# Display time
print("time needed to fit model date : {}".format(end_time))

# Perform the predictions
y_predicted = clf.predict(x_test)

# Calculate the accuracy of the prediction
print("Accuracy = {}".format(accuracy_score(y_test, y_predicted)*100))

# Confusion matrix
print("Classification Report \n {}".format(classification_report(y_test, y_predicted, labels=range(0,10))))
confusion_matrix(y_test, y_predicted)

time needed to fit model date : 0:00:18.173359
Accuracy = 91.84
Classification Report 
               precision    recall  f1-score   support

           0       0.97      0.97      0.97       996
           1       0.95      0.98      0.97      1141
           2       0.91      0.90      0.90      1040
           3       0.92      0.88      0.90      1013
           4       0.91      0.93      0.92       962
           5       0.89      0.86      0.88       863
           6       0.94      0.95      0.94       989
           7       0.93      0.92      0.92      1064
           8       0.87      0.89      0.88       963
           9       0.89      0.89      0.89       969

    accuracy                           0.92     10000
   macro avg       0.92      0.92      0.92     10000
weighted avg       0.92      0.92      0.92     10000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([[ 968,    0,    1,    1,    2,   10,    8,    2,    4,    0],
       [   0, 1115,    6,    2,    1,    4,    0,    4,    7,    2],
       [   4,   12,  933,   15,   12,    1,   15,    9,   35,    4],
       [   1,    5,   35,  892,    0,   25,    3,   12,   27,   13],
       [   1,    2,    5,    0,  898,    1,   11,   10,    6,   28],
       [   8,    2,    8,   31,    9,  746,   13,    5,   33,    8],
       [   8,    3,    8,    0,   11,   17,  936,    2,    4,    0],
       [   2,    5,   15,    4,   11,    2,    1,  980,    4,   40],
       [   4,   20,   11,   18,    7,   28,    9,    2,  854,   10],
       [   4,    4,    5,   10,   34,    8,    2,   32,    8,  862]])

In [19]:
sum((len(clf.coef_[x]) for x in range(len(clf.coef_))))

7840

## 2.a Scaling 
    - https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60

StandardScaler()

# 2.b Réduction de la dimension

In [33]:
# x_train
x_train
len(train_img[0])

784

In [37]:
# New trains/test
x_train, x_test, y_train, y_test = train_test_split(
    mnist_data, mnist_target, test_size=1/7.0, random_state=0
)

# Standard Scaler
scaler = StandardScaler()

# Fit on training set only.
scaler.fit(x_train)

# Apply transform to both the training set and the test set.
train_img = scaler.transform(x_train)
test_img = scaler.transform(x_test)

# Make an instance of the Model
pca = PCA(.95)

# Fit 
pca.fit(train_img)

# Transform both
x_train = pca.transform(train_img)
x_test = pca.transform(test_img)

In [38]:
len(x_train[0])

327

# 3.a Logistic Regression (après réduction de la dimension)

In [39]:
# LR
clf = LogisticRegression(solver = 'lbfgs')

# Check time
start_time = datetime.datetime.now()

# Fit
clf.fit(x_train,y_train)

# End time
end_time = datetime.datetime.now() - start_time

# Display time
print("time needed to fit model date : {}".format(end_time))

# Perform the predictions
y_predicted = clf.predict(x_test)

# Calculate the accuracy of the prediction
print("Accuracy = {}".format(accuracy_score(y_test, y_predicted)*100))

# Confusion matrix
print("Classification Report \n {}".format(classification_report(y_test, y_predicted, labels=range(0,10))))
confusion_matrix(y_test, y_predicted)

time needed to fit model date : 0:00:08.060889
Accuracy = 92.01
Classification Report 
               precision    recall  f1-score   support

           0       0.97      0.97      0.97       996
           1       0.95      0.97      0.96      1141
           2       0.90      0.90      0.90      1040
           3       0.91      0.88      0.90      1013
           4       0.91      0.93      0.92       962
           5       0.88      0.88      0.88       863
           6       0.94      0.95      0.95       989
           7       0.93      0.92      0.93      1064
           8       0.89      0.89      0.89       963
           9       0.89      0.90      0.89       969

    accuracy                           0.92     10000
   macro avg       0.92      0.92      0.92     10000
weighted avg       0.92      0.92      0.92     10000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([[ 967,    0,    2,    1,    1,   10,    9,    0,    5,    1],
       [   0, 1109,    8,    1,    1,    7,    0,    3,   10,    2],
       [   6,   16,  931,   19,   13,    3,   13,    9,   26,    4],
       [   1,    6,   38,  894,    1,   29,    1,   11,   19,   13],
       [   1,    3,    6,    0,  898,    0,   12,    9,    5,   28],
       [   7,    2,   10,   30,    8,  758,   14,    2,   26,    6],
       [   8,    2,    7,    0,   13,   14,  938,    2,    5,    0],
       [   2,    5,   15,    2,   13,    4,    0,  980,    5,   38],
       [   3,   19,    7,   20,    6,   24,    8,    3,  858,   15],
       [   3,    3,    5,   11,   30,    8,    1,   32,    8,  868]])

In [24]:
len(clf.coef_[0])

327

# 4.a Cross Validation - Naïve Bayes

In [40]:
from sklearn.model_selection import cross_val_score
clf = MultinomialNB()
scores_nb = cross_val_score(clf, mnist_data, mnist_target, cv=10)

In [26]:
pd.DataFrame(scores_nb).describe()

Unnamed: 0,0
count,10.0
mean,0.825729
std,0.012631
min,0.811571
25%,0.818821
50%,0.823214
75%,0.830179
max,0.853429


# 4.b Cross Validation - Logistic Regression

In [41]:
from sklearn.model_selection import cross_val_score
clf = LogisticRegression(solver = 'lbfgs')
scores = cross_val_score(clf, mnist_data, mnist_target, cv=10)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [28]:
pd.DataFrame(scores).describe()

Unnamed: 0,0
count,10.0
mean,0.921043
std,0.00597
min,0.913429
25%,0.917214
50%,0.919929
75%,0.924679
max,0.931286


# 4.b Cross Validation - Logistic Regression after Scaling + PCA

In [29]:
variance_kept = .95

In [30]:
scaler = StandardScaler()

# Fit on training set only.
scaler.fit(mnist_data)

# Apply transform to both the training set and the test set.
data_img = scaler.transform(mnist_data)

# PCA
pca = PCA(variance_kept)
pca.fit(data_img)
X = pca.transform(data_img)

# Logistic Regression
clf = LogisticRegression(solver = 'lbfgs')
scores_lr = cross_val_score(clf, X, mnist_target, cv=10)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [31]:
pd.DataFrame(scores_lr).describe()

Unnamed: 0,0
count,10.0
mean,0.922429
std,0.005781
min,0.915857
25%,0.918536
50%,0.921786
75%,0.923714
max,0.935


# 4.c Cross Validation - Logistic Regression after Scaling + PCA (Plusieurs variances gardées différentes)

In [32]:
from pprint import pprint


pprint(scores_lr_d)
d = {key : list(value) for key, value in scores_lr_d.items()}

pprint(d)


d = {key: np.mean(value) for key, value in d.items()}


pprint(d)


NameError: name 'scores_lr_d' is not defined

In [None]:
scores_lr_d = {}

for variance_kept in [0.90, 0.95, 0.96, 0.97, 0.98, 0.99]:
    
    print(variance_kept)
    
    # Scaler
    scaler = StandardScaler()

    # Fit on training set only.
    scaler.fit(mnist_data)

    # Apply transform to both the training set and the test set.
    data_img = scaler.transform(mnist_data)

    # PCA
    pca = PCA(variance_kept)
    pca.fit(data_img)
    X = pca.transform(data_img)

    # Logistic Regression
    clf = LogisticRegression(solver = 'lbfgs')
    scores_lr_ = cross_val_score(clf, X, mnist_target, cv=10)
    scores_lr_d[variance_kept] = scores_lr_

# DEV --  Representation

In [None]:
# all parameters not specified are set to their defaults
# default solver is incredibly slow which is why it was changed to 'lbfgs'
logisticRegr = LogisticRegression(solver = 'lbfgs')

logisticRegr.fit(train_img, train_lbl)

# Predict for One Observation (image)
logisticRegr.predict(test_img[0].reshape(1,-1))

# Predict for One Observation (image)
logisticRegr.predict(test_img[0:10])

logisticRegr.score(test_img, test_lbl)

###https://github.com/mGalarnyk/Python_Tutorials/blob/master/Sklearn/PCA/PCA_Image_Reconstruction_and_such.ipynb

In [None]:
import chainer
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
 
# Load the MNIST dataset from pre-inn chainer method
cols = [x for x in range(1,785)]
train, test = df_train, df_test
 
ROW = 4
COLUMN = 5
for i in range(ROW * COLUMN):
    # train[i][0] is i-th image data with size 28x28
    #plt.figure(figsize=(10,10))
    image = np.array(train.iloc[i][cols]).reshape(28, 28)   # not necessary to reshape if ndim is set to 2
    plt.subplot(ROW, COLUMN, i+1)# subplot with size (width 3, height 5)
    plt.imshow(image, cmap='gray')  # cmap='gray' is for black and white picture.
    # train[i][1] is i-th digit label
    plt.title('label = {}'.format(train.iloc[i][0]))
    plt.axis('off')  # do not show axis value
plt.tight_layout()   # automatic padding between subplots
plt.savefig('mnist_plot.png')
plt.show()