# Numpy, Matplotlib and Sklearn Tutorial

#### Q1:
Please use logistic regression (default parameters) in sklearn to classify the data above, and print the training accuracy and test accuracy.

In [0]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Download MNIST (cached under ./). `fetch_mldata` has been removed from
# scikit-learn and mldata.org is offline, so the OpenML copy is used instead.
# as_frame=False asks for plain NumPy arrays rather than pandas objects.
mnist = fetch_openml('mnist_784', version=1, as_frame=False, data_home='./')

# 'mnist.data' is a 70k x 784 array, each row holds the pixels of one 28x28=784 image
# 'mnist.target' is a length-70k array, each entry is the class label of the corresponding image
images = mnist.data
targets = mnist.target

# rescale pixel values from [0, 255] to [0, 1] for further processing
X = images / 255.
Y = targets

# Split into train and test (for faster calculation, use only every 10th sample).
# A fixed random_state makes the printed accuracies reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X[::10], Y[::10], test_size=1000, random_state=0)

# Logistic regression with default parameters, as the exercise requires
clf = LogisticRegression()
clf.fit(X_train, Y_train)

prediction_train = clf.predict(X_train)
train_accuracy = accuracy_score(Y_train, prediction_train)
prediction_test = clf.predict(X_test)
test_accuracy = accuracy_score(Y_test, prediction_test)

print('Training accuracy: %0.2f%%' % (train_accuracy*100))
print('Testing accuracy: %0.2f%%' % (test_accuracy*100))

Training accuracy: 95.67%

Testing accuracy: 90.40%

#### Q2:
Please use Naive Bayes (Bernoulli, default parameters) in sklearn to classify the data above, and print the training accuracy and test accuracy.

In [0]:
# Q2: Bernoulli naive Bayes (default parameters) on the MNIST subset
from sklearn.naive_bayes import BernoulliNB
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Download MNIST (cached under ./). `fetch_mldata` has been removed from
# scikit-learn and mldata.org is offline, so the OpenML copy is used instead.
# as_frame=False asks for plain NumPy arrays rather than pandas objects.
mnist = fetch_openml('mnist_784', version=1, as_frame=False, data_home='./')

# 'mnist.data' is a 70k x 784 array, each row holds the pixels of one 28x28=784 image
# 'mnist.target' is a length-70k array, each entry is the class label of the corresponding image
images = mnist.data
targets = mnist.target

# rescale pixel values from [0, 255] to [0, 1] for further processing
X = images / 255.
Y = targets

# Use every 10th sample for speed; a fixed random_state keeps the split
# (and therefore the printed accuracies) reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X[::10], Y[::10], test_size=1000, random_state=0)

clf = BernoulliNB()
clf.fit(X_train, Y_train)

prediction_train = clf.predict(X_train)
train_accuracy = accuracy_score(Y_train, prediction_train)
prediction_test = clf.predict(X_test)
test_accuracy = accuracy_score(Y_test, prediction_test)

print('Training accuracy: %0.2f%%' % (train_accuracy*100))
print('Testing accuracy: %0.2f%%' % (test_accuracy*100))

 Training accuracy: 83.80%
 
 Testing accuracy: 84.50%

#### Q3:
Please use a support vector machine (default parameters) in sklearn to classify the data above, and print the training accuracy and test accuracy.

In [0]:
# Q3: linear support vector machine (default parameters) on the MNIST subset
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC

# Download MNIST (cached under ./). `fetch_mldata` has been removed from
# scikit-learn and mldata.org is offline, so the OpenML copy is used instead.
# as_frame=False asks for plain NumPy arrays rather than pandas objects.
mnist = fetch_openml('mnist_784', version=1, as_frame=False, data_home='./')

# 'mnist.data' is a 70k x 784 array, each row holds the pixels of one 28x28=784 image
# 'mnist.target' is a length-70k array, each entry is the class label of the corresponding image
images = mnist.data
targets = mnist.target

# rescale pixel values from [0, 255] to [0, 1] for further processing
X = images / 255.
Y = targets

# Use every 10th sample for speed; a fixed random_state keeps the split
# (and therefore the printed accuracies) reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X[::10], Y[::10], test_size=1000, random_state=0)

lsvc = LinearSVC()
lsvc.fit(X_train, Y_train)

prediction_train = lsvc.predict(X_train)
train_accuracy = accuracy_score(Y_train, prediction_train)
prediction_test = lsvc.predict(X_test)
test_accuracy = accuracy_score(Y_test, prediction_test)

print('Training accuracy: %0.2f%%' % (train_accuracy*100))
print('Testing accuracy: %0.2f%%' % (test_accuracy*100))

Training accuracy: 98.38%

Testing accuracy: 85.50%

#### Q4:
Please adjust the parameters of the SVM to increase the testing accuracy, and print the training accuracy and test accuracy.

In [0]:
# Q4: linear SVM with adjusted parameters on the MNIST subset
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC

# Download MNIST (cached under ./). `fetch_mldata` has been removed from
# scikit-learn and mldata.org is offline, so the OpenML copy is used instead.
# as_frame=False asks for plain NumPy arrays rather than pandas objects.
mnist = fetch_openml('mnist_784', version=1, as_frame=False, data_home='./')

# 'mnist.data' is a 70k x 784 array, each row holds the pixels of one 28x28=784 image
# 'mnist.target' is a length-70k array, each entry is the class label of the corresponding image
images = mnist.data
targets = mnist.target

# rescale pixel values from [0, 255] to [0, 1] for further processing
X = images / 255.
Y = targets

# Use every 10th sample for speed. The fixed random_state makes the split
# reproducible, so a change in accuracy reflects the parameter change below
# rather than a different random partition.
X_train, X_test, Y_train, Y_test = train_test_split(
    X[::10], Y[::10], test_size=1000, random_state=0)

# C is lowered to 0.01 (stronger regularisation) and tol is set to 1e-05;
# the remaining arguments are spelled out explicitly.
lsvc = LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
                 intercept_scaling=1, loss='squared_hinge', max_iter=1000,
                 multi_class='ovr', penalty='l2', random_state=0, tol=1e-05,
                 verbose=0)
lsvc.fit(X_train, Y_train)

prediction_train = lsvc.predict(X_train)
train_accuracy = accuracy_score(Y_train, prediction_train)
prediction_test = lsvc.predict(X_test)
test_accuracy = accuracy_score(Y_test, prediction_test)

print('Training accuracy: %0.2f%%' % (train_accuracy*100))
print('Testing accuracy: %0.2f%%' % (test_accuracy*100))

Training accuracy: 92.85%

Testing accuracy: 91.20%