## Chapter 3 - Classification

In [1]:
import pickle

import pandas as pd
import numpy as np
import numpy.random as rnd
import matplotlib
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

def load(fname):
    """Return the MNIST dataset, using ``fname`` as a local pickle cache.

    If ``fname`` exists it is unpickled and returned.  Otherwise the dataset
    is fetched with ``fetch_openml('mnist_784', version=1, cache=True)``,
    pickled to ``fname`` so later calls can skip the fetch, and returned.

    Parameters
    ----------
    fname : str or path-like
        Location of the pickle cache file.

    Returns
    -------
    object
        The object unpickled from ``fname`` or, on a cache miss, the object
        returned by ``fetch_openml``.
    """
    try:
        # NOTE: unpickling can execute arbitrary code. Only point this at a
        # cache file that was written by this function.
        with open(fname, 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        # Imported lazily: only needed when the cache file does not exist yet.
        from sklearn.datasets import fetch_openml
        mnist = fetch_openml('mnist_784', version=1, cache=True)
        with open(fname, 'wb') as f:
            # pickle.dump() returns None, so its result must not be bound to
            # `mnist` -- otherwise the first (cache-miss) call returns None.
            pickle.dump(mnist, f)
        return mnist

### Classifying with k-Nearest Neighbours (kNN)

In [2]:
# Load MNIST through the pickle-backed helper, then pull out features and labels.
mnist_data = load('mnist.data.pkl')
X = mnist_data['data']
y = mnist_data['target']

# Hold out 15% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=0,
)

# Cast both label sets to int.
y_train = y_train.astype(int)
y_test = y_test.astype(int)

In this case, we predict the digit (0–9) shown in each MNIST image from its pixel values.

In [3]:
# Build a k-nearest-neighbours classifier with the library defaults and fit it
# on the training split.  The fit call stays as the cell's last expression so
# the notebook displays the fitted estimator.
knn_clf = KNeighborsClassifier()
knn_clf.fit(X=X_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [9]:
# Predict labels for the first 50 test samples, then show the first 25 true
# labels on one line and the first 25 predicted labels on the next.
y_predict = knn_clf.predict(X_test[:50])
print(y_test[:25], y_predict[:25], sep='\n')

[0 4 1 2 7 9 7 1 1 7 1 3 4 2 6 4 6 7 3 3 7 0 5 7 6]
[0 4 1 2 7 9 7 1 1 7 1 3 4 2 6 4 6 7 3 3 7 0 5 7 6]


### Classifying with Probability Theory: Naïve Bayes

In [5]:
# Fetch the 20-newsgroups data and separate the documents from their targets.
ng = fetch_20newsgroups()
Xng = ng['data']
yng = ng['target']

# Keep 20% of the documents aside for testing, with a fixed seed.
X_train_ng, X_test_ng, y_train_ng, y_test_ng = train_test_split(
    Xng,
    yng,
    test_size=0.20,
    random_state=0,
)

In [6]:
# Fit the TF-IDF vectorizer on the training documents and convert them to a
# feature matrix in one step.
vec = TfidfVectorizer()
Xng_matrix = vec.fit_transform(X_train_ng)

# Fit a multinomial Naive Bayes model on the TF-IDF features.  The fit call is
# the cell's last expression so the notebook displays the fitted estimator.
nb_clf = MultinomialNB()
nb_clf.fit(X=Xng_matrix, y=y_train_ng)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [7]:
# Vectorize the held-out documents with the already-fitted vectorizer
# (transform only, no refit), then predict their newsgroup labels.
Xng_test_matrix = vec.transform(X_test_ng)
y_predict_ng = nb_clf.predict(X=Xng_test_matrix)

In [8]:
# Spot check: first 20 true labels on one line, first 20 predictions on the next.
print(y_test_ng[:20], y_predict_ng[:20], sep='\n')

[ 1 12 13 14  9  9 11  8 14 11 10  9  4  9  0  9 13 14  7  5]
[ 1  1 13 14  9  9 11  8 14 11  9  9  4  9  0  9 13 14 17  5]
