In [None]:
import numpy as np
from matplotlib import pyplot as plt

We load the digits classification dataset from scikit-learn.

In [None]:
from sklearn import datasets

In [None]:
# Load scikit-learn's digits classification dataset.
digits = datasets.load_digits()

In [None]:
# Feature matrix: one row per data point, one column per feature.
X = digits.data
# Label vector: one digit label per data point.
y = digits.target

In scikit-learn, feature matrices are laid out as "data points-by-features": as shown below, there are 1,797 data points and 64 features. The features are stored in a 2D array; the labels are stored in a 1D array.

In [None]:
# (number of data points, number of features)
X.shape

In [None]:
# 1D: one label per data point
y.shape

# Fitting a classification model

We first perform a training-test split, specifying the fraction of data points held out for testing and a random seed.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the data points as the test set; random_state seeds the random split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

We will use the perceptron classifier.

In [None]:
from sklearn.linear_model import Perceptron

In [None]:
clf = Perceptron(random_state=0) # a Perceptron classifier instance; not yet fitted on any dataset

In [None]:
# Inspect the class of the classifier object.
type(clf)

In [None]:
# Fit the classifier on the training split only; the test split stays unseen.
clf.fit(X_train, y_train)

In [None]:
# Test accuracy = fraction of held-out points whose predicted label equals the true label.
# np.mean over the boolean match array is vectorized; the built-in sum() would
# iterate over the NumPy array element by element in Python.
accuracy = np.mean(clf.predict(X_test) == y_test)
print("accuracy: {}".format(accuracy))

# Cross-validation

First, the standard k-fold cross-validation. We specify the number of folds and whether to shuffle the data points before splitting.

In [None]:
from sklearn.model_selection import KFold

In [None]:
# 5-fold splitter that does not shuffle the data points before splitting.
kf = KFold(n_splits=5, shuffle=False)
# Alternative: shuffle the data points before splitting, with a fixed seed.
# kf = KFold(n_splits=5, shuffle=True, random_state=0)

In [None]:
# Show which row indices fall into the training and test part of each fold,
# and how those index arrays select the matching rows of X and entries of y.
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

We can get the accuracy for each split.

In [None]:
# Fit the classifier on each fold's training part and report accuracy on its test part.
# After the loop, X_test / y_test hold the LAST fold's test data, which the
# "test labels in the last fold" histogram further down relies on.
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(X_train, y_train)
    # Vectorized fraction of correct predictions (instead of built-in sum() over a NumPy array).
    accuracy = np.mean(clf.predict(X_test) == y_test)
    print("accuracy: {}".format(accuracy))

Here we plot histograms of the labels, for both the entire digits dataset and the test set of the last fold.

In [None]:
# on entire dataset
# Use an explicit figure/axes pair so the plot does not depend on pyplot's
# implicit "current axes" left over from any earlier plotting call.
fig, ax = plt.subplots()
# Bin edges 0..10 with align='left' put one bar on each digit 0..9.
ax.hist(y, rwidth=0.5, bins=np.arange(11), align='left')
ax.set_xticks(np.arange(10))
ax.set_xlabel("digit")
ax.set_ylabel("count")
ax.set_title("all labels")
plt.show()  # render the figure without echoing the Text object returned by the last call

In [None]:
# the test dataset of the last fold
# (y_test still holds the labels assigned in the final iteration of the fold loop)
# Use an explicit figure/axes pair so the plot does not depend on pyplot's
# implicit "current axes" left over from any earlier plotting call.
fig, ax = plt.subplots()
# Bin edges 0..10 with align='left' put one bar on each digit 0..9.
ax.hist(y_test, rwidth=0.5, bins=np.arange(11), align='left')
ax.set_xticks(np.arange(10))
ax.set_xlabel("digit")
ax.set_ylabel("count")
ax.set_title("test labels in the last fold")
plt.show()  # render the figure without echoing the Text object returned by the last call

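If the dataset is unbalanced, the label histogram of a particular fold's test set may differ from the histogram over all data points. This can distort the measured prediction accuracy and thus affect your model selection. To illustrate, we build an unbalanced dataset below by keeping only one fifth of the zeros and appending them after all the other digits.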

In [None]:
# Select every data point whose label is not 0; the boolean mask is built once
# and applied to both the features and the labels.
nonzero_mask = digits.target != 0
X_no_zero = digits.data[nonzero_mask]
y_no_zero = digits.target[nonzero_mask]

In [None]:
# Down-sample the zeros: keep only the first (num_zero // sampling_ratio) of them.
sampling_ratio = 5
# Build the "label is 0" mask once and reuse it for counting and for selection.
is_zero = digits.target == 0
# np.count_nonzero counts the True entries in vectorized form (the built-in
# sum() would loop over the NumPy array element by element).
num_zero = np.count_nonzero(is_zero)
num_kept = num_zero // sampling_ratio
X_zero = digits.data[is_zero][:num_kept, :]
y_zero = digits.target[is_zero][:num_kept]

In [None]:
# Assemble the unbalanced dataset: all non-zero digits first, the kept zeros last.
# Note: this rebinds X and y, so every later cell works on the unbalanced data.
X = np.concatenate((X_no_zero, X_zero), axis=0)
y = np.concatenate((y_no_zero, y_zero))

In [None]:
# Label histogram of the dataset currently bound to y.
# Use an explicit figure/axes pair so the plot does not depend on pyplot's
# implicit "current axes" left over from any earlier plotting call.
fig, ax = plt.subplots()
# Bin edges 0..10 with align='left' put one bar on each digit 0..9.
ax.hist(y, rwidth=0.5, bins=np.arange(11), align='left')
ax.set_xticks(np.arange(10))
ax.set_xlabel("digit")
ax.set_ylabel("count")
ax.set_title("all labels")
plt.show()  # render the figure without echoing the Text object returned by the last call

In [None]:
# Per-fold accuracy using the k-fold splitter kf on the current X and y.
# After the loop, X_test / y_test hold the LAST fold's test data, which the
# following "test labels in the last fold" histogram relies on.
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(X_train, y_train)
    # Vectorized fraction of correct predictions (instead of built-in sum() over a NumPy array).
    accuracy = np.mean(clf.predict(X_test) == y_test)
    print("accuracy: {}".format(accuracy))

In [None]:
# Label histogram of the last fold's test set (y_test as left by the preceding fold loop).
# Use an explicit figure/axes pair so the plot does not depend on pyplot's
# implicit "current axes" left over from any earlier plotting call.
fig, ax = plt.subplots()
# Bin edges 0..10 with align='left' put one bar on each digit 0..9.
ax.hist(y_test, rwidth=0.5, bins=np.arange(11), align='left')
ax.set_xticks(np.arange(10))
ax.set_xlabel("digit")
ax.set_ylabel("count")
ax.set_title("test labels in the last fold")
plt.show()  # render the figure without echoing the Text object returned by the last call

Instead, we usually do stratified k-fold cross-validation on unbalanced datasets.

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
# 5-fold splitter for stratified cross-validation; its split() is given the labels as well as X.
skf = StratifiedKFold(n_splits=5)

In [None]:
# Per-fold accuracy using the stratified splitter, which needs the labels y to form the folds.
# After the loop, X_test / y_test hold the LAST fold's test data, which the
# following "test labels in the last fold" histogram relies on.
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(X_train, y_train)
    # Vectorized fraction of correct predictions (instead of built-in sum() over a NumPy array).
    accuracy = np.mean(clf.predict(X_test) == y_test)
    print("accuracy: {}".format(accuracy))

In [None]:
# Label histogram of the last stratified fold's test set (y_test as left by the preceding loop).
# Use an explicit figure/axes pair so the plot does not depend on pyplot's
# implicit "current axes" left over from any earlier plotting call.
fig, ax = plt.subplots()
# Bin edges 0..10 with align='left' put one bar on each digit 0..9.
ax.hist(y_test, rwidth=0.5, bins=np.arange(11), align='left')
ax.set_xticks(np.arange(10))
ax.set_xlabel("digit")
ax.set_ylabel("count")
ax.set_title("test labels in the last fold")
plt.show()  # render the figure without echoing the Text object returned by the last call

We can see that the test dataset in the last fold of the stratified cross-validation has roughly the same distribution as the entire (modified) dataset. 

# Preprocessing

In [None]:
from sklearn.preprocessing import scale, OneHotEncoder

Scaling: standardizes each feature (column) to zero mean and unit variance.

In [None]:
# Small 3x3 example: rows are data points, columns are features.
A_raw = np.array([
    [1., -1., 2.],
    [2., 0., 0.],
    [0., 1., -1.],
])
# Keep the raw matrix under its own name; A is the scaled version shown as the cell output.
A = scale(A_raw)
A

One-hot encoding: applied column by column (i.e., to each feature).

In [None]:
# A single categorical feature as a column vector: 5 data points, 1 feature.
B = np.array([1, 2, 0, 0, 1]).reshape(-1, 1)
B

In [None]:
# Fit the encoder on B and encode B in a single call.
# The result supports .todense(), i.e. it is a sparse matrix, so it is displayed in compact form here.
enc = OneHotEncoder()
enc.fit_transform(B)

In [None]:
# Convert the sparse encoding to a regular ndarray to display the indicator columns.
# .toarray() returns numpy.ndarray; .todense() would return the discouraged numpy.matrix type.
enc.fit_transform(B).toarray()

There are more preprocessors available, including feature selectors and dimensionality reducers. Check out the following webpages to find out!

- Feature selection: https://scikit-learn.org/stable/modules/feature_selection.html
- Dimensionality reduction: https://scikit-learn.org/stable/modules/unsupervised_reduction.html

For visualizations using scikit-learn, you may check out https://scikit-learn.org/stable/visualizations.html. Alternatively, you can always use the scikit-learn API to do model fitting, and use matplotlib for plotting!