<a href="https://colab.research.google.com/github/PrincetonUniversity/python_machine_learning/blob/main/notebook2_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Seeded legacy NumPy generator, so anything shuffled with it is reproducible.
rng = np.random.RandomState(seed=1)

# Four-entry qualitative palette (hex codes) for hand-colored plot elements.
# NOTE: the name is misspelled ("qualitiative"); it is kept as-is because
# later cells reference it by this exact name.
qualitiative_colors = [
    '#1b9e77',
    '#d95f02',
    '#7570b3',
    '#e7298a',
]

# Classification

## Make Classification Data Set

In [None]:
from sklearn.datasets import make_classification

In [None]:
# Synthetic two-class problem with two informative features.
# A harder, more overlapping variant: random_state=100, class_sep=0.75.
X, y = make_classification(
    n_samples=200,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    class_sep=1.5,
    random_state=1,
)

n_sample = X.shape[0]

# Shuffle samples and labels with one shared permutation; labels are cast to
# float (presumably so the plotting cells can feed them to a colormap as
# values in [0, 1] -- TODO confirm).
order = rng.permutation(n_sample)
X, y = X[order], y[order].astype(float)

# The first 80% of the shuffled rows train the model; the rest are held out.
n_train = int(0.8 * n_sample)
X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]

In [None]:
# Scatter the training samples in feature space, colored by class label.
fig, axs = plt.subplots(
    nrows=1, ncols=1, sharex=True,
    figsize=(4., 4.), dpi=200, facecolor='white',
)
axs.scatter(
    *X_train.T,  # column 0 -> horizontal axis, column 1 -> vertical axis
    c=y_train, cmap=plt.cm.PRGn,
    edgecolor="k", s=50, zorder=10,
)
axs.set_ylabel(r'$x_2$')
axs.set_xlabel(r'$x_1$')

## Support Vector Classification

In [None]:
from sklearn.svm import SVC

### Create Support Vector Classification object

In [None]:
# Support vector classifier with a linear kernel; every other
# hyperparameter is left at its scikit-learn default.
clf = SVC(kernel="linear")

### Train the model using the training set

In [None]:
# Fit on the training split only; X_test / y_test are not seen by the model here.
clf.fit(X_train, y_train)

### Output the best fit values

In [None]:
# Report the fitted parameters of the classifier: intercept first, then the
# coefficient array.
for label, fitted_value in (
    ("Intercept: \n", clf.intercept_),
    ("Coefficients: \n", clf.coef_),
):
    print(label, fitted_value)

### Create a grid over the feature space

In [None]:
# Bounding box of all samples (both feature columns), padded on every side
# so no point sits exactly on the edge of the plotted surface.
pad = 0.1
x_min, y_min = X.min(axis=0) - pad
x_max, y_max = X.max(axis=0) + pad

# 200 x 200 grid spanning the padded box; a complex step (200j) tells
# np.mgrid to produce that many points with both endpoints included.
XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]

### Make predictions at each point in the feature space

In [None]:
# Evaluate the classifier's decision function at every grid node, then fold
# the flat result back into the grid's 2-D shape.
grid_points = np.column_stack([XX.ravel(), YY.ravel()])
Z = clf.decision_function(grid_points).reshape(XX.shape)

# Largest absolute decision value; used as a symmetric color-scale limit.
Z_lim = max(-Z.min(), Z.max())

### Make predictions using the test set

In [None]:
# Predicted label for every sample in the held-out test split.
predicted_class = clf.predict(X_test)

In [None]:
# Test points drawn over the decision surface: marker fill encodes the true
# label, marker edge encodes the predicted label, so a mismatch between the
# two marks a misclassified sample.
fig, axs = plt.subplots(
    nrows=1, ncols=1, sharex=True,
    figsize=(4., 4.), dpi=200, facecolor='white',
)
edgecolors = plt.cm.PRGn(predicted_class)
axs.scatter(
    *X_test.T,
    c=y_test, cmap=plt.cm.PRGn,
    s=80, alpha=0.5, edgecolor=edgecolors, zorder=10,
)
# Background: decision-function value on a color scale symmetric about zero.
axs.pcolormesh(
    XX, YY, Z,
    cmap=plt.cm.PRGn, vmin=-Z_lim, vmax=Z_lim,
    shading='auto', edgecolors=None, alpha=0.5, snap=True,
)
# Solid line: decision function = 0 (the boundary); dashed lines: +/-0.5.
axs.contour(
    XX, YY, Z,
    levels=[-0.5, 0, 0.5],
    colors=["k"] * 3,
    linestyles=["--", "-", "--"],
)
axs.set_ylabel(r'$x_2$')
axs.set_xlabel(r'$x_1$')

### Create Support Vector Classification object **with poly kernel**

In [None]:
# Rebind clf to a support vector classifier with a polynomial kernel; the
# degree and all other hyperparameters are left at their scikit-learn defaults.
clf = SVC(kernel="poly")

### Train the model using the training set

In [None]:
# Fit on the training split only; X_test / y_test are not seen by the model here.
clf.fit(X_train, y_train)

### Make predictions at each point in the feature space

In [None]:
# Re-evaluate the decision function on the grid for the current classifier,
# overwriting the surface computed for the previous model.
grid_points = np.column_stack([XX.ravel(), YY.ravel()])
Z = clf.decision_function(grid_points).reshape(XX.shape)

# Largest absolute decision value; used as a symmetric color-scale limit.
Z_lim = max(-Z.min(), Z.max())

In [None]:
# Training points drawn over the decision surface of the fitted classifier.
fig, axs = plt.subplots(
    nrows=1, ncols=1, sharex=True,
    figsize=(4., 4.), dpi=200, facecolor='white',
)
axs.scatter(
    *X_train.T,
    c=y_train, cmap=plt.cm.PRGn,
    s=40, alpha=0.5, edgecolor='k', zorder=10,
)
# Background: decision-function value on a color scale symmetric about zero.
axs.pcolormesh(
    XX, YY, Z,
    cmap=plt.cm.PRGn, vmin=-Z_lim, vmax=Z_lim,
    shading='auto', edgecolors=None, alpha=0.5, snap=True,
)
# Solid line: decision function = 0 (the boundary); dashed lines: +/-0.5.
axs.contour(
    XX, YY, Z,
    levels=[-0.5, 0, 0.5],
    colors=["k"] * 3,
    linestyles=["--", "-", "--"],
)
axs.set_ylabel(r'$x_2$')
axs.set_xlabel(r'$x_1$')

### Make predictions using the test set

In [None]:
# Predicted label for every sample in the held-out test split.
predicted_class = clf.predict(X_test)

In [None]:
# Test points drawn over the decision surface: marker fill encodes the true
# label, marker edge encodes the predicted label, so a mismatch between the
# two marks a misclassified sample.
fig, axs = plt.subplots(
    nrows=1, ncols=1, sharex=True,
    figsize=(4., 4.), dpi=200, facecolor='white',
)
edgecolors = plt.cm.PRGn(predicted_class)
axs.scatter(
    *X_test.T,
    c=y_test, cmap=plt.cm.PRGn,
    s=60, alpha=0.5, edgecolor=edgecolors, zorder=10,
)
# Background: decision-function value on a color scale symmetric about zero.
axs.pcolormesh(
    XX, YY, Z,
    cmap=plt.cm.PRGn, vmin=-Z_lim, vmax=Z_lim,
    shading='auto', edgecolors=None, alpha=0.5, snap=True,
)
# Solid line: decision function = 0 (the boundary); dashed lines: +/-0.5.
axs.contour(
    XX, YY, Z,
    levels=[-0.5, 0, 0.5],
    colors=["k"] * 3,
    linestyles=["--", "-", "--"],
)
axs.set_ylabel(r'$x_2$')
axs.set_xlabel(r'$x_1$')

## k-Nearest Neighbors Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

### Create k-Nearest Neighbors Classifier object

In [None]:
# k: how many nearest training samples the classifier consults per query.
number_of_neighbors = 5
clf = KNeighborsClassifier(n_neighbors=number_of_neighbors)

### Train the model using the training set

In [None]:
# Fit on the training split; neighbor indices returned by this classifier
# later on refer to rows of X_train.
clf.fit(X_train, y_train)

In [None]:
# Training samples only, colored by class label with the Dark2 colormap.
fig, axs = plt.subplots(
    nrows=1, ncols=1, sharex=True,
    figsize=(4., 4.), dpi=200, facecolor='white',
)
axs.scatter(
    *X_train.T,
    c=y_train, cmap=plt.cm.Dark2,
    edgecolor="k", s=20, zorder=10,
)
axs.set_ylabel(r'$x_2$')
axs.set_xlabel(r'$x_1$')

### Make predictions using the test set

In [None]:
# Overlay every test point on the training data, colored by its predicted
# class, and connect it to each training sample among its nearest neighbors.
fig, axs = plt.subplots(figsize=(4.,4.), nrows=1, ncols=1, facecolor='white', dpi=200, sharex=True)
axs.scatter(X_train[:, 0], X_train[:, 1], c=y_train, zorder=10, cmap=plt.cm.Dark2, edgecolor="k", s=20)

# Query the whole test split at once instead of once per point inside the loop.
predicted_class = clf.predict(X_test)
# n_neighbors is deliberately not passed: kneighbors then falls back to the
# value the classifier was constructed with, so the lines drawn below stay
# correct when number_of_neighbors is changed (this used to be a literal 5).
# Each row holds indices into the data the classifier was fitted on (X_train).
neighbors = clf.kneighbors(X_test, return_distance=False)

# Test points: face color from the predicted class, no marker edge.
axs.scatter(X_test[:, 0], X_test[:, 1], s=40, color=plt.cm.Dark2(predicted_class), zorder=10, edgecolor='None')
for test_coords, neighbor_idx in zip(X_test, neighbors):
    for train_coords in X_train[neighbor_idx]:
        axs.plot([train_coords[0], test_coords[0]], [train_coords[1], test_coords[1]], '-', color=qualitiative_colors[1])

# Fixed view window; the limits are hand-picked (presumably tuned to this
# particular synthetic dataset -- revisit if the data-generation cell changes).
axs.set_xlim(-1,4)
axs.set_ylim(-2.1,2.15)
axs.set_ylabel(r'$x_2$')
axs.set_xlabel(r'$x_1$')

## Exercise

Take a look at the iris plants dataset (https://scikit-learn.org/stable/datasets/toy_dataset.html). Train a classifier using the k-nearest neighbors algorithm on the iris dataset.

In [None]:
from sklearn import datasets
import itertools

# Load iris with as_frame=True so the features and target come back as
# pandas objects.
dataset = datasets.load_iris(as_frame=True)
X, y = dataset['data'], dataset['target']

# Replace each target code with the species name found at that index.
y = y.map(lambda code: dataset['target_names'][code])

# One scatter panel per unordered pair of feature columns, one series per species.
fig, ax = plt.subplots(3, 2, figsize=(8, 10))
ax = ax.flatten()
fig.suptitle('Iris plants dataset')
feature_pairs = itertools.combinations(X.columns, 2)
for panel, (col1, col2) in zip(ax, feature_pairs):
    for species in dataset['target_names']:
        df = X.loc[y == species]
        panel.scatter(df[col1], df[col2], label=species)
    panel.set_xlabel(col1)
    panel.set_ylabel(col2)
    panel.legend()
plt.tight_layout()
plt.show()

In [None]:
# train a kNN classifier on the iris dataset