In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn import metrics

## Load the data

In [None]:
# Load the bundled Iris data set; later cells read its `data` and `feature_names`
# attributes to build the working DataFrame.
iris = datasets.load_iris()

In [None]:
# print(iris.DESCR)

In [None]:
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)

# add the target column into iris_df['Class']
iris_df['Class'] = iris.target

# add target names column into iris_df['ClassName']
# note: use df.apply method and function, which return target name for a given target value
# bonus: try to use Python lambda function into apply
iris_df['ClassName'] = iris_df['Class'].apply(lambda target: iris.target_names[target])

iris_df.head()

## Get insight of the data (Prepare and clean)

In [None]:
# Count how many rows belong to each class label:
values, counts = np.unique(iris_df['Class'].to_numpy(), return_counts=True)
print('values: {}, counts: {}'.format(values, counts))

# A more visual alternative is to group by the class name:
# iris_df.groupby('ClassName').count()

### Correlation matrix

In [None]:
# Correlate the numeric columns only: a string column such as 'ClassName' cannot be
# correlated, and pandas >= 2.0 raises instead of silently dropping it.
iris_df.select_dtypes(include='number').corr()

## Select features

In [None]:
# we will use the first two columns for features (they are not the best choice, we just want to test the algorithm)
f1 = 0
f2 = 1

# put in X the first two columns of iris_df
# (kept as a DataFrame: later cells rely on X.values and positional .iloc access)
X = iris_df.iloc[:, [f1, f2]]

print(f'X shape: {X.shape}')

# put in y the 'Class' column of iris_df
y = iris_df['Class']

print(f'y shape: {y.shape}')

## Visualize the data

In [None]:
def plot_data_set(X,y):
    """Scatter the first two feature columns of ``X``, one series per class in ``y``.

    Parameters
    ----------
    X : DataFrame or array-like of shape (n_samples, >=2)
        Feature matrix; column 0 is drawn on the x axis, column 1 on the y axis.
    y : Series or array-like of shape (n_samples,)
        Class label of every row of ``X``.

    The points are drawn on the current matplotlib figure; nothing is returned.
    """
    # Work on plain arrays so DataFrames/Series and ndarrays are handled alike.
    features = np.asarray(X)
    targets = np.asarray(y)

    classes = np.unique(targets)
    for c in classes:
        members = features[targets == c]
        # Label with the class value itself: indexing `classes` by the value is
        # only correct when the labels happen to be 0..n-1.
        plt.scatter(members[:, 0], members[:, 1],
                    edgecolors='gray',
                    label=str(c),
                    alpha=0.7)

    # Axis labels and legend are figure-level settings: apply them once.
    plt.xlabel('X[0]')
    plt.ylabel('X[1]')
    if classes.size:
        plt.legend(loc='best')


In [None]:
# Open a new 100-dpi figure and scatter the two selected features,
# one scatter series per class.
plt.figure(dpi=100)
plot_data_set(X,y)

Yes, our features are not good enough, but let's just keep playing...

In [None]:
# let's reserve 20% of the data for test:
# (a fixed random_state keeps the split identical across kernel restarts)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_test shape: {y_test.shape}')

## Choose the model

We want to test the svm.SVC model, with kernel="linear", and C=0.05:
https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

In [None]:
# instantiate the classifier in clf 
# (linear kernel with penalty C=0.05, as stated in the text above)
clf = svm.SVC(kernel='linear', C=0.05)

## Train

In [None]:
# Fit the classifier on the training split only.
clf.fit(X_train, y_train)

### inspect the model

In [None]:
# get support vectors
# (a 2-D array: the next cell reads column 0 and column 1 of it)
sv = clf.support_vectors_
print(f'SV shape: {sv.shape}')

In [None]:
# plot the data:
plot_data_set(X,y)

# plot the SV:
# drawn in red on top of the data points (first feature vs. second feature)
plt.scatter(sv[:,0], sv[:,1],c="red")

In [None]:
def plot_margin(clf, X):
    """Plot decision regions, the separating line and its margins for a linear SVM.

    Parameters
    ----------
    clf : fitted linear classifier
        Must expose ``coef_``, ``intercept_``, ``support_vectors_`` and
        ``predict``. Only the first hyperplane (``coef_[0]``, ``intercept_[0]``)
        is drawn as a line; the coloured background uses ``clf.predict``.
    X : array-like of shape (n_samples, 2)
        The two features to scatter. Their range also defines the plotted area.

    The plot is drawn on matplotlib figure 1 (cleared first); nothing is returned.
    """
    X = np.asarray(X)

    # Separating hyperplane: w[0]*x + w[1]*y + b = 0
    w = clf.coef_[0]
    b = clf.intercept_[0]

    # Geometric margin of THIS hyperplane. Using every row of coef_ would mix in
    # the other one-vs-one hyperplanes of a multi-class model.
    margin = 1 / np.linalg.norm(w)

    # Derive the visible window from the data instead of a fixed [-5, 5] box,
    # so the points are always inside the plot.
    pad = 1.0
    x_min, x_max = X[:, 0].min() - pad, X[:, 0].max() + pad
    y_min, y_max = X[:, 1].min() - pad, X[:, 1].max() + pad

    plt.figure(1, figsize=(8, 5))
    plt.clf()

    margin_label = 'margin = {:.2f}'.format(margin)
    if np.isclose(w[1], 0.0):
        # Vertical boundary: the slope form y = a*x + c does not exist.
        x0 = -b / w[0]
        plt.axvline(x0, color='k', linestyle='-', label='boundary')
        plt.axvline(x0 - margin, color='k', linestyle='--', label=margin_label)
        plt.axvline(x0 + margin, color='k', linestyle='--')
    else:
        a = -w[0] / w[1]
        xx = np.linspace(x_min, x_max)
        yy = a * xx - b / w[1]
        # The parallels through the support vectors lie `margin` away
        # perpendicular to the line, i.e. sqrt(1 + a^2) * margin vertically.
        offset = np.sqrt(1 + a ** 2) * margin
        plt.plot(xx, yy, 'k-', label='boundary')
        plt.plot(xx, yy - offset, 'k--', label=margin_label)
        plt.plot(xx, yy + offset, 'k--')

    # Circle the support vectors, then draw all points on top of the background.
    plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=80,
                facecolors='none', zorder=10, edgecolors='k')
    # No `cmap` here: without `c` matplotlib ignores it and emits a warning.
    plt.scatter(X[:, 0], X[:, 1], zorder=10, edgecolors='k')

    plt.axis('tight')

    # Colour the background with the class predicted on a 200x200 grid.
    XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
    Z = clf.predict(np.c_[XX.ravel(), YY.ravel()])
    Z = Z.reshape(XX.shape)
    plt.pcolormesh(XX, YY, Z, cmap=plt.cm.Pastel1)

    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)

    ax = plt.gca()
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title('Figure 1')

    # Only the labelled artists (boundary + first margin line) enter the legend.
    plt.legend(bbox_to_anchor=(1,1),
               loc="upper left")
# NOTE(review): this call sits at cell level, outside plot_margin, so it runs when the
# defining cell is executed rather than when the function is called.
plt.show()

In [None]:
# Draw the regions, boundary and margins of the trained classifier over the full data set.
# `.values` passes the features as a plain NumPy array.
plot_margin(clf,X.values)
plt.show()

In [None]:
# plot margin on different penalty values
for name, penalty in (('unreg', 1), ('reg', 0.01)):
    # fit a separate model for each penalty value; a dedicated variable name keeps
    # the classifier trained on the training split (`clf`) from being overwritten
    # by a model that has seen the whole data set.
    penalty_clf = svm.SVC(kernel='linear', C=penalty)
    # fit on plain arrays, the same representation plot_margin predicts on
    penalty_clf.fit(X.values, y.values)

    plot_margin(penalty_clf, X.values)
    # tell the two figures apart
    plt.title(f'{name} (C={penalty})')

    plt.show()

In [None]:
# mlxtend is a separate third-party package (not imported in the notebook's first cell).
from mlxtend.plotting import plot_decision_regions

# Plot the decision regions of `clf` over the two selected features.
# NOTE(review): `clf` is whatever model the name was last bound to — confirm which one is intended.
plot_decision_regions(X.values, 
                      y.values,
                      clf=clf, 
                      legend=2)

## Predict

In [None]:
# get the predicted values from X_test into y_pred 
y_pred = clf.predict(X_test)

## Evaluating the Model

In [None]:
from sklearn import metrics

In [None]:
# Model Accuracy: 
# use: metrics.accuracy_score

In [None]:
# Model Precision:
# use: precision_score

In [None]:
# Model Recall: 
# use recall_score