<a href="https://colab.research.google.com/github/kanacb/machinelearning/blob/main/SVMs_Iris.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import software libraries and load the dataset #

In [None]:
import sys                             # Read system parameters.
import numpy as np                     # Work with multi-dimensional arrays and matrices.
import pandas as pd                    # Manipulate and analyze data.
import matplotlib as mpl               # Create 2D charts.
import matplotlib.pyplot as plt
import seaborn as sb                   # Perform data visualization.
import sklearn                         # Perform data mining and analysis.
from sklearn import datasets

# Summarize software libraries used. Every third-party library imported above
# is listed (seaborn included, since it draws the box plot) so the environment
# that produced the results can be reproduced.
print('Libraries used in this project:')
print('- Python {}'.format(sys.version))
for lib_name, lib_module in (('NumPy', np),
                             ('pandas', pd),
                             ('Matplotlib', mpl),
                             ('seaborn', sb),
                             ('scikit-learn', sklearn)):
    print('- {} {}'.format(lib_name, lib_module.__version__))
print()  # Blank line separating the summary from the load message below.

# Load the dataset.
iris = datasets.load_iris()
print('Loaded {} records.'.format(len(iris.data)))

# Get acquainted with the dataset #

In [None]:
# Build a DataFrame from the feature matrix (one column per feature name)
# and append the class labels as a 'target' column.
feature_frame = pd.DataFrame(data = iris['data'], columns = iris['feature_names'])
data_raw = feature_frame.assign(target = iris['target'])

# info() writes its report (dtypes, non-null counts) itself and returns None,
# which print() then echoes.
print(data_raw.info())

# Show the first 10 records.
data_raw.head(n = 10)

# Examine a general summary of statistics

In [None]:
# Render floats with two decimals only while the summary table is printed;
# the display option is restored when the context manager exits.
two_decimals = '{:.2f}'.format
with pd.option_context('float_format', two_decimals):
    summary_stats = data_raw.describe()
    print(summary_stats)

# Identify outliers

In [None]:
# Wide, short canvas for a single horizontal box plot of sepal width.
plt.figure(figsize = (20, 2))

# fliersize sets the marker size of the points drawn beyond the whiskers.
boxplot_options = dict(x = 'sepal width (cm)', data = data_raw, orient = 'h', fliersize = 7)
bplot = sb.boxplot(**boxplot_options)

# Reduce the dimensionality of the dataset

In [None]:
# Keep only the first two feature columns (sepal length and sepal width)
# so the data can be drawn on a 2D plot.
X = iris['data'][:, :2]
y = iris['target']

print("\nBefore reduction:")
print("X dataset dimensions are", X.shape)
print("y dataset dimensions are", y.shape)

# Boolean mask selecting labels 0 and 1 (setosa and versicolor); applying it
# to both arrays keeps samples and labels aligned.
class_labels = np.logical_or(y == 0, y == 1)
X, y = X[class_labels], y[class_labels]

print("\nAfter reduction:")
print("X dataset dimensions are", X.shape)
print("y dataset dimensions are", y.shape)

# Examine the separation between classes using a scatter plot

In [None]:
# Sepal length along x-axis, sepal width along y-axis.
scatter_x = X[:, 0]
scatter_y = X[:, 1]

cdict = {0: 'green', 1: 'grey'}

# Legend text per class label. Kept in a lookup table rather than assigned to
# a variable named `iris`: that name holds the loaded dataset, and overwriting
# it with a string breaks any earlier cell that is re-run afterwards.
species_names = {0: 'setosa', 1: 'versicolor'}

# Generate scatter plot with legend: one scatter call per class so each class
# gets its own colour and legend entry.
for c_label in np.unique(y):
    ix = np.where(y == c_label)
    plt.scatter(scatter_x[ix], scatter_y[ix], c = cdict[c_label],
                label = species_names[c_label], s = 40)

plt.legend()
plt.xlabel("Sepal length", fontsize = 13)
plt.ylabel("Sepal width", fontsize = 13)
# The trailing semicolon suppresses the Annotation repr in notebook output.
plt.annotate('Possible outlier', xy = (4.4, 2.3), xytext = (2.9, 2.2),
             arrowprops = dict(color= 'black'), fontsize = 15);

# Plot a decision boundary for a given model

In [None]:
def plot_decision_boundary(X, y, model, is_svm):
    """Scatter-plot two-feature samples and overlay a model's decision boundary.

    Parameters
    ----------
    X : array
        Samples; column 0 is drawn on the x-axis (labelled "Sepal length")
        and column 1 on the y-axis (labelled "Sepal width").
    y : array
        Class label per sample. Only labels 0 (setosa) and 1 (versicolor)
        are supported; any other label raises KeyError.
    model : fitted estimator
        Must provide ``decision_function``; when ``is_svm`` is truthy it must
        also provide ``support_vectors_``.
    is_svm : bool
        If truthy, draw the margins (decision levels -1 and 1) and circle the
        support vectors in addition to the boundary (level 0).

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    scatter_x = X[:, 0]
    scatter_y = X[:, 1]

    # Colour and legend text for each supported class label.
    cdict = {0: 'green', 1: 'grey'}
    species_names = {0: 'setosa', 1: 'versicolor'}

    # One scatter call per class so each gets its own legend entry.
    for c_label in np.unique(y):
        ix = np.where(y == c_label)
        plt.scatter(scatter_x[ix], scatter_y[ix], c = cdict[c_label],
                    label = species_names[c_label], s = 40)

    plt.legend()
    plt.xlabel("Sepal length", fontsize = 13)
    plt.ylabel("Sepal width", fontsize = 13)

    # Read the axis limits chosen for the scatter so the grid spans exactly
    # the visible plot area.
    ax = plt.gca()
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    # Create a 40 x 40 grid of points covering the plot area.
    xx = np.linspace(xlim[0], xlim[1], 40)
    yy = np.linspace(ylim[0], ylim[1], 40)
    YY, XX = np.meshgrid(yy, xx)
    xy = np.vstack([XX.ravel(), YY.ravel()]).T

    # Evaluate the model's decision function at every grid point; the zero
    # level of this surface is the decision boundary.
    Z = model.decision_function(xy).reshape(XX.shape)

    if is_svm:
        # Plot decision boundary (solid) and margins (dashed).
        ax.contour(XX, YY, Z, colors = 'r', levels = [-1, 0, 1],
                   linestyles = ['--', '-', '--'])

        # Circle the support vectors.
        ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[:, 1],
                   s = 100, linewidth = 1, facecolors = 'none', edgecolors = 'k')
    else:
        # Boundary only.
        ax.contour(XX, YY, Z, colors = 'r', levels = [0],
                   linestyles = ['-'])

    plt.show()

print('Function to plot the decision boundary has been defined.')

# Train a basic logistic regression model and plot its decision boundary

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression model on the two-feature, two-class data.
# fit() returns the estimator itself, so construction and training chain.
log_reg = LogisticRegression(random_state = 1936, solver = 'liblinear').fit(X, y)

# Not an SVM: draw the boundary only, without margins or support vectors.
plot_decision_boundary(X, y, model = log_reg, is_svm = False)

# Train an SVM model and plot its decision boundary plus margins

In [None]:
from sklearn.svm import SVC

# Linear-kernel SVM with C = 100, trained on the two-feature, two-class data.
# fit() returns the estimator itself, so construction and training chain.
svm = SVC(C = 100, kernel = 'linear', random_state = 1936).fit(X, y)

# Draw the boundary together with the margins and support vectors.
plot_decision_boundary(X, y, model = svm, is_svm = True)

# Reduce the regularization penalty to soften the margin

In [None]:
# Same linear-kernel SVM, but with C lowered to 0.1 to soften the margin.
# fit() returns the estimator itself, so construction and training chain.
svm = SVC(C = 0.1, kernel = 'linear', random_state = 1936).fit(X, y)

# Draw the boundary together with the margins and support vectors.
plot_decision_boundary(X, y, model = svm, is_svm = True)

# Split the datasets

In [None]:
from sklearn.model_selection import train_test_split

# Column holding the class label, and the four measurement columns used as
# model inputs.
label_columns = ['target']
training_columns = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
]

# Split the features and labels into training and test sets; random_state is
# fixed so the split is repeatable.
feature_frame = data_raw[training_columns]
label_frame = data_raw[label_columns]
X_train, X_test, y_train, y_test = train_test_split(feature_frame,
                                                    label_frame,
                                                    random_state = 1936)

print('The training and test datasets and their labels have been split.')

# Evaluate an SVM model using a holdout test set

In [None]:
# y_train is a single-column DataFrame; flatten it to the 1-D label array
# passed to fit().
train_labels = np.ravel(y_train)

svm = SVC(C = 100, kernel = 'linear', random_state = 1936)
svm.fit(X_train, train_labels)

# Score using the test data.
score = svm.score(X_test, y_test)

accuracy_report = 'Accuracy: {:.0f}%'.format(score * 100)
print(accuracy_report)

# Optimize the SVM model with grid search and cross-validation

In [None]:
from sklearn.model_selection import GridSearchCV

svm = SVC(gamma = 'auto', random_state = 1936)

# Candidate hyperparameters: every kernel is tried with every value of C.
grid = [{'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
         'C': [0.01, 0.1, 1, 5, 10, 25, 50, 100]}]

# 5-fold cross-validated search, selecting by accuracy.
# The former `iid = False` argument is intentionally not passed: `iid` was
# deprecated in scikit-learn 0.22 and removed in 0.24, so supplying it raises
# TypeError on current releases. Since 0.22 the default behaviour already
# matches what `iid = False` requested.
search = GridSearchCV(svm, param_grid = grid, scoring = 'accuracy', cv = 5)
search.fit(X_train, np.ravel(y_train));

print(search.best_params_)

In [None]:
# Score the fitted grid search on the held-out test data.
score = search.score(X_test, y_test)

accuracy_report = 'Accuracy: {:.0f}%'.format(score * 100)
print(accuracy_report)

# Examine the optimized SVM model's predictions

In [None]:
# Numeric class label -> Iris species name.
iris_encode = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}

# Start from a copy of the test features and add the predicted and actual
# labels side by side.
results_comparison = X_test.copy()
results_comparison['Predicted Iris'] = search.predict(X_test)
results_comparison['Actual Iris'] = y_test.copy()

# Replace the numeric labels in both columns with species names.
for label_column in ('Predicted Iris', 'Actual Iris'):
    results_comparison[label_column] = results_comparison[label_column].map(iris_encode)

# View examples of the predictions compared to actual Iris.
results_comparison.head(20)