# Machine Learning Lab 6

Classification and group coding exercise.

## Imports

In [4]:
# Data and Datasets
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.datasets import load_wine

# Validation methods
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

# Metrics
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

# Classifiers
from sklearn.neighbors import KNeighborsClassifier
from classifiers import k_nearest_neighbour # the K-NN python file in the 'classifiers' folder
from classifiers import random_classifier # the Random Classifier python file in the 'classifiers' folder

# Plotting & utils
import pprint
import matplotlib.pyplot as plt
import numpy as np
from time import time

ModuleNotFoundError: No module named 'classifiers'

In [5]:
# Load the Iris dataset, then unpack the feature matrix and the label vector
dataset = load_iris()
X, y = dataset.data, dataset.target

## Random Classifier

In [6]:
# Hold-out validation: 80% training / 20% testing, fixed seed,
# class proportions preserved in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=dataset.target,
)

# Create the random baseline and fit it on the training part
model = random_classifier.RandomClassifier()
model = model.fit(X_train, y_train)

# Predict the labels of the held-out part
y_pred = model.predict(X_test)

# Report the confusion matrix, then the accuracy on the test part
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(metrics.confusion_matrix(y_test, y_pred))
print("\nAccuracy (Testing):  %0.2f " % test_accuracy)

NameError: name 'random_classifier' is not defined

## Bespoke K-NN

In [9]:
# Number of neighbours. It is assigned here because no earlier cell defines
# `k` when the notebook is executed top to bottom (Restart & Run All).
k = 10

# Splitting dataset for hold-out validation.
# No test_size / random_state is given, so scikit-learn's defaults apply
# (25% of the samples held out, a different random split on every run).
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Instantiating model
# NOTE(review): the section heading says "Bespoke K-NN", but this cell uses
# scikit-learn's KNeighborsClassifier — confirm which one was intended.
model = KNeighborsClassifier(n_neighbors=k)

# Training model
model = model.fit(X_train, y_train)

# Testing model
y_pred = model.predict(X_test)

# Printing out confusion matrix and accuracy
print(metrics.confusion_matrix(y_test, y_pred))
print("\nAccuracy (Testing):  %0.2f " % (metrics.accuracy_score(y_test, y_pred)))

[[10  0  0]
 [ 0 15  0]
 [ 0  0 13]]

Accuracy (Testing):  1.00 


## Evaluation

Lab tasks that compare the classifiers using hold-out validation and cross-validation.

The evaluation below uses the `wine` dataset only.

In [None]:
# Load the Wine dataset, then unpack the feature matrix and the label vector
dataset = load_wine()
X, y = dataset.data, dataset.target

# Build a DataFrame with one column per feature plus a trailing 'target' column
wine_columns = dataset['feature_names'] + ['target']
wine_df = pd.DataFrame(data=np.c_[X, y], columns=wine_columns)

# The labels were stacked next to float features, so turn them back into integers
wine_df['target'] = pd.to_numeric(wine_df['target'], downcast='integer')

# Show the first 5 rows of the DataFrame
wine_df.head()

In [None]:
# Summary of the DataFrame: column names, non-null counts and dtypes
wine_df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per numeric column
wine_df.describe()

In [None]:
# Bar chart of the number of samples per class.
# - An explicit Axes is created and handed to pandas so it is unambiguous
#   which axes receives the bars.
# - sort_index() orders the bars by class label instead of by frequency.
# - Titles and labels are applied AFTER plotting, because pandas may set its
#   own x-label (taken from the index name) while drawing the bars.
fig, ax = plt.subplots()
wine_df['target'].value_counts().sort_index().plot(kind="bar", ax=ax)
fig.suptitle('Class Distribution')
ax.set_xlabel('Class')
ax.set_ylabel('Frequency')
plt.show()

### Utility functions

In [None]:
# Shared results table: one row per (model, validation method) evaluation
result_df = pd.DataFrame(columns=['Model','Validation Method','Accuracy','STDEV', 'Training time (s)', 'Testing time (s)'])

def add_result(model_name, validation_method, accuracy_list, fit_time_list, predict_time_list):
    """
    Append one summary row to the global ``result_df`` table and return the updated table.

    Parameters
    ----------
    model_name : str
        Label stored in the 'Model' column.
    validation_method : str
        Label stored in the 'Validation Method' column.
    accuracy_list : array-like of float
        Per-run accuracies; their mean and (population) standard deviation are stored.
    fit_time_list : array-like of float
        Per-run training times in seconds; their mean is stored.
    predict_time_list : array-like of float
        Per-run testing times in seconds; their mean is stored.

    Returns
    -------
    pandas.DataFrame
        The updated global ``result_df``.
    """
    global result_df # rebinding the module-level table defined above the function

    # Accept plain Python sequences as well as NumPy arrays
    accuracies = np.asarray(accuracy_list, dtype=float)
    fit_times = np.asarray(fit_time_list, dtype=float)
    predict_times = np.asarray(predict_time_list, dtype=float)

    new_row = pd.DataFrame([{'Model': model_name,
                             'Validation Method': validation_method,
                             'Accuracy': accuracies.mean(),
                             'STDEV': accuracies.std(),
                             'Training time (s)': fit_times.mean(),
                             'Testing time (s)': predict_times.mean()}],
                           columns=result_df.columns)

    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    # The very first row replaces the empty table outright, which avoids
    # concatenating with an empty frame (deprecated dtype handling in pandas 2.x).
    if result_df.empty:
        result_df = new_row
    else:
        result_df = pd.concat([result_df, new_row], ignore_index=True)

    return result_df

In [None]:
def hold_out_validation(X, y, model, num_runs=1000, test_size=0.2, random_state=None):
    """
    Performs repeated hold-out validation of a given model on the dataset provided (X, y).

    Each run draws a fresh train/test split, fits ``model`` on the training part,
    predicts the test part, and records the accuracy and the wall-clock time of
    both steps. The same ``model`` object is re-fitted on every run.

    Parameters
    ----------
    X, y : feature matrix and label vector, passed straight to ``train_test_split``.
    model : object exposing ``fit(X, y)`` and ``predict(X)``.
    num_runs : int, default 1000
        Number of independent splits to evaluate.
    test_size : float, default 0.2
        Share of the samples held out for testing (0.2 means an 80/20 split).
    random_state : None, int or numpy.random.RandomState, default None
        ``None`` keeps the previous behaviour (non-reproducible splits).
        An int seeds one generator that is shared by all runs, so the runs
        still differ from each other but the whole sequence is reproducible.

    Returns
    -------
    tuple of three 1-D float numpy arrays of length ``num_runs``:
    accuracy, training time (s) and testing time (s).
    """

    # One generator for all runs: passing the same int to every split would
    # produce the identical split num_runs times.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Plain lists are grown in the loop and converted once at the end;
    # np.append copies the whole array on every call.
    accuracies = []
    fit_times = []
    predict_times = []

    for _ in range(num_runs):
        # Hold-out split for this run
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=rng)

        # Training the model (timed)
        start_time = time()
        model.fit(X_train, y_train)
        fit_times.append(time() - start_time)

        # Testing the model (timed)
        start_time = time()
        y_pred = model.predict(X_test)
        predict_times.append(time() - start_time)

        # Accuracy of this run
        accuracies.append(accuracy_score(y_test, y_pred))

    return (np.array(accuracies, dtype=float),
            np.array(fit_times, dtype=float),
            np.array(predict_times, dtype=float))

### Hold-out validation

In [None]:
# Random baseline classifier
model = random_classifier.RandomClassifier()

# Repeated hold-out validation: 100 runs, 20% of the samples held out per run
scores_random, fit_time_list, predict_time_list = hold_out_validation(
    X, y, model, num_runs=100, test_size=0.2
)

# Store the summary row in the shared results table
add_result('Random', 'Hold-Out', scores_random, fit_time_list, predict_time_list)

# Mean accuracy, with two standard deviations as the spread
print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores_random.mean(), scores_random.std() * 2))

In [None]:
# Bespoke K-NN classifier with 10 neighbours
k = 10
model = k_nearest_neighbour.KNNClassifier(k=k)

# Repeated hold-out validation: 100 runs, 20% of the samples held out per run
scores_knn_bespoke, fit_time_list, predict_time_list = hold_out_validation(
    X, y, model, num_runs=100, test_size=0.2
)

# Store the summary row in the shared results table
add_result('K-NN (bespoke)', 'Hold-Out', scores_knn_bespoke, fit_time_list, predict_time_list)

# Mean accuracy, with two standard deviations as the spread
print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores_knn_bespoke.mean(), scores_knn_bespoke.std() * 2))

In [3]:
# scikit-learn K-NN classifier with 10 neighbours
k = 10
model = KNeighborsClassifier(n_neighbors=k)

# Repeated hold-out validation: 100 runs, 20% of the samples held out per run
scores_knn_sklearn, fit_time_list, predict_time_list = hold_out_validation(
    X, y, model, num_runs=100, test_size=0.2
)

# Store the summary row in the shared results table
add_result('K-NN (sklearn)', 'Hold-Out', scores_knn_sklearn, fit_time_list, predict_time_list)

# Mean accuracy, with two standard deviations as the spread
print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores_knn_sklearn.mean(), scores_knn_sklearn.std() * 2))

NameError: name 'hold_out_validation' is not defined

In [None]:
# Overlaid histograms of the hold-out accuracies of the 3 classifiers
plt.figure()
for run_scores, run_label in (
    (scores_random, 'Random'),
    (scores_knn_bespoke, 'K-NN (bespoke)'),
    (scores_knn_sklearn, 'K-NN (sklearn)'),
):
    # Semi-transparent bars keep overlapping distributions visible
    plt.hist(run_scores, bins=10, alpha=0.5, label=run_label)
plt.xlabel('Accuracy')
plt.ylabel('Frequency')
plt.legend(loc='upper right')
plt.show()

### Cross-validation

In [None]:
# Instantiating model (the duplicated `model = model = ...` assignment was removed)
model = random_classifier.RandomClassifier()

# 5-fold cross validation.
# With an integer cv, scikit-learn uses StratifiedKFold only when it recognises
# the estimator as a classifier and y is binary/multiclass; otherwise it falls
# back to plain (unshuffled) KFold.
# NOTE(review): confirm the bespoke RandomClassifier is recognised as a classifier.
scores = cross_validate(model, X, y, cv=5, return_train_score=True, return_estimator=True)

# Store the summary row in the shared results table
add_result('Random', 'CV', scores['test_score'], scores['fit_time'], scores['score_time'])

# Printing results: mean accuracy with two standard deviations as the spread
print("Accuracy (Training): %0.2f (+/- %0.2f)" % (scores['train_score'].mean(), scores['train_score'].std() * 2))
print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores['test_score'].mean(), scores['test_score'].std() * 2))

In [None]:
# Bespoke K-NN classifier with 10 neighbours
k = 10
model = k_nearest_neighbour.KNNClassifier(k=k)

# 5-fold cross validation, also collecting training scores and fitted estimators.
# NOTE(review): an integer cv gives StratifiedKFold only if scikit-learn treats
# the estimator as a classifier — confirm this holds for the bespoke KNNClassifier.
scores = cross_validate(
    model,
    X,
    y,
    cv=5,
    return_train_score=True,
    return_estimator=True,
)

# Store the summary row in the shared results table
add_result('K-NN (bespoke)', 'CV', scores['test_score'], scores['fit_time'], scores['score_time'])

# Mean accuracy per split type, with two standard deviations as the spread
train_scores = scores['train_score']
test_scores = scores['test_score']
print("Accuracy (Training): %0.2f (+/- %0.2f)" % (train_scores.mean(), train_scores.std() * 2))
print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (test_scores.mean(), test_scores.std() * 2))

In [None]:
# scikit-learn K-NN classifier with 10 neighbours
k = 10
model = KNeighborsClassifier(n_neighbors=k)

# 5-fold cross validation, also collecting training scores and fitted estimators.
# For a scikit-learn classifier an integer cv means StratifiedKFold.
scores = cross_validate(
    model,
    X,
    y,
    cv=5,
    return_train_score=True,
    return_estimator=True,
)

# Store the summary row in the shared results table
add_result('K-NN (sklearn)', 'CV', scores['test_score'], scores['fit_time'], scores['score_time'])

# Mean accuracy per split type, with two standard deviations as the spread
train_scores = scores['train_score']
test_scores = scores['test_score']
print("Accuracy (Training): %0.2f (+/- %0.2f)" % (train_scores.mean(), train_scores.std() * 2))
print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (test_scores.mean(), test_scores.std() * 2))

In [None]:
# Display the results table filled in by the add_result calls above
result_df