# Import libraries

In [None]:
import numpy as np
import random
import time
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_digits

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import KFold
from sklearn.base import clone

# Load datasets

In [None]:
# Choose the dataset to evaluate: keep exactly one loader active.
# With return_X_y=True the loader returns the (data, target) pair directly.
X, y = load_iris(return_X_y=True)
# X, y = load_breast_cancer(return_X_y=True)
# X, y = load_digits(return_X_y=True)

# Get labels for dataset

In [None]:
# Distinct class labels found in y, in the sorted order np.unique returns them.
# Passed as the `classes` argument to every partial_fit call.
target_labels = [label for label in np.unique(y)]

# Initialise Models

In [None]:
# Each section below picks one estimator type; swap by (un)commenting a line.
# All candidates share random_state=19 so runs are repeatable.

# Classifier 1: base model for fully online training (one sample per update)
model1 = SGDClassifier(random_state=19)
# model1 = Perceptron(random_state=19)
# model1 = PassiveAggressiveClassifier(random_state=19)


# Classifier 2: base model for the hybrid run (50% batch, then 50% online)
model2 = SGDClassifier(random_state=19)
# model2 = Perceptron(random_state=19)
# model2 = PassiveAggressiveClassifier(random_state=19)


# Classifier 3: model for full batch training
clf3 = SGDClassifier(random_state=19)
# clf3 = Perceptron(random_state=19)
# clf3 = PassiveAggressiveClassifier(random_state=19)

In [None]:
# Per-fold test accuracies, one independent list per training strategy
# (clf1 = online, clf2 = partial online, clf3 = batch).
clf1_accuracy_list, clf2_accuracy_list, clf3_accuracy_list = [], [], []

# Set number of splits (10 for 10-fold Cross Validation)

In [None]:
# Number of folds used for cross validation.
number_of_splits = 10

# shuffle=True gives random indexes for the test sets; the fixed random_state
# keeps the split identical from run to run.
kf = KFold(
    n_splits=number_of_splits,
    shuffle=True,
    random_state=42,
)

# 10-fold Cross Validation to get average accuracy of each of the 3 models

In [None]:
def _partial_fit_per_sample(estimator, X_samples, y_samples, classes):
    """Train `estimator` online, calling partial_fit once per sample, in order.

    Each row of `X_samples` is reshaped from (n,) to (1, n) and its label is
    wrapped in a one-element list, which is the input form partial_fit needs
    for a single observation.
    """
    for sample, label in zip(X_samples, y_samples):
        estimator.partial_fit(sample.reshape(1, -1), [label], classes=classes)


# Cross validation: for every fold, train three models on the same training
# split (fully online, half batch + half online, fully batch) and record
# each model's accuracy on the held-out test split.
fold_value = 1
for train_index, test_index in kf.split(X):

    # Re-seed every fold so the shuffle is reproducible.
    random.seed(2020)
    random.shuffle(train_index)  # shuffle train indexes so that training will be less biased

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    ### Full Online Learning
    clf1 = clone(model1)  # clone base model so the partial_fit model resets at every fold
    start = time.time()
    print("\nOnline Learning started")
    _partial_fit_per_sample(clf1, X_train, y_train, target_labels)
    print("Online Learning Ended")
    end = time.time()
    print("Time elapsed:", end-start, "seconds")
    ### End of Online Learning

    ### 50% Batch and 50% Online Learning
    half = len(X_train) // 2  # first half goes in as one batch, the rest one by one
    clf2 = clone(model2)
    start = time.time()
    print("\nPartial Online Learning started")
    clf2.partial_fit(X_train[:half], y_train[:half], classes=target_labels)
    # Feed each remaining sample with its own features. Previously the last
    # row of X_train was reused for every step, paired with varying labels.
    _partial_fit_per_sample(clf2, X_train[half:], y_train[half:], target_labels)
    print("Partial Online Learning Ended")
    end = time.time()
    print("Time elapsed:", end-start, "seconds")
    ### End of 50% Batch and 50% Online Learning

    ### Full Batch Learning
    # clf3 is reused across folds without clone(); per the scikit-learn docs,
    # fit() starts from scratch unless warm_start is enabled (not set here).
    start = time.time()
    print("\nBatch Learning Started")
    clf3.fit(X_train, y_train)
    print("Batch Learning Ended")
    end = time.time()
    print("Time elapsed:", end-start, "seconds")
    ### End of Batch Learning

    ### Scoring
    print("\nScoring on Online Learning model")
    clf1_accuracy_list.append(clf1.score(X_test, y_test))

    print("Scoring on Partial Online Learning model")
    clf2_accuracy_list.append(clf2.score(X_test, y_test))

    print("Scoring on Batch Learning model")
    clf3_accuracy_list.append(clf3.score(X_test, y_test))

    print("Fold", fold_value, "/", number_of_splits, "completed")
    fold_value = fold_value + 1

print("\nAverage accuracy of Online Learning:", np.mean(clf1_accuracy_list))
print("Average accuracy of Partial Online Learning:", np.mean(clf2_accuracy_list))
print("Average accuracy of Batch Learning:", np.mean(clf3_accuracy_list))

# Plot graph of accuracy across 10-folds

In [None]:
# Plot each strategy's per-fold accuracy on a shared axis.
# Fold numbers are 1-based and derived from number_of_splits, as is the title,
# so the figure stays correct if the number of folds is changed.
fold_numbers = range(1, number_of_splits + 1)
plt.plot(fold_numbers, clf1_accuracy_list, label="Full Online Learning Accuracy")
plt.plot(fold_numbers, clf2_accuracy_list, label="Partial Online Learning Accuracy")
plt.plot(fold_numbers, clf3_accuracy_list, label="Full Batch Learning Accuracy")
plt.title("Accuracy across {}-folds".format(number_of_splits))
plt.ylabel("Accuracy")
plt.xlabel("Fold Number")
plt.legend(loc="best", fontsize=7.8)