In [None]:
#import libraries

import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt
import pandas
import math
from sklearn import svm
%matplotlib inline

In [2]:
#load two Gaussian dataset

# Names are chosen so that the built-ins `dict` and `str` are not rebound;
# shadowing them would break any later cell that calls dict(...) or str(...).
mat_contents = sio.loadmat('dataset1')
samples = mat_contents['samples']
labels = mat_contents['labels']
labels = np.reshape(labels, -1) #change 2d vector (n, 1) to 1d vector (n,)

sample_num, sample_dim = samples.shape # N x D
print("Sample number: %d, feature dimensionality: %d" % (sample_num, sample_dim))

# column headers: feature1 .. featureD followed by the label column
col_names = ['feature%d' % (i + 1) for i in range(sample_dim)]
col_names.append('labels')
pandas.DataFrame(np.concatenate((samples, labels.reshape(sample_num,1)), axis=1), columns=col_names)


Sample number: 2000, feature dimensionality: 2


Unnamed: 0,feature1,feature2,labels
0,0.541078,0.084685,1.0
1,1.835211,0.414565,1.0
2,0.463693,0.600649,1.0
3,0.215385,0.838101,1.0
4,0.402105,1.605083,1.0
5,0.595065,0.079514,1.0
6,0.604953,0.947740,1.0
7,1.126134,-0.553929,1.0
8,0.860798,1.610948,1.0
9,1.414678,1.456316,1.0


In [None]:
# Remap the class labels from {1, 2} to {1, -1}; the result is a float array.
labels = np.where(labels == 2, -1.0, labels.astype(float))

In [None]:
#plot the dataset, with class labels

plt.plot(samples[labels==1,0],samples[labels==1, 1],'ro')
plt.plot(samples[labels==-1,0],samples[labels==-1, 1],'g^')
# Legend labels are matched to the plot calls by position, so they must be an
# ordered list (a set has no defined order and can swap the two names).
plt.legend(['class 1', 'class -1'])
plt.title('A toy dataset with two Gaussian distributions')

In [None]:
# divide the dataset into training and testing sets

idx = list(range(sample_num)) # randomly shuffle indices
np.random.shuffle(idx)

half = sample_num // 2

training_idx = idx[:half] # 50% training
training_samples = samples[training_idx]
training_labels = labels[training_idx]

testing_idx = idx[half:] # 50% testing
testing_samples = samples[testing_idx]
testing_labels = labels[testing_idx]

plt.subplot(1, 2, 1)
#plot the data set, with class labels
plt.plot(training_samples[training_labels==1,0],training_samples[training_labels==1, 1],'ro')
plt.plot(training_samples[training_labels==-1,0],training_samples[training_labels==-1, 1],'g^')
# ordered list so each label lands on the matching plot call; the second class is labelled -1
plt.legend(['class 1', 'class -1'])
plt.title('training set')

plt.subplot(1, 2, 2)
#plot the data set, with class labels
plt.plot(testing_samples[testing_labels==1,0],testing_samples[testing_labels==1, 1],'mo')
plt.plot(testing_samples[testing_labels==-1,0],testing_samples[testing_labels==-1, 1],'c^')
plt.legend(['class 1', 'class -1'])
plt.title('testing set')

# Two-class Naive Bayes Classifier

In [None]:
class TwoClassNaiveBayesClassifier(object):
    """Gaussian naive Bayes classifier for two classes labelled 1 and -1.

    Every feature is modelled, per class, as an independent 1-D Gaussian.
    In the stored parameter arrays column 0 refers to class 1 and column 1
    to class -1.
    """

    def __init__(self,sample_dim):
        # number of leading feature columns used by train() and test()
        self.sample_dim = sample_dim

    def train(self, training_samples, training_labels):
        """Estimate class priors and per-feature mean / std for both classes.

        Parameters
        ----------
        training_samples : ndarray, shape (N, >= sample_dim)
        training_labels : ndarray, shape (N,), values in {1, -1}

        The estimates are stored on the instance as ``class_prior`` (2,),
        ``mean_list`` (sample_dim, 2) and ``std_list`` (sample_dim, 2).
        """
        sample_dim = self.sample_dim
        sample_num = training_samples.shape[0]

        is_pos = training_labels == 1
        is_neg = training_labels == -1

        # class prior = relative frequency of each label
        class_prior = np.zeros((2))
        class_prior[0] = np.count_nonzero(is_pos)/sample_num
        class_prior[1] = np.count_nonzero(is_neg)/sample_num

        # per-dimension mean and (population) std for each class
        mean_list = np.zeros((sample_dim, 2))
        std_list = np.zeros((sample_dim, 2))
        for d in range(sample_dim):
            mean_list[d, 0] = np.mean(training_samples[is_pos, d])
            mean_list[d, 1] = np.mean(training_samples[is_neg, d])
            std_list[d, 0] = np.std(training_samples[is_pos, d])
            std_list[d, 1] = np.std(training_samples[is_neg, d])

        # store in object for testing phase
        self.class_prior = class_prior
        self.mean_list = mean_list
        self.std_list = std_list

    def test(self, testing_samples, testing_labels = None):
        """Classify samples and optionally compute the error rate.

        Parameters
        ----------
        testing_samples : ndarray, shape (N, >= sample_dim)
        testing_labels : ndarray, shape (N,), optional

        Returns
        -------
        predicted_labels : float ndarray, shape (N,), values in {1, -1}
        error_rate : fraction of mismatching labels, or None when
            ``testing_labels`` is not given.
        """
        testing_samples = np.asarray(testing_samples, dtype=float)
        sample_num = testing_samples.shape[0]
        features = testing_samples[:, :self.sample_dim]
        eps = np.finfo(float).eps

        # Work with log-posteriors: summing log-densities avoids the underflow
        # that a product of many small densities can cause.
        log_posterior = np.zeros((sample_num, 2))
        with np.errstate(divide='ignore', invalid='ignore', over='ignore'):
            for c in range(2):
                std = self.std_list[:, c] + eps   # eps guards against zero variance
                z = (features - self.mean_list[:, c])/std
                log_likelihood = -0.5*z**2 - np.log(std) - 0.5*np.log(2*np.pi)
                log_posterior[:, c] = log_likelihood.sum(axis=1) + np.log(self.class_prior[c])

        # class 1 wins only on a strictly larger posterior; ties go to class -1
        predicted_labels = np.where(log_posterior[:, 0] > log_posterior[:, 1], 1.0, -1.0)

        #compute error rate
        if testing_labels is not None:
            error_rate = np.count_nonzero(testing_labels != predicted_labels)/sample_num
        else:
            error_rate = None

        return predicted_labels, error_rate

    def GaussianPDF(self, x, mean, std):
        """Density of N(mean, std**2) at scalar x.

        The normalising constant is 1/(std*sqrt(2*pi)); machine epsilon is
        added to ``std`` so that a zero standard deviation does not divide by zero.
        """
        std = std + np.finfo(float).eps
        prob = 1/(math.sqrt(2*math.pi)*std)*math.exp(-(x-mean)**2/(2*std**2))
        return prob
            

In [None]:
#perform training and testing

# Fit the naive Bayes classifier on the training split, then classify the testing split.
NBC = TwoClassNaiveBayesClassifier(sample_dim)
NBC.train(training_samples, training_labels)
# error_rate is the fraction of testing samples whose predicted label differs from the true one
predicted_labels, error_rate = NBC.test(testing_samples, testing_labels)

In [None]:
#plot ground truth
plt.plot(testing_samples[testing_labels==1,0], testing_samples[testing_labels==1, 1],'ro')
plt.plot(testing_samples[testing_labels==-1,0], testing_samples[testing_labels==-1, 1],'g^')
# ordered list (not a set) so each label is attached to the matching plot call
plt.legend(['class 1', 'class -1'])
plt.title('ground truth')

#plot prediction results
plt.figure()
plt.plot(testing_samples[predicted_labels==1,0],testing_samples[predicted_labels==1, 1],'mo')
plt.plot(testing_samples[predicted_labels==-1,0],testing_samples[predicted_labels==-1, 1],'c^')
plt.legend(['class 1', 'class -1'])
plt.title('prediction results of NBC (error rate: %.03f)' % (error_rate))

In [None]:
# compute the confusion matrix

# rows = true class, columns = predicted class, both in the order of class_labels
class_labels = [-1, 1]
confusion_matrix = np.zeros((2,2))
for i in range(2):
    for j in range(2):
        confusion_matrix[i,j] = np.sum(np.logical_and(testing_labels == class_labels[i], predicted_labels == class_labels[j]))

# Row/column names are ordered lists that follow class_labels (-1 first, then 1);
# a set has no defined order and could attach the names to the wrong rows/columns.
pandas.DataFrame(confusion_matrix, index=["true class-1", "true class1"], columns=['predicted class-1', 'predicted class1'])

In [None]:
# perform cross validation
fold_num = 10
fold_size = sample_num // fold_num

fold_error_rate = []
for k in range(fold_num):

    # [fold_beg, fold_end) is the slice of shuffled indices held out for testing in fold k
    fold_beg, fold_end = k*fold_size, (k+1)*fold_size
    held_out = idx[fold_beg:fold_end]

    testing_samples_cv, testing_labels_cv = samples[held_out], labels[held_out]

    # everything that is not held out is used for training
    training_samples_cv = np.delete(samples, held_out, 0)
    training_labels_cv = np.delete(labels, held_out, 0)

    NBC.train(training_samples_cv, training_labels_cv)
    predicted_labels, error_rate = NBC.test(testing_samples_cv, testing_labels_cv)

    fold_error_rate.append(error_rate)

print("error rate of each fold: ", fold_error_rate)
print('mean error of all fold:', np.mean(fold_error_rate))

# Linear Regressor


In [None]:
class LinearRegressor(object):
    """Least-squares linear classifier for labels in {-1, 1}.

    A constant 1 is appended to every sample, so the learnt vector is
    ``(w, b)`` with the bias as its last entry.
    """

    def __init__(self, sample_dim):
        self.sample_dim = sample_dim

    def train(self, training_samples, training_labels, regularizer = 0, show_w=True):
        """Fit ``(w, b)`` by (optionally ridge-regularised) least squares.

        Parameters
        ----------
        training_samples : ndarray, shape (N, D)
        training_labels : ndarray, shape (N,), must contain exactly {-1, 1}
        regularizer : multiple of the identity added to X'X (this also
            penalises the bias entry)
        show_w : print the fitted vector when True

        Returns
        -------
        w : ndarray, shape (D + 1,)

        Raises
        ------
        Exception
            If the labels are not exactly the two values -1 and 1.
        """
        sample_num = training_samples.shape[0]

        # check labels are exactly {-1, 1}
        u = np.unique(training_labels)
        if len(u) != 2 or u[0] != -1 or u[1] !=1:
            raise Exception('Label range must be -1 and 1')

        # append a 1 to the end of each sample to form extended sample [x 1]'
        X = np.concatenate((training_samples, np.ones((sample_num,1))), axis=1)

        XX = X.T.dot(X)
        Xy = X.T.dot(training_labels)

        # Solve (X'X + regularizer*I) w = X'y. lstsq gives the same answer as an
        # explicit inverse when the matrix is invertible and the minimum-norm
        # solution when it is singular (e.g. collinear features, too few samples).
        w = np.linalg.lstsq(XX + regularizer*np.eye(XX.shape[0]), Xy, rcond=None)[0]

        self.w = w

        if show_w:
            print('(w,b)=',w)

        return w

    def test(self, testing_samples, testing_labels = None):
        """Classify samples by the sign of the linear response.

        Returns
        -------
        predicted_labels : float ndarray, shape (N,); -1 where the response is
            negative, 1 otherwise
        error_rate : fraction of mismatching labels, or None when
            ``testing_labels`` is not given.
        """
        sample_num = testing_samples.shape[0]

        # append a 1 to the end of each sample to form extended sample [x 1]'
        X = np.concatenate((testing_samples, np.ones((sample_num,1))), axis=1)

        y = X.dot(self.w)

        predicted_labels = np.ones(sample_num)
        predicted_labels[y<0] = -1

        #compute error rate
        if testing_labels is not None:
            error_rate = np.count_nonzero(testing_labels != predicted_labels)/sample_num
        else:
            error_rate = None

        return predicted_labels, error_rate
    

In [None]:
#first without regularization
# (train() is called with its default regularizer of 0 and, since show_w defaults to True, prints the fitted (w,b))
LR = LinearRegressor(sample_dim)
w = LR.train(training_samples, training_labels)
predicted_labels, error_rate = LR.test(testing_samples, testing_labels)

In [None]:
#plot ground truth of testing samples
plt.plot(testing_samples[testing_labels==1,0], testing_samples[testing_labels==1, 1],'ro')
plt.plot(testing_samples[testing_labels==-1,0], testing_samples[testing_labels==-1, 1],'g^')
# ordered list (not a set) so each label is attached to the matching plot call
plt.legend(['class 1', 'class -1'])
plt.title('ground truth')

#plot prediction results
plt.figure()
plt.plot(testing_samples[predicted_labels==1,0],testing_samples[predicted_labels==1, 1],'mo')
plt.plot(testing_samples[predicted_labels==-1,0],testing_samples[predicted_labels==-1, 1],'c^')
plt.legend(['class 1', 'class -1'])
plt.title('prediction results of LR (error rate: %.03f)' % (error_rate))

# plot decision boundary
if sample_dim ==2:
    minx = min(testing_samples[:, 0])
    maxx = max(testing_samples[:, 0])

    # boundary is the line w[0]*x + w[1]*y + w[2] = 0, solved for y
    x = np.linspace(minx, maxx, testing_samples.shape[0])
    y = -(w[0]*x+w[2])/w[1]

    plt.plot(x,y,'g')

In [None]:
# confusion matrix of LR
# rows = true class, columns = predicted class, both in the order of class_labels
class_labels = [-1, 1]
confusion_matrix = np.zeros((2,2))
for i in range(2):
    for j in range(2):
        confusion_matrix[i,j] = np.sum(np.logical_and(testing_labels == class_labels[i], predicted_labels == class_labels[j]))

# Row/column names are ordered lists that follow class_labels (-1 first, then 1);
# a set has no defined order and could attach the names to the wrong rows/columns.
pandas.DataFrame(confusion_matrix, index=["true class-1", "true class1"], columns=['predicted class-1', 'predicted class1'])

In [None]:
# smaller training set, without regularization
# Only the first 10 training samples are used; evaluation is still on the full testing split.
LR.train(training_samples[:10], training_labels[:10])
predicted_labels, error_rate = LR.test(testing_samples, testing_labels)
print('error rate: %.03f' % (error_rate))

In [None]:
# smaller training set, with regularization
# Same 10 training samples as the unregularized run, now with regularizer=10 added to the diagonal of X'X.
LR.train(training_samples[:10], training_labels[:10], regularizer=10)
predicted_labels, error_rate = LR.test(testing_samples, testing_labels)
print('error rate: %.03f' % (error_rate))

In [None]:
#tuning parameter regularizer to have the best performance

# show error rate change v.s. regularizer (0 - 100)

regularizer_values = list(range(100))

# one testing error rate per regularizer value, trained on the first 10 samples only
ER = []
for regularizer in regularizer_values:
    LR.train(training_samples[:10], training_labels[:10], regularizer=regularizer, show_w=False)
    predicted_labels, error_rate = LR.test(testing_samples, testing_labels)
    ER.append(error_rate)

plt.plot(regularizer_values, ER, '-x')
plt.xlabel('regularizer')
plt.ylabel('error rate')

In [None]:
# try a larger regularizer range, i.e. (0 - 1000)

regularizer_values = list(range(1000))

# one testing error rate per regularizer value, trained on the first 10 samples only
ER = []
for regularizer in regularizer_values:
    LR.train(training_samples[:10], training_labels[:10], regularizer=regularizer, show_w=False)
    predicted_labels, error_rate = LR.test(testing_samples, testing_labels)
    ER.append(error_rate)

plt.plot(regularizer_values, ER, '-x')
plt.xlabel('regularizer')
plt.ylabel('error rate')

In [None]:
# perform cross validation
fold_num = 10
fold_size = sample_num // fold_num

fold_error_rate = []
for k in range(fold_num):

    # [fold_beg, fold_end) is the slice of shuffled indices held out for testing in fold k
    fold_beg, fold_end = k*fold_size, (k+1)*fold_size
    held_out = idx[fold_beg:fold_end]

    testing_samples_cv, testing_labels_cv = samples[held_out], labels[held_out]

    # everything that is not held out is used for training
    training_samples_cv = np.delete(samples, held_out, 0)
    training_labels_cv = np.delete(labels, held_out, 0)

    LR.train(training_samples_cv, training_labels_cv, show_w=False)
    predicted_labels, error_rate = LR.test(testing_samples_cv, testing_labels_cv)

    fold_error_rate.append(error_rate)

print("error rate of each fold: ", fold_error_rate)
print('mean error of all fold:', np.mean(fold_error_rate))

In [None]:
# compare performances of NBC and LR for different training data size

nbc_errors = []
lr_errors = []
ts_start, ts_stop, ts_step = 10, 100, 2 #  training data size list
training_sizes = list(range(ts_start, ts_stop, ts_step))
for ts in training_sizes:

    #divide training and testing: first ts shuffled indices train, the rest test
    np.random.shuffle(idx)
    training_idx = idx[:ts]
    training_samples = samples[training_idx]
    training_labels = labels[training_idx]

    testing_idx = idx[ts:]
    testing_samples = samples[testing_idx]
    testing_labels = labels[testing_idx]

    #NBC
    NBC.train(training_samples, training_labels)
    predicted_labels, error_rate = NBC.test(testing_samples, testing_labels)
    nbc_errors.append(error_rate)

    #LR
    LR.train(training_samples, training_labels, show_w=False)
    predicted_labels, error_rate = LR.test(testing_samples, testing_labels)
    lr_errors.append(error_rate)


plt.plot(training_sizes, nbc_errors, 'r-o')
plt.plot(training_sizes, lr_errors, 'b-^')
# ordered list (not a set) so each label is attached to the matching curve
plt.legend(['NBC errors', 'LR errors'])
plt.xlabel('training set size')
plt.ylabel('error rate')

# Linear Discriminant Analysis

In [None]:
class LinearDiscriminantAnalysis(object):
    """Two-class Fisher linear discriminant for labels in {1, -1}.

    Samples are projected onto a single direction ``w`` and assigned to the
    class whose projected mean is nearer.
    """

    def __init__(self, sample_dim):
        self.sample_dim = sample_dim

    def train(self, training_samples, training_labels):
        """Compute the projection direction and the projected class means.

        Parameters
        ----------
        training_samples : ndarray, shape (N, D)
        training_labels : ndarray, shape (N,), must contain exactly {-1, 1}

        Returns
        -------
        w : ndarray, shape (D,)   projection vector
        u0 : ndarray, shape (D,)  mean of class 1 (before projection)
        u1 : ndarray, shape (D,)  mean of class -1 (before projection)

        Raises
        ------
        Exception
            If the labels are not exactly the two values -1 and 1.
        """
        # check labels  {-1, 1}
        u = np.unique(training_labels)
        if len(u) != 2 or u[0] != -1 or u[1] !=1:
            raise Exception('Label range must be -1 and 1')

        class0 = training_samples[training_labels==1]
        class1 = training_samples[training_labels==-1]

        # mean vector of each class before projection
        u0 = np.mean(class0, axis=0)
        u1 = np.mean(class1, axis=0)

        # per-class scatter: sum of outer products of the centred samples
        centred0 = class0 - u0
        centred1 = class1 - u1
        Cov0 = centred0.T.dot(centred0)
        Cov1 = centred1.T.dot(centred1)

        # within class scatter matrix (each class normalised by its own size)
        Sw = Cov0/class0.shape[0] + Cov1/class1.shape[0]

        # optimal projection vector; the pseudo-inverse equals the inverse when Sw
        # is invertible and still works when Sw is singular (e.g. a constant feature)
        w = np.dot(np.linalg.pinv(Sw), u0-u1)

        # mean of each class after projection
        m0 = np.dot(w, u0)
        m1 = np.dot(w, u1)

        #save variables for testing stage
        self.w = w
        self.m0 = m0
        self.m1 = m1

        return w, u0, u1

    def test(self, testing_samples, testing_labels = None):
        """Classify samples by the nearest projected class mean.

        Returns
        -------
        predicted_labels : float ndarray, shape (N,); 1 where the projection is
            strictly closer to the class-1 mean, -1 otherwise
        error_rate : fraction of mismatching labels, or None when
            ``testing_labels`` is not given.
        """
        sample_num = testing_samples.shape[0]

        # project every sample onto w
        projected = np.dot(testing_samples, self.w)

        # distance to the projected mean of class 1 and of class -1
        d0 = np.abs(projected - self.m0)
        d1 = np.abs(projected - self.m1)

        # classify to nearest class
        predicted_labels = np.where(d0 < d1, 1.0, -1.0)

        #compute error rate
        if testing_labels is not None:
            error_rate = np.count_nonzero(testing_labels != predicted_labels)/sample_num
        else:
            error_rate = None

        return predicted_labels, error_rate
    

In [None]:
#train and test LDA (LinearDiscriminantAnalysis.train takes no regularization parameter)
LDA = LinearDiscriminantAnalysis(sample_dim)
# train() returns the projection vector and the two class means (class 1, class -1)
w, u0, u1 = LDA.train(training_samples, training_labels)
predicted_labels, error_rate = LDA.test(testing_samples, testing_labels)

In [None]:
#plot ground truth of testing samples
plt.plot(testing_samples[testing_labels==1,0], testing_samples[testing_labels==1, 1],'ro')
plt.plot(testing_samples[testing_labels==-1,0], testing_samples[testing_labels==-1, 1],'g^')
# ordered list (not a set) so each label is attached to the matching plot call
plt.legend(['class 1', 'class -1'])
plt.title('ground truth')

#plot prediction results
plt.figure()
plt.plot(testing_samples[predicted_labels==1,0],testing_samples[predicted_labels==1, 1],'mo')
plt.plot(testing_samples[predicted_labels==-1,0],testing_samples[predicted_labels==-1, 1],'c^')
plt.legend(['class 1', 'class -1'])
plt.title('prediction results of LDA (error rate: %.03f)' % (error_rate))

#plot the class means returned by LDA.train (feature-space means, i.e. before projection)
plt.plot(u0[0], u0[1], 'ko')
plt.plot(u1[0], u1[1], 'k^')



# Logistic Regression

In [None]:
class LogisticRegression(object):
    """Binary logistic regression for labels in {0, 1}, fitted by batch gradient ascent."""

    def __init__(self, sample_dim):
        self.sample_dim = sample_dim

    @staticmethod
    def _sigmoid(z):
        """Logistic function 1/(1+exp(-z)), written with tanh so large |z| cannot overflow."""
        return 0.5*(1.0 + np.tanh(0.5*z))

    def train(self, training_samples, training_labels, learning_rate=0.01, max_iterations=100):
        """Fit weights ``w`` and bias ``b`` on the training set.

        Parameters
        ----------
        training_samples : ndarray, shape (N, sample_dim)
        training_labels : ndarray, shape (N,), must contain exactly {0, 1}
        learning_rate : step size applied to the summed (not averaged) gradient
        max_iterations : number of full-batch update steps

        Returns
        -------
        w : ndarray, shape (sample_dim,)
        b : float

        Raises
        ------
        Exception
            If the labels are not exactly the two values 0 and 1.
        """
        # check labels are exactly {0, 1}
        u = np.unique(training_labels)
        if len(u) != 2 or u[0] != 0 or u[1] !=1:
            raise Exception('Label range must be 0 and 1')

        X = np.asarray(training_samples, dtype=float)
        y = np.asarray(training_labels, dtype=float)

        #init w and b (w is drawn from numpy's global random state)
        w = np.random.rand(self.sample_dim)
        b = 0.0

        for _ in range(max_iterations):
            # p(yi=1|xi, w, b) for every sample at once
            p = self._sigmoid(X.dot(w) + b)

            # gradient of the log-likelihood: sum_i xi*(yi - pi) and sum_i (yi - pi)
            residual = y - p
            w += learning_rate*X.T.dot(residual)
            b += learning_rate*residual.sum()

        self.w = w
        self.b = b
        return w, b

    def test(self, testing_samples, testing_labels = None):
        """Classify samples with the fitted model.

        Returns
        -------
        predicted_labels : float ndarray, shape (N,), values in {0, 1}
        error_rate : fraction of mismatching labels, or None when
            ``testing_labels`` is not given.
        """
        sample_num = testing_samples.shape[0]

        logits = np.dot(testing_samples, self.w) + self.b

        # p(y=1|x) < 0.5 exactly when the logit is negative, so the sign decides
        # the class and no exponential has to be evaluated
        predicted_labels = np.where(logits < 0, 0.0, 1.0)

        #compute error rate
        if testing_labels is not None:
            error_rate = np.count_nonzero(testing_labels != predicted_labels)/sample_num
        else:
            error_rate = None

        return predicted_labels, error_rate
        

In [None]:
#convert labels from {-1, 1} to {0, 1}
# (LogisticRegression.train raises unless the labels are exactly {0, 1}; copies leave the {-1, 1} arrays untouched)
training_labels2 = training_labels.copy()
training_labels2[training_labels2==-1] = 0
testing_labels2 = testing_labels.copy()
testing_labels2[testing_labels2==-1] = 0

LogR = LogisticRegression(sample_dim)
# only 10 update steps are run here (train() defaults to max_iterations=100)
w,b = LogR.train(training_samples, training_labels2, learning_rate=0.01, max_iterations=10)
predicted_labels, error_rate = LogR.test(testing_samples, testing_labels2)

In [None]:
#plot ground truth of testing samples
plt.plot(testing_samples[testing_labels2==1,0], testing_samples[testing_labels2==1, 1],'ro')
plt.plot(testing_samples[testing_labels2==0,0], testing_samples[testing_labels2==0, 1],'g^')
# ordered list (not a set) so each label is attached to the matching plot call
plt.legend(['class 1', 'class 0'])
plt.title('ground truth')

#plot prediction results
plt.figure()
plt.plot(testing_samples[predicted_labels==1,0],testing_samples[predicted_labels==1, 1],'mo')
plt.plot(testing_samples[predicted_labels==0,0],testing_samples[predicted_labels==0, 1],'c^')
plt.legend(['class 1', 'class 0'])
# 'LogR' (not 'LR') so the figure is not mistaken for the linear regressor's result
plt.title('prediction results of LogR (error rate: %.03f)' % (error_rate))


# SVM Classifier

In [None]:
#SVM with linear kernel
clf = svm.SVC(kernel = 'linear', C=1)
clf.fit(training_samples, training_labels)  
predicted_labels = clf.predict(testing_samples)
# fraction of testing samples whose predicted label differs from the true label
svm_error_rate = np.sum(predicted_labels != testing_labels)/len(testing_labels)
print('SVM error rate: %.03f' % svm_error_rate)

In [None]:
#plot ground truth of testing samples
plt.plot(testing_samples[testing_labels==1,0], testing_samples[testing_labels==1, 1],'ro')
plt.plot(testing_samples[testing_labels==-1,0], testing_samples[testing_labels==-1, 1],'g^')
# ordered list (not a set) so each label is attached to the matching plot call
plt.legend(['class 1', 'class -1'])
plt.title('ground truth')

#plot prediction results
plt.figure()
plt.plot(testing_samples[predicted_labels==1,0],testing_samples[predicted_labels==1, 1],'mo')
plt.plot(testing_samples[predicted_labels==-1,0],testing_samples[predicted_labels==-1, 1],'c^')
plt.legend(['class 1', 'class -1'])
# svm_error_rate is the SVM's own result; error_rate still holds the previous model's value
plt.title('prediction results of SVM (error rate: %.03f)' % (svm_error_rate))

In [None]:
#plot support vectors
plt.figure()
SVs = clf.support_vectors_
nSV = clf.n_support_
plt.plot(training_samples[training_labels==1,0],training_samples[training_labels==1, 1],'mo', label = 'class 1')
plt.plot(training_samples[training_labels==-1,0],training_samples[training_labels==-1, 1],'c^', label='class -1')
# support_vectors_ is grouped by class in the order of clf.classes_ (sorted labels),
# so the first nSV[0] rows belong to class -1 and the remaining rows to class 1
neg_SVs = SVs[:nSV[0]]
pos_SVs = SVs[nSV[0]:]
plt.plot(pos_SVs[:,0], pos_SVs[:,1],'g*', label='class 1 SVs')
plt.plot(neg_SVs[:,0], neg_SVs[:,1],'b*', label='class -1 SVs')
plt.legend()
plt.title('Support vectors of linear kernel')

In [None]:
# support vector with Gaussian kernel (Radial Basis Kernel)

# rebinds clf, so later cells that read clf use this RBF model
clf = svm.SVC(kernel = 'rbf', C=0.1, gamma=0.5)
clf.fit(training_samples, training_labels)  
predicted_labels = clf.predict(testing_samples)
# fraction of testing samples whose predicted label differs from the true label
svm_error_rate = np.sum(predicted_labels != testing_labels)/len(testing_labels)
print('SVM error rate: %.03f' % svm_error_rate)

In [None]:
#plot support vectors
plt.figure()
SVs = clf.support_vectors_
nSV = clf.n_support_
plt.plot(training_samples[training_labels==1,0],training_samples[training_labels==1, 1],'mo', label = 'class 1')
plt.plot(training_samples[training_labels==-1,0],training_samples[training_labels==-1, 1],'c^', label='class -1')
# support_vectors_ is grouped by class in the order of clf.classes_ (sorted labels),
# so the first nSV[0] rows belong to class -1 and the remaining rows to class 1
neg_SVs = SVs[:nSV[0]]
pos_SVs = SVs[nSV[0]:]
plt.plot(pos_SVs[:,0], pos_SVs[:,1],'g*', label='class 1 SVs')
plt.plot(neg_SVs[:,0], neg_SVs[:,1],'b*', label='class -1 SVs')
plt.legend()
plt.title('Support vectors of Gaussian kernel')

In [None]:
# compare performances of NBC, LR, LDA, SVM for different training data size

nbc_errors = []
lr_errors = []
lda_errors = []
logR_errors = []
svm_errors = []
ts_start, ts_stop, ts_step = 30, 200, 10 #  training data size list
training_sizes = list(range(ts_start, ts_stop, ts_step))
for ts in training_sizes:

    #divide training and testing: first ts shuffled indices train, the rest test
    np.random.shuffle(idx)
    training_idx = idx[:ts]
    training_samples = samples[training_idx]
    training_labels = labels[training_idx]

    testing_idx = idx[ts:]
    testing_samples = samples[testing_idx]
    testing_labels = labels[testing_idx]

    #NBC
    NBC.train(training_samples, training_labels)
    predicted_labels, error_rate = NBC.test(testing_samples, testing_labels)
    nbc_errors.append(error_rate)

    #LR
    LR.train(training_samples, training_labels, show_w=False)
    predicted_labels, error_rate = LR.test(testing_samples, testing_labels)
    lr_errors.append(error_rate)

    #LDA
    LDA.train(training_samples, training_labels)
    predicted_labels, error_rate = LDA.test(testing_samples, testing_labels)
    lda_errors.append(error_rate)

    #LogR (needs {0, 1} labels, so work on converted copies)
    training_labels2 = training_labels.copy()
    training_labels2[training_labels2==-1]=0
    testing_labels2 = testing_labels.copy()
    testing_labels2[testing_labels2==-1] = 0
    LogR.train(training_samples, training_labels2)
    predicted_labels, error_rate = LogR.test(testing_samples, testing_labels2)
    logR_errors.append(error_rate)

    #SVM (clf is whichever SVC was fitted most recently in the notebook)
    clf.fit(training_samples, training_labels)
    predicted_labels = clf.predict(testing_samples)
    svm_error_rate = np.sum(predicted_labels != testing_labels)/len(testing_labels)
    svm_errors.append(svm_error_rate)

plt.plot(training_sizes, nbc_errors, 'r-o')
plt.plot(training_sizes, lr_errors, 'g-^')
plt.plot(training_sizes, lda_errors, 'b-^')
plt.plot(training_sizes, logR_errors, 'c-^')
plt.plot(training_sizes, svm_errors, 'k-^')
# ordered list (not a set), in the same order as the plot calls above
plt.legend(['NBC errors', 'LR errors', 'LDA errors', 'LogR errors', 'SVM errors'])
plt.xlabel('training set size')
plt.ylabel('error rate')

# Cross Validation on Watermelon 3.0

In [None]:
#load data
# Names are chosen so that the built-ins `dict` and `str` are not rebound.
mat_contents = sio.loadmat('watermelon')
dataset = mat_contents['dataset']
samples = dataset[:, :8]
# astype(float) makes a copy, so relabelling below does not write into `dataset`
labels = dataset[:, -1].astype(float)
labels = np.reshape(labels, -1) #change 2d vector (n, 1) to 1d vector (n,)
labels[labels==0]=-1   # class labels {0, 1} -> {-1, 1}

sample_num, sample_dim = samples.shape # N x D
print("Sample number: %d, feature dimensionality: %d" % (sample_num, sample_dim))

# column headers: feature1 .. featureD followed by the label column
col_names = ['feature%d' % (i + 1) for i in range(sample_dim)]
col_names.append('labels')

idx = list(range(sample_num)) # randomly shuffle indices
np.random.shuffle(idx)
samples = samples[idx]
labels = labels[idx]
pandas.DataFrame(np.concatenate((samples, labels.reshape(sample_num,1)), axis=1), columns=col_names)

In [None]:
fold_num = 5
# array_split yields fold_num contiguous index blocks whose sizes differ by at most
# one, so every sample is used for testing exactly once even when sample_num is not
# a multiple of fold_num
folds = np.array_split(np.arange(sample_num), fold_num)

nbc_errors = []
lr_errors = []
lda_errors = []
logR_errors = []
svm_errors = []

NBC = TwoClassNaiveBayesClassifier(sample_dim)
LR = LinearRegressor(sample_dim)
LDA = LinearDiscriminantAnalysis(sample_dim)
LogR = LogisticRegression(sample_dim)
SVMC = svm.SVC(kernel = 'linear', C=1)

for k, fold in enumerate(folds):

    #divide training and testing set
    testing_idx = fold.tolist()
    testing_samples = samples[testing_idx]
    testing_labels = labels[testing_idx]

    training_samples = np.delete(samples, testing_idx, 0)
    training_labels = np.delete(labels,testing_idx)
    print('fold %d, testing samples:' % k, testing_idx)
    print(testing_samples)
    print('\n')

    #NBC
    NBC.train(training_samples, training_labels)
    predicted_labels, error_rate = NBC.test(testing_samples, testing_labels)
    nbc_errors.append(error_rate)

    #LR
    LR.train(training_samples, training_labels, show_w=False)
    predicted_labels, error_rate = LR.test(testing_samples, testing_labels)
    lr_errors.append(error_rate)

    #LDA
    LDA.train(training_samples, training_labels)
    predicted_labels, error_rate = LDA.test(testing_samples, testing_labels)
    lda_errors.append(error_rate)

    #LogR (needs {0, 1} labels, so work on converted copies)
    training_labels2 = training_labels.copy()
    training_labels2[training_labels2==-1]=0
    testing_labels2 = testing_labels.copy()
    testing_labels2[testing_labels2==-1] = 0
    LogR.train(training_samples, training_labels2, learning_rate=0.05, max_iterations=200)
    predicted_labels, error_rate = LogR.test(testing_samples, testing_labels2)
    logR_errors.append(error_rate)

    #SVM
    SVMC.fit(training_samples, training_labels)
    predicted_labels = SVMC.predict(testing_samples)
    svm_error_rate = np.sum(predicted_labels != testing_labels)/len(testing_labels)
    svm_errors.append(svm_error_rate)


In [None]:
# mean cross-validation error of each model, one line per model
mean_nbc = np.mean(nbc_errors)
mean_lr = np.mean(lr_errors)
mean_lda = np.mean(lda_errors)
mean_logR = np.mean(logR_errors)
mean_svm = np.mean(svm_errors)

print('NBC: %.03f' % mean_nbc)
print('LR: %0.03f' % mean_lr)
print('LDA: %.03f' % mean_lda)
print('LogR: %.03f' % mean_logR)
print('SVM: %.03f' % mean_svm)


A: Because this dataset is not linearly separable, some points cannot be classified correctly; the soft-margin SVM tolerates these misclassified points through slack variables.