In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split #Please remove this line and implement your own CV
import seaborn as sns
%matplotlib inline

In [2]:
#Function is to generate the center 
def generate_center(n=20,mu1=(0,1),mu2=(1,0),sigma=0.5):
    center_1 = np.random.normal(size=(int(n/2),2),loc=mu1)
    center_2 = np.random.normal(size=(int(n/2),2),loc=mu2)
    center = np.vstack((center_1,center_2))
    return center

#Function to generate data 
def generate_data(n_size=100,center=[[0,1]],sigma=0.5):
    center_0_data = np.empty(shape=[1,0],dtype=int)
    center_1_data = np.empty(shape=[1,0],dtype=int)
    x_train = np.empty(shape=[0,2])
    y_train = np.empty(shape=[0,1],dtype=int)
    for num_sample in np.arange(n_size):
        rand_mu1 = np.random.randint(1,10) #Generate random number between 1 & 10 (mean = (0,1))
        rand_mu2 = np.random.randint(11,20) #Generate random number between 1 & 10 (mean = (1,0))
        center_0_data = np.append(center_0_data,rand_mu1) #Just to check how many random from which centers
        center_1_data = np.append(center_1_data,rand_mu2) #Just to check how many random from which centers
        center_mu1 = center[rand_mu1,:]
        center_mu2 = center[rand_mu2,:]
        #print ("Rand Mu1:{} Mu2:{} Center Mu1:{} Center Mu2:{}".format(rand_mu1,rand_mu2,center_mu1,center_mu2))
        x_train_class_0 = np.random.normal(loc=center_mu1,scale=sigma)
        x_train_class_1 = np.random.normal(loc=center_mu2,scale=sigma)
        y_train_class_0 = 0
        y_train_class_1 = 1
        x_train_ite = np.vstack((x_train_class_0,x_train_class_1))
        y_train_ite = np.append(y_train_class_0,y_train_class_1)
        x_train = np.vstack((x_train,x_train_ite))
        y_train = np.append(y_train,y_train_ite)
        
        #x_train = np.concatenate((x_train_class_0,x_train_class_1))
        #y_train = np.concatenate((y_train_class_0,y_train_class_1))
    return x_train,y_train#,center_0_data,center_1_data

'''
def generate_data(n_size=200,mu1=(0,1),mu2=(1,0),sigma=0.5, radom_seed=665462521):
    x_train_class_0 = np.random.normal(size=((int(n_size/2)),2),loc=mu1,scale=sigma)
    x_train_class_1 = np.random.normal(size=((int(n_size/2)),2),loc=mu2,scale=sigma)
    y_train_class_0 = [0]*int(n_size/2)
    y_train_class_1 = [1]*int(n_size/2)
    x_train = np.concatenate((x_train_class_0,x_train_class_1))
    y_train = np.concatenate((y_train_class_0,y_train_class_1))
    return x_train,y_train
'''
def plot_data(x_train,mu1=(0,1),mu2=(1,0)):
    n = int(x_train.shape[0]/2)
    plt.figure(figsize=(8,6))
    plt.scatter(x_train[:n,0],x_train[:n,1],label = "Class 0",alpha = 0.3)
    plt.scatter(x_train[n:,0],x_train[n:,1],label = "Class 1",alpha = 0.3)
    plt.scatter(mu1[0],mu1[1],marker="+",color="blue",s=200)
    plt.scatter(mu2[0],mu2[1],marker="+",color="red",s=200)
    plt.legend()
    plt.show()
    
def train_linear_mode(x_train,y_train):
    lin_reg = LinearRegression()
    lin_reg.fit(x_train,y_train)
    
def linear_model_predict(data,model):
    y_predict_val = lin_reg.predict(data)
    y_pred = [1 if i >= 0.5 else 0 for i in y_predict_val]
    return y_pred

def bayes_predict(data, mu1=(0,1),mu2=(1,0)):
    bayes_val = 2*np.dot(data, np.subtract(mu2, mu1)) - (np.dot(mu2, mu2) - np.dot(mu1, mu2))
    bayes_pred = [i > 0 for i in bayes_val]
    return bayes_pred
    
def error_cal(y_true,y_predict):
    n = pd.Series(y_predict).shape[0]
    lin_error = np.sum(y_true != y_predict)/n
    return lin_error    
    
def plot_boxplot(data,labels,title = "Box Plot"):
    fig1, ax1 = plt.subplots()
    ax1.set_title(title)
    ax1.boxplot(data,labels=labels)
    plt.show()

In [None]:
#Set the Seed to have consistent output for all Runs
random_seed = 665462521
np.random.seed(random_seed)
num_iteration = 20
num_center = 20
k = np.arange(1,181) # Number of K for KNearestNeighbors
lin_error = np.zeros([20,2])
bayes_lin_error = np.zeros([20,2])
lin_quad_error = np.zeros([20,2])
bayes_lin_quad_error = np.zeros([20,2])
print_ind = False

#Generate the 20 Centers
center = generate_center(num_center)

#Loop through the 20 iterations to generate 20 Train / Test Samples & Fit -> Predict 
for i in np.arange(num_iteration):
    #Generate the Train & Test Data based on the Center
    x_train,y_train = generate_data(n_size=100,center=center)
    x_test,y_test = generate_data(n_size=5000,center=center)

    #Plot Train Data
    if (print_ind):
        plot_data(x_train)

    #Generate the Quadratic Feature
    x_train_qudratic = np.hstack((x_train,(x_train[:,0]**2).reshape(x_train.shape[0],1),(x_train[:,1]**2).reshape(x_train.shape[0],1),(x_train[:,0]*x_train[:,1]).reshape(x_train.shape[0],1)))
    x_test_quadratic = np.hstack((x_test,(x_test[:,0]**2).reshape(x_test.shape[0],1),(x_test[:,1]**2).reshape(x_test.shape[0],1),(x_test[:,0]*x_test[:,1]).reshape(x_test.shape[0],1)))
    

    #Linear Regression Prediction (Non Quadrtic)
    lin_reg = LinearRegression()
    lin_reg.fit(x_train,y_train)
    y_train_pred = linear_model_predict(data=x_train,model = lin_reg)
    y_test_pred = linear_model_predict(data=x_test,model = lin_reg)
    lin_error_train = error_cal(y_train,y_train_pred)
    lin_error_test = error_cal(y_test,y_test_pred)
    lin_error[i,0] = lin_error_train
    lin_error[i,1] = lin_error_test
    
    #Bayes Prediction (Non Quadratic)
    y_train_pred = bayes_predict(data=x_train)
    y_test_pred = bayes_predict(data=x_test)
    bayes_error_train = error_cal(y_train,y_train_pred)
    bayes_error_test = error_cal(y_test,y_test_pred)
    bayes_lin_error[i,0] = bayes_error_train
    bayes_lin_error[i,1] = bayes_error_test
        
    #Linear Regression Prediction (Quadratic)
    lin_reg = LinearRegression()
    lin_reg.fit(x_train_qudratic,y_train)
    y_train_pred = linear_model_predict(data=x_train_qudratic,model = lin_reg)
    y_test_pred = linear_model_predict(data=x_test_quadratic,model = lin_reg)
    lin_quad_error_train = error_cal(y_train,y_train_pred)
    lin_quad_error_test = error_cal(y_test,y_test_pred)
    lin_quad_error[i,0] = lin_quad_error_train
    lin_quad_error[i,1] = lin_quad_error_test
    
    #K Nearest Neighbors
    knn_error_for_k = np.zeros(k.shape[0])
    for val in k:
        x_train_train,x_train_val,y_train_train,y_train_val = train_test_split(x_train,y_train,test_size=0.10,random_state = random_seed)
        knn = KNeighborsClassifier(n_neighbors=val)
        knn.fit(x_train_train,y_train_train)
        y_train_predict = knn.predict(x_train_val)
        
        knn_error = error_cal(y_train_val,y_train_predict)
        knn_error_for_k[val-1] = knn_error
        
        #print ("y_train_val:{}".format(y_train_val))
        #print ("y_train_pre:{}".format(y_train_predict))
        #print ("Value of k:{} Error:{}".format(val,knn_error))
    print ("Error Found for the K: {}".format(pd.Series(knn_error_for_k).value_counts()))
    print ("K Value - Minimim Error Found: {}".format(np.argmin(knn_error_for_k)+1))
    print ("Minimim Error Found for K Value: {}".format(knn_error_for_k[np.argmin(knn_error_for_k)]))
    

    #Bayes Prediction (Quadratic)    
    #y_train_pred = bayes_predict(data=x_train_qudratic)
    #y_test_pred = bayes_predict(data=x_test_quadratic)
    #bayes_quad_error_train = eror_cal(y_train,y_train_pred)
    #bayes_quad_error_test = eror_cal(y_test,y_test_pred)
    #bayes_lin_quad_error[i,0] = bayes_quad_error_train
    #bayes_lin_quad_error[i,1] = bayes_quad_error_test
    
    
    if (print_ind):
        print ("Iteration : {} Linear Error : Train : {} : Test: {}".format(i,lin_error_train,lin_error_test))
        print ("Iteration : {} Bayes Error  : Train : {} : Test: {}".format(i,bayes_error_train,bayes_error_test))
        
        print ("Iteration : {} Linear Quad Error : Train : {} : Test: {}".format(i,lin_quad_error_train,lin_quad_error_test))
        #print ("Iteration : {} Bayes Quad Error  : Train : {} : Test: {}".format(i,bayes_quad_error_train,bayes_quad_error_test))



#Plot Box Plot
labels = ["Train Error","Test Error"]
#plot_boxplot(data=lin_error, labels = labels, title = "Box Plot for Linear Regression")
#plot_boxplot(data=lin_quad_error, labels = labels, title = "Box Plot for Linear Regression - Quadratic")
#plot_boxplot(data=bayes_lin_error, labels = labels, title = "Box Plot for Bayes Error - Non Quadratic")

#plot_boxplot(data=bayes_lin_quad_error, labels = labels, title = "Box Plot for Bayes Error - Quadratic")


Error Found for the K: 0.10    110
0.15     38
0.20     13
0.30      8
0.25      4
0.55      3
0.40      2
0.35      1
0.50      1
dtype: int64
K Value - Minimim Error Found: 31
Minimim Error Found for K Value: 0.1
Error Found for the K: 0.25    94
0.20    59
0.30    13
0.15     4
0.55     3
0.45     2
0.40     2
0.35     2
0.50     1
dtype: int64
K Value - Minimim Error Found: 134
Minimim Error Found for K Value: 0.15
Error Found for the K: 0.10    121
0.15     40
0.05      7
0.20      7
0.55      3
0.30      1
0.25      1
dtype: int64
K Value - Minimim Error Found: 107
Minimim Error Found for K Value: 0.05
Error Found for the K: 0.30    63
0.35    63
0.25    35
0.40    14
0.55     3
0.20     2
dtype: int64
K Value - Minimim Error Found: 29
Minimim Error Found for K Value: 0.2
Error Found for the K: 0.10    84
0.15    45
0.20    20
0.30    15
0.25     9
0.55     4
0.50     2
0.35     1
dtype: int64
K Value - Minimim Error Found: 7
Minimim Error Found for K Value: 0.1
Error Found for t

In [10]:
np.min(knn_error_for_k)

0.15

In [11]:
knn_error_for_k[np.argmin(knn_error_for_k)]

0.15

In [6]:
np.argmin(knn_error_for_k)

2