# Iris

In [75]:
import numpy as np 
from sklearn.neighbors import NearestCentroid, KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


def run(x_train, y_train, x_test, y_test, clf):
    """Fit ``clf`` on the training split and report its test-set behaviour.

    Prints the labels predicted for ``x_test``, the true labels ``y_test``,
    the value of ``clf.score`` to four decimals, and a trailing blank line.
    Nothing is returned.
    """
    clf.fit(x_train, y_train)
    predicted = clf.predict(x_test)
    print("predictions: {}".format(predicted))
    print("Actual labels: {}".format(y_test))
    accuracy = clf.score(x_test, y_test)
    print("score: {:0.4f}".format(accuracy))
    print()


def main():
    """Train and score a series of classic classifiers on the Iris data.

    Loads the feature and label arrays from ``./data/iris``, uses the first
    120 samples for training and the remainder for testing, and calls ``run``
    for each model.  The augmented train/test arrays come pre-split from
    their own files.
    """
    features = np.load("./data/iris/iris_features.npy")
    labels = np.load("./data/iris/iris_labels.npy")

    N = 120  # samples before this index train, the rest test
    x_train, x_test = features[:N], features[N:]
    y_train, y_test = labels[:N], labels[N:]

    xa_train = np.load("./data/iris/iris_train_features_augmented.npy")
    ya_train = np.load("./data/iris/iris_train_labels_augmented.npy")
    xa_test = np.load("./data/iris/iris_test_features_augmented.npy")
    ya_test = np.load("./data/iris/iris_test_labels_augmented.npy")

    original = (x_train, y_train, x_test, y_test)
    augmented = (xa_train, ya_train, xa_test, ya_test)

    # NOTE(review): the Random Forest and the first two SVM runs use the
    # augmented arrays although their titles do not say so -- confirm this
    # is intended before comparing their scores with the models above.
    experiments = [
        ("Nearest centroid:", original, NearestCentroid()),
        ("k-NN classifier (k=3):", original, KNeighborsClassifier(n_neighbors=3)),
        ("Naive Bayes classifier (Gaussian):", original, GaussianNB()),
        ("Naive Bayes classifier (Multinomial):", original, MultinomialNB()),
        ("Decision Tree classifier:", original, DecisionTreeClassifier()),
        ("Random Forest classifier (estimators=5):", augmented,
         RandomForestClassifier(n_estimators=5)),
        ("SVM (linear, C=1.0):", augmented, SVC(kernel="linear", C=1.0)),
        ("SVM (RBF, C=1.0, gamma=0.25):", augmented,
         SVC(kernel="rbf", C=1.0, gamma=0.25)),
        ("SVM (RBF, C=1.0, gamma=0.001, augmented)", augmented,
         SVC(kernel="rbf", C=1.0, gamma=0.001)),
        ("SVM (RBF, C=1.0, gamma=0.001, original)", original,
         SVC(kernel="rbf", C=1.0, gamma=0.001)),
    ]
    for title, data, model in experiments:
        print(title)
        run(*data, model)

main()
    


Nearest centroid:
predictions: [0 0 1 0 0 2 1 1 1 1 0 0 1 2 1 2 1 1 0 0 2 1 2 0 0 1 2 1 1 1]
Actual labels: [0 0 1 0 0 2 2 1 1 1 0 0 1 2 2 2 1 1 0 0 2 2 2 0 0 1 2 1 1 2]
score: 0.8667

k-NN classifier (k=3):
predictions: [0 0 1 0 0 2 1 1 1 1 0 0 1 1 2 2 1 1 0 0 2 2 2 0 0 1 2 1 1 2]
Actual labels: [0 0 1 0 0 2 2 1 1 1 0 0 1 2 2 2 1 1 0 0 2 2 2 0 0 1 2 1 1 2]
score: 0.9333

Naive Bayes classifier (Gaussian):
predictions: [0 0 1 0 0 2 1 1 1 1 0 0 1 1 2 2 1 1 0 0 2 2 2 0 0 1 2 1 1 2]
Actual labels: [0 0 1 0 0 2 2 1 1 1 0 0 1 2 2 2 1 1 0 0 2 2 2 0 0 1 2 1 1 2]
score: 0.9333

Naive Bayes classifier (Multinomial):
predictions: [0 0 1 0 0 2 2 1 1 1 0 0 1 1 2 2 1 1 0 0 2 2 2 0 0 1 2 1 1 2]
Actual labels: [0 0 1 0 0 2 2 1 1 1 0 0 1 2 2 2 1 1 0 0 2 2 2 0 0 1 2 1 1 2]
score: 0.9667

Decision Tree classifier:
predictions: [0 0 1 0 0 2 1 1 1 1 0 0 1 1 1 2 1 1 0 0 2 1 2 0 0 1 2 1 1 2]
Actual labels: [0 0 1 0 0 2 2 1 1 1 0 0 1 2 2 2 1 1 0 0 2 2 2 0 0 1 2 1 1 2]
score: 0.8667

Random Forest classifier 

# Breast Cancer

In [91]:
import numpy as np
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC 

def run(x_train, y_train, x_test, y_test, clf):
    """Fit ``clf`` on the training split and print its test-set score.

    The score returned by ``clf.score`` is printed to four decimals,
    followed by a blank line.  Nothing is returned.
    """
    clf.fit(x_train, y_train)
    accuracy = clf.score(x_test, y_test)
    print("    score = {:0.4f}".format(accuracy))
    print()

def main():
    """Score several classifiers on a fixed breast-cancer train/test split.

    The first 455 samples of the stored arrays are used for training and the
    remainder for testing; each model is fitted and scored through ``run``.
    """
    x = np.load("./data/breast_cancer/bc_features_normalized.npy")
    y = np.load("./data/breast_cancer/bc_labels.npy")
    N = 455  # samples before this index train, the rest test
    x_train, x_test = x[:N], x[N:]
    y_train, y_test = y[:N], y[N:]

    experiments = (
        ("Nearest centroid:", NearestCentroid()),
        ("k-NN classifier (k=3):", KNeighborsClassifier(n_neighbors=3)),
        ("k-NN classifier (k=7):", KNeighborsClassifier(n_neighbors=7)),
        ("Naive Bayes classifier (Gaussian):", GaussianNB()),
        ("Decision Tree classifier:", DecisionTreeClassifier()),
        ("Random Forest classifier (estimators=5):", RandomForestClassifier(n_estimators=5)),
        ("Random Forest classifier (estimators=50):", RandomForestClassifier(n_estimators=50)),
        ("SVM (linear, C=1.0):", SVC(kernel="linear", C=1.0)),
        ("SVM (RBF, C=1.0, gamma=0.03333):", SVC(kernel="rbf", C=1.0, gamma=0.03333)),
    )
    for title, model in experiments:
        print(title)
        run(x_train, y_train, x_test, y_test, model)

main()


Nearest centroid:
    score = 0.9386

k-NN classifier (k=3):
    score = 0.9474

k-NN classifier (k=7):
    score = 0.9737

Naive Bayes classifier (Gaussian):
    score = 0.9561

Decision Tree classifier:
    score = 0.9035

Random Forest classifier (estimators=5):
    score = 0.9298

Random Forest classifier (estimators=50):
    score = 0.9561

SVM (linear, C=1.0):
    score = 0.9825

SVM (RBF, C=1.0, gamma=0.03333):
    score = 0.9825




Running the code multiple times, we observe that nearest centroid, k-NN, Naive Bayes, and SVM keep the same score, i.e., it is constant for a fixed dataset. However, the score changes for the decision tree and the random forest. That is because sklearn uses a pseudorandom sequence to build decision tree and random forest models. We can set the pseudorandom number seed to get the same random sequence on every run, which fixes this issue. 

In [92]:
import numpy as np
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC 

def run(x_train, y_train, x_test, y_test, clf):
    """Fit ``clf`` on the training split and print its test-set score.

    The score returned by ``clf.score`` is printed to four decimals,
    followed by a blank line.  Nothing is returned.
    """
    clf.fit(x_train, y_train)
    accuracy = clf.score(x_test, y_test)
    print("    score = {:0.4f}".format(accuracy))
    print()

def main():
    """Score several classifiers on the fixed split with NumPy's global RNG seeded.

    Same fixed 455-sample training split as before; the only addition is the
    call to ``np.random.seed(12345)`` ahead of model fitting, made so that the
    decision tree and random forest scores repeat from run to run.
    """
    x = np.load("./data/breast_cancer/bc_features_normalized.npy")
    y = np.load("./data/breast_cancer/bc_labels.npy")

    # Pin the global pseudorandom sequence; per the notebook's experiment this
    # is what makes the tree-based models reproducible between runs.
    np.random.seed(12345)
    N = 455  # samples before this index train, the rest test
    x_train, x_test = x[:N], x[N:]
    y_train, y_test = y[:N], y[N:]

    experiments = (
        ("Nearest centroid:", NearestCentroid()),
        ("k-NN classifier (k=3):", KNeighborsClassifier(n_neighbors=3)),
        ("k-NN classifier (k=7):", KNeighborsClassifier(n_neighbors=7)),
        ("Naive Bayes classifier (Gaussian):", GaussianNB()),
        ("Decision Tree classifier:", DecisionTreeClassifier()),
        ("Random Forest classifier (estimators=5):", RandomForestClassifier(n_estimators=5)),
        ("Random Forest classifier (estimators=50):", RandomForestClassifier(n_estimators=50)),
        ("SVM (linear, C=1.0):", SVC(kernel="linear", C=1.0)),
        ("SVM (RBF, C=1.0, gamma=0.03333):", SVC(kernel="rbf", C=1.0, gamma=0.03333)),
    )
    for title, model in experiments:
        print(title)
        run(x_train, y_train, x_test, y_test, model)

main()


Nearest centroid:
    score = 0.9386

k-NN classifier (k=3):
    score = 0.9474

k-NN classifier (k=7):
    score = 0.9737

Naive Bayes classifier (Gaussian):
    score = 0.9561

Decision Tree classifier:
    score = 0.9298

Random Forest classifier (estimators=5):
    score = 0.9474

Random Forest classifier (estimators=50):
    score = 0.9561

SVM (linear, C=1.0):
    score = 0.9825

SVM (RBF, C=1.0, gamma=0.03333):
    score = 0.9825



With the random seed fixed, the score doesn't change between multiple runs for any of the models. Great! Also observe that the deterministic models (nearest centroid, k-NN, Naive Bayes, SVM) score exactly the same as in the previous run, i.e., the same scores for a fixed dataset. 

What happens if we change the dataset? What if we want to randomly select which samples go into the training and test sets?

```python
idx = np.argsort(np.random.random(y.shape[0]))
x = x[idx]
y = y[idx]
```
By adding the lines above, we shuffle the samples so that the train/test split is selected at random. 

In [95]:
import numpy as np
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC 

def run(x_train, y_train, x_test, y_test, clf):
    """Fit ``clf`` on the training split and print its test-set score.

    The score returned by ``clf.score`` is printed to four decimals,
    followed by a blank line.  Nothing is returned.
    """
    clf.fit(x_train, y_train)
    accuracy = clf.score(x_test, y_test)
    print("    score = {:0.4f}".format(accuracy))
    print()

def main():
    """Score several classifiers on a seeded random train/test split.

    After seeding NumPy's global RNG with 12345 the samples are shuffled
    (features and labels with the same permutation), the first 455 shuffled
    samples train each model and the remainder test it.
    """
    x = np.load("./data/breast_cancer/bc_features_normalized.npy")
    y = np.load("./data/breast_cancer/bc_labels.npy")

    # The seed fixes the permutation below and, because it is never reset,
    # also the random sequence consumed later while fitting the models.
    np.random.seed(12345)
    order = np.argsort(np.random.random(y.shape[0]))
    x, y = x[order], y[order]
    N = 455  # samples before this index train, the rest test
    x_train, x_test = x[:N], x[N:]
    y_train, y_test = y[:N], y[N:]

    experiments = (
        ("Nearest centroid:", NearestCentroid()),
        ("k-NN classifier (k=3):", KNeighborsClassifier(n_neighbors=3)),
        ("k-NN classifier (k=7):", KNeighborsClassifier(n_neighbors=7)),
        ("Naive Bayes classifier (Gaussian):", GaussianNB()),
        ("Decision Tree classifier:", DecisionTreeClassifier()),
        ("Random Forest classifier (estimators=5):", RandomForestClassifier(n_estimators=5)),
        ("Random Forest classifier (estimators=50):", RandomForestClassifier(n_estimators=50)),
        ("SVM (linear, C=1.0):", SVC(kernel="linear", C=1.0)),
        ("SVM (RBF, C=1.0, gamma=0.03333):", SVC(kernel="rbf", C=1.0, gamma=0.03333)),
    )
    for title, model in experiments:
        print(title)
        run(x_train, y_train, x_test, y_test, model)

main()


Nearest centroid:
    score = 0.9123

k-NN classifier (k=3):
    score = 0.9825

k-NN classifier (k=7):
    score = 0.9825

Naive Bayes classifier (Gaussian):
    score = 0.9035

Decision Tree classifier:
    score = 0.9298

Random Forest classifier (estimators=5):
    score = 0.9561

Random Forest classifier (estimators=50):
    score = 0.9386

SVM (linear, C=1.0):
    score = 0.9825

SVM (RBF, C=1.0, gamma=0.03333):
    score = 0.9737



The scores don't change between runs, but they are totally different from the first run. For example, the score for nearest centroid changed from 0.9386 to 0.9123 — it got worse! This is because we changed the train/test split. If we change the split again, the scores may improve or get even worse: for some selections of the data the score improves, and for others it gets worse. K-fold validation helps to remove this dependency on the particular split. 

However, adding `np.random.seed(12345)` fixed not only the sequence used to select the dataset but also the pseudorandom sequence that sklearn uses to generate the tree models. By calling `np.random.seed()` again with no argument after the shuffle, one can keep the data selection fixed while removing this constraint on the pseudorandom sequence that sklearn uses to generate the tree models.

In [None]:
# Unfinished cell: it only reloads the normalized features and the labels.
# The seeded shuffle announced by the comment at the bottom is not implemented
# here; see np.random.seed(12345) and the argsort lines in main() of the next cell.
x = np.load("./data/breast_cancer/bc_features_normalized.npy")
y = np.load("./data/breast_cancer/bc_labels.npy")

# Controlling random split --> random, but every time we get the same random idx




In [None]:
import numpy as np
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC 

def run(x_train, y_train, x_test, y_test, clf):
    """Fit ``clf`` on the training split and print its test-set score.

    The score returned by ``clf.score`` is printed to four decimals,
    followed by a blank line.  Nothing is returned.
    """
    clf.fit(x_train, y_train)
    accuracy = clf.score(x_test, y_test)
    print("    score = {:0.4f}".format(accuracy))
    print()

def main():
    """Score several classifiers on a repeatable split with an unseeded fit phase.

    The shuffle is performed under a fixed seed so the train/test split is the
    same on every run; the global RNG is then reseeded without an argument
    before any model is fitted.
    """
    x = np.load("./data/breast_cancer/bc_features_normalized.npy")
    y = np.load("./data/breast_cancer/bc_labels.npy")

    # Repeatable shuffle: the same permutation is produced on every run.
    np.random.seed(12345)
    order = np.argsort(np.random.random(y.shape[0]))
    x, y = x[order], y[order]

    N = 455  # samples before this index train, the rest test
    x_train, x_test = x[:N], x[N:]
    y_train, y_test = y[:N], y[N:]

    # Drop the fixed seed again so the tree-based models are free to vary
    # between runs while the data split stays put.
    np.random.seed()

    experiments = (
        ("Nearest centroid:", NearestCentroid()),
        ("k-NN classifier (k=3):", KNeighborsClassifier(n_neighbors=3)),
        ("k-NN classifier (k=7):", KNeighborsClassifier(n_neighbors=7)),
        ("Naive Bayes classifier (Gaussian):", GaussianNB()),
        ("Decision Tree classifier:", DecisionTreeClassifier()),
        ("Random Forest classifier (estimators=5):", RandomForestClassifier(n_estimators=5)),
        ("Random Forest classifier (estimators=50):", RandomForestClassifier(n_estimators=50)),
        ("SVM (linear, C=1.0):", SVC(kernel="linear", C=1.0)),
        ("SVM (RBF, C=1.0, gamma=0.03333):", SVC(kernel="rbf", C=1.0, gamma=0.03333)),
    )
    for title, model in experiments:
        print(title)
        run(x_train, y_train, x_test, y_test, model)

main()


### K-fold validation



In [118]:
import numpy as np
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC 

import sys


def run(x_train, y_train, x_test, y_test, clf):
    """Fit ``clf`` on the training split and return ``clf.score`` on the test split."""
    clf.fit(x_train, y_train)
    accuracy = clf.score(x_test, y_test)
    return accuracy

def split(x, y, k, m):
    """Return fold ``k`` of ``m`` as the test set and the other folds as training data.

    The samples are cut, in order, into ``m`` contiguous folds of
    ``y.shape[0] // m`` samples each; any samples left over at the end belong
    to no fold and are not returned.  Works for any number of feature
    columns (the original hard-coded 30).

    Parameters:
        x: array of samples, indexed along its first axis.
        y: array of labels with the same first-axis length as ``x``.
        k: index of the fold to hold out, ``0 <= k < m``.
        m: number of folds.

    Returns:
        ``[x_train, y_train, x_test, y_test]``.

    Raises:
        IndexError: if ``k`` is not a valid fold index.
        ZeroDivisionError: if ``m`` is zero.
    """
    fold_size = y.shape[0] // m
    if not 0 <= k < m:
        raise IndexError("fold index %d out of range for %d folds" % (k, m))
    start, stop = fold_size * k, fold_size * (k + 1)
    # Row indices of every fold except the held-out one, in original order.
    train_rows = np.concatenate((np.arange(0, start),
                                 np.arange(stop, fold_size * m)))
    return [x[train_rows], y[train_rows], x[start:stop], y[start:stop]]

def pp(z, k, s):
    """Print one summary line for row ``k`` of the score matrix ``z``.

    The line shows the label ``s``, the mean of the row, and the row's
    standard deviation divided by the square root of the number of columns.
    """
    scores = z[k]
    n_cols = z.shape[1]
    spread = scores.std() / np.sqrt(n_cols)
    print("%-19s: %0.4f +/- %0.4f | " % (s, scores.mean(), spread))

def main():
    """Compare eight classifiers with 5-fold cross-validation on shuffled data.

    Each model is fitted and scored once per fold; the per-model mean and
    spread over the folds are then printed through ``pp``.
    """
    x = np.load("./data/breast_cancer/bc_features_normalized.npy")
    y = np.load("./data/breast_cancer/bc_labels.npy")
    order = np.argsort(np.random.random(y.shape[0]))
    x, y = x[order], y[order]
    m = 5
    z = np.zeros((8, m))  # one row per model, one column per fold

    for k in range(m):
        fold = split(x, y, k, m)
        models = (
            NearestCentroid(),
            KNeighborsClassifier(n_neighbors=3),
            KNeighborsClassifier(n_neighbors=7),
            GaussianNB(),
            DecisionTreeClassifier(),
            RandomForestClassifier(n_estimators=5),
            RandomForestClassifier(n_estimators=50),
            SVC(kernel="linear", C=1.0),
        )
        for row, model in enumerate(models):
            z[row, k] = run(*fold, model)

    names = (
        "Nearest",
        "3-NN",
        "7-NN",
        "Naive Bayes",
        "Decision Tree",
        "Random Forest (5)",
        "Random Forest (50)",
        "SVM (linear)",
    )
    for row, name in enumerate(names):
        pp(z, row, name)

main()
    

Nearest            : 0.9274 +/- 0.0081 | 
3-NN               : 0.9664 +/- 0.0063 | 
7-NN               : 0.9681 +/- 0.0054 | 
Naive Bayes        : 0.9381 +/- 0.0035 | 
Decision Tree      : 0.9186 +/- 0.0081 | 
Random Forest (5)  : 0.9398 +/- 0.0088 | 
Random Forest (50) : 0.9628 +/- 0.0098 | 
SVM (linear)       : 0.9664 +/- 0.0068 | 


# Hyper Parameter Tuning

Let's fine-tune the k-NN classifier. We will vary $K = [1,3,5,7,9,11,13,15]$. $K = 11$ seems to be the best, but you need to run this 1000 times and look at the results again to be sure.

In [119]:
import numpy as np
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC 

import sys


def run(x_train, y_train, x_test, y_test, clf):
    """Fit ``clf`` on the training split and return ``clf.score`` on the test split."""
    clf.fit(x_train, y_train)
    accuracy = clf.score(x_test, y_test)
    return accuracy

def split(x, y, k, m):
    """Return fold ``k`` of ``m`` as the test set and the other folds as training data.

    The samples are cut, in order, into ``m`` contiguous folds of
    ``y.shape[0] // m`` samples each; any samples left over at the end belong
    to no fold and are not returned.  Works for any number of feature
    columns (the original hard-coded 30).

    Parameters:
        x: array of samples, indexed along its first axis.
        y: array of labels with the same first-axis length as ``x``.
        k: index of the fold to hold out, ``0 <= k < m``.
        m: number of folds.

    Returns:
        ``[x_train, y_train, x_test, y_test]``.

    Raises:
        IndexError: if ``k`` is not a valid fold index.
        ZeroDivisionError: if ``m`` is zero.
    """
    fold_size = y.shape[0] // m
    if not 0 <= k < m:
        raise IndexError("fold index %d out of range for %d folds" % (k, m))
    start, stop = fold_size * k, fold_size * (k + 1)
    # Row indices of every fold except the held-out one, in original order.
    train_rows = np.concatenate((np.arange(0, start),
                                 np.arange(stop, fold_size * m)))
    return [x[train_rows], y[train_rows], x[start:stop], y[start:stop]]

def pp(z, k, s):
    """Print one summary line for row ``k`` of the score matrix ``z``.

    The line shows the label ``s``, the mean of the row, and the row's
    standard deviation divided by the square root of the number of columns.
    """
    scores = z[k]
    n_cols = z.shape[1]
    spread = scores.std() / np.sqrt(n_cols)
    print("%-19s: %0.4f +/- %0.4f | " % (s, scores.mean(), spread))

def main():
    """Sweep the k-NN neighbour count with 5-fold cross-validation.

    For every fold, a k-NN classifier is fitted and scored for each value in
    ``n_hp``; the mean and spread per value are printed through ``pp``.
    """
    x = np.load("./data/breast_cancer/bc_features_normalized.npy")
    y = np.load("./data/breast_cancer/bc_labels.npy")
    order = np.argsort(np.random.random(y.shape[0]))
    x, y = x[order], y[order]
    m = 5
    z = np.zeros((8, m))  # one row per neighbour count, one column per fold

    n_hp = [1, 3, 5, 7, 9, 11, 13, 15]
    for k in range(m):
        fold = split(x, y, k, m)
        for row, n_neighbors in enumerate(n_hp):
            z[row, k] = run(*fold, KNeighborsClassifier(n_neighbors=n_neighbors))

    names = ("1-NN", "3-NN", "5-NN", "7-NN", "9-NN", "11-NN", "13-NN", "15-NN")
    for row, name in enumerate(names):
        pp(z, row, name)

main()
    

1-NN               : 0.9540 +/- 0.0046 | 
3-NN               : 0.9628 +/- 0.0039 | 
5-NN               : 0.9628 +/- 0.0039 | 
7-NN               : 0.9664 +/- 0.0039 | 
9-NN               : 0.9664 +/- 0.0039 | 
11-NN              : 0.9699 +/- 0.0047 | 
13-NN              : 0.9664 +/- 0.0030 | 
15-NN              : 0.9611 +/- 0.0040 | 


### Random forest
Let's vary the number of trees, $n_t = [5,20,50,100,200,500,1000,5000]$. A forest of 100 trees seems to be good. Run it 1000 times and see.

In [120]:
import numpy as np
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC 

import sys


def run(x_train, y_train, x_test, y_test, clf):
    """Fit ``clf`` on the training split and return ``clf.score`` on the test split."""
    clf.fit(x_train, y_train)
    accuracy = clf.score(x_test, y_test)
    return accuracy

def split(x, y, k, m):
    """Return fold ``k`` of ``m`` as the test set and the other folds as training data.

    The samples are cut, in order, into ``m`` contiguous folds of
    ``y.shape[0] // m`` samples each; any samples left over at the end belong
    to no fold and are not returned.  Works for any number of feature
    columns (the original hard-coded 30).

    Parameters:
        x: array of samples, indexed along its first axis.
        y: array of labels with the same first-axis length as ``x``.
        k: index of the fold to hold out, ``0 <= k < m``.
        m: number of folds.

    Returns:
        ``[x_train, y_train, x_test, y_test]``.

    Raises:
        IndexError: if ``k`` is not a valid fold index.
        ZeroDivisionError: if ``m`` is zero.
    """
    fold_size = y.shape[0] // m
    if not 0 <= k < m:
        raise IndexError("fold index %d out of range for %d folds" % (k, m))
    start, stop = fold_size * k, fold_size * (k + 1)
    # Row indices of every fold except the held-out one, in original order.
    train_rows = np.concatenate((np.arange(0, start),
                                 np.arange(stop, fold_size * m)))
    return [x[train_rows], y[train_rows], x[start:stop], y[start:stop]]

def pp(z, k, s):
    """Print one summary line for row ``k`` of the score matrix ``z``.

    The line shows the label ``s``, the mean of the row, and the row's
    standard deviation divided by the square root of the number of columns.
    """
    scores = z[k]
    n_cols = z.shape[1]
    spread = scores.std() / np.sqrt(n_cols)
    print("%-19s: %0.4f +/- %0.4f | " % (s, scores.mean(), spread))

def main():
    """Sweep the random-forest tree count with 5-fold cross-validation.

    For every fold, a random forest is fitted and scored for each tree count
    in ``n_hp``; the mean and spread per tree count are printed through
    ``pp``.  The score matrix size and the printed labels are derived from
    ``n_hp`` so they cannot drift apart from the values actually tested
    (the hand-written label for 200 trees carried a stray ")").
    """
    x = np.load("./data/breast_cancer/bc_features_normalized.npy")
    y = np.load("./data/breast_cancer/bc_labels.npy")
    idx = np.argsort(np.random.random(y.shape[0]))
    x = x[idx]
    y = y[idx]
    m = 5

    n_hp = [5, 20, 50, 100, 200, 500, 1000, 5000]
    z = np.zeros((len(n_hp), m))  # one row per tree count, one column per fold
    for k in range(m):
        x_train, y_train, x_test, y_test = split(x, y, k, m)
        for n, n_trees in enumerate(n_hp):
            z[n, k] = run(x_train, y_train, x_test, y_test,
                          RandomForestClassifier(n_estimators=n_trees))

    for n, n_trees in enumerate(n_hp):
        pp(z, n, "Random Forest (%d)" % n_trees)

main()
    

Random Forest (5)  : 0.9504 +/- 0.0178 | 
Random Forest (20) : 0.9611 +/- 0.0129 | 
Random Forest (50) : 0.9593 +/- 0.0102 | 
Random Forest (100): 0.9628 +/- 0.0116 | 
Random Forest (200)): 0.9558 +/- 0.0115 | 
Random Forest (500): 0.9611 +/- 0.0114 | 
Random Forest (1000): 0.9575 +/- 0.0129 | 
Random Forest (5000): 0.9593 +/- 0.0116 | 


### SVMs

Single hyperparameter search over the fudge factor $C = [0.001, 0.01, 0.1, 1.0, 2.0, 10.0, 50.0, 100.0]$. $C = 10$ seems to be good. Run it 1000 times and see.

In [121]:
import numpy as np
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC 

import sys


def run(x_train, y_train, x_test, y_test, clf):
    """Fit ``clf`` on the training split and return ``clf.score`` on the test split."""
    clf.fit(x_train, y_train)
    accuracy = clf.score(x_test, y_test)
    return accuracy

def split(x, y, k, m):
    """Return fold ``k`` of ``m`` as the test set and the other folds as training data.

    The samples are cut, in order, into ``m`` contiguous folds of
    ``y.shape[0] // m`` samples each; any samples left over at the end belong
    to no fold and are not returned.  Works for any number of feature
    columns (the original hard-coded 30).

    Parameters:
        x: array of samples, indexed along its first axis.
        y: array of labels with the same first-axis length as ``x``.
        k: index of the fold to hold out, ``0 <= k < m``.
        m: number of folds.

    Returns:
        ``[x_train, y_train, x_test, y_test]``.

    Raises:
        IndexError: if ``k`` is not a valid fold index.
        ZeroDivisionError: if ``m`` is zero.
    """
    fold_size = y.shape[0] // m
    if not 0 <= k < m:
        raise IndexError("fold index %d out of range for %d folds" % (k, m))
    start, stop = fold_size * k, fold_size * (k + 1)
    # Row indices of every fold except the held-out one, in original order.
    train_rows = np.concatenate((np.arange(0, start),
                                 np.arange(stop, fold_size * m)))
    return [x[train_rows], y[train_rows], x[start:stop], y[start:stop]]

def pp(z, k, s):
    """Print one summary line for row ``k`` of the score matrix ``z``.

    The line shows the label ``s``, the mean of the row, and the row's
    standard deviation divided by the square root of the number of columns.
    """
    scores = z[k]
    n_cols = z.shape[1]
    spread = scores.std() / np.sqrt(n_cols)
    print("%-19s: %0.4f +/- %0.4f | " % (s, scores.mean(), spread))

def main():
    """Sweep the linear-SVM ``C`` parameter with 5-fold cross-validation.

    For every fold, a linear SVM is fitted and scored for each value in
    ``n_hp``; the mean and spread per value are printed through ``pp``.
    """
    x = np.load("./data/breast_cancer/bc_features_normalized.npy")
    y = np.load("./data/breast_cancer/bc_labels.npy")
    order = np.argsort(np.random.random(y.shape[0]))
    x, y = x[order], y[order]
    m = 5
    z = np.zeros((8, m))  # one row per C value, one column per fold

    n_hp = [0.001, 0.01, 0.1, 1.0, 2.0, 10.0, 50.0, 100.0]
    for k in range(m):
        fold = split(x, y, k, m)
        for row, c_value in enumerate(n_hp):
            z[row, k] = run(*fold, SVC(kernel="linear", C=c_value))

    names = (
        "SVM (C = 0.001)",
        "SVM (C = 0.01)",
        "SVM (C = 0.1)",
        "SVM (C = 1)",
        "SVM (C = 2)",
        "SVM (C = 10)",
        "SVM (C = 50)",
        "SVM (C = 100)",
    )
    for row, name in enumerate(names):
        pp(z, row, name)

main()
    

SVM (C = 0.001)    : 0.9381 +/- 0.0043 | 
SVM (C = 0.01)     : 0.9664 +/- 0.0046 | 
SVM (C = 0.1)      : 0.9717 +/- 0.0039 | 
SVM (C = 1)        : 0.9646 +/- 0.0025 | 
SVM (C = 2)        : 0.9646 +/- 0.0000 | 
SVM (C = 10)       : 0.9735 +/- 0.0056 | 
SVM (C = 50)       : 0.9593 +/- 0.0081 | 
SVM (C = 100)      : 0.9628 +/- 0.0085 | 


SVM, dual parameter search: fudge factor $C = [0.01, 0.1, 1.0, 2.0, 10.0, 50.0, 100.0]$ and RBF kernel parameter $\gamma = \frac{1}{30} \cdot 2^{p}$ for $p \in [-4,-3,-2,-1,0,1,2,3]$, searched as a grid (a 2D array of $C$ and $\gamma$ values). We could also run an optimizer over both values, since they are continuous rather than discrete. $C = 10$ seems to be good. Run it 1000 times and see.

In [122]:
import numpy as np
from sklearn.svm import SVC 

def run(x_train, y_train, x_test, y_test, clf):
    """Fit ``clf`` on the training split and return ``clf.score`` on the test split."""
    clf.fit(x_train, y_train)
    accuracy = clf.score(x_test, y_test)
    return accuracy

def split(x,y,k,m):
    """Return fold ``k`` of ``m`` as the test set and the other folds as training data.

    The samples are cut, in order, into ``m`` contiguous folds of
    ``y.shape[0] // m`` samples each; any samples left over at the end belong
    to no fold and are not returned.  Works for any number of feature
    columns (the original hard-coded 30).

    Parameters:
        x: array of samples, indexed along its first axis.
        y: array of labels with the same first-axis length as ``x``.
        k: index of the fold to hold out, ``0 <= k < m``.
        m: number of folds.

    Returns:
        ``[x_train, y_train, x_test, y_test]``.

    Raises:
        IndexError: if ``k`` is not a valid fold index.
        ZeroDivisionError: if ``m`` is zero.
    """
    fold_size = y.shape[0] // m
    if not 0 <= k < m:
        raise IndexError("fold index %d out of range for %d folds" % (k, m))
    start, stop = fold_size * k, fold_size * (k + 1)
    # Row indices of every fold except the held-out one, in original order.
    train_rows = np.concatenate((np.arange(0, start),
                                 np.arange(stop, fold_size * m)))
    return [x[train_rows], y[train_rows], x[start:stop], y[start:stop]]

def main():
    """Grid-search ``C`` and ``gamma`` for an RBF SVM using 5-fold cross-validation.

    Every (C, gamma) pair is scored on the same five folds of the shuffled
    data; the pair with the highest mean score (the first one found on ties)
    is printed together with that score.
    """
    m = 5
    x = np.load("./data/breast_cancer/bc_features_normalized.npy")
    y = np.load("./data/breast_cancer/bc_labels.npy")
    idx = np.argsort(np.random.random(y.shape[0]))
    x = x[idx]
    y = y[idx]

    # The folds depend only on the shuffled data, not on C or gamma, so they
    # are built once here instead of once per grid point.
    folds = [split(x, y, k, m) for k in range(m)]

    Cs = np.array([0.01,0.1,1.0,2.0,10.0,50.0,100.0])
    gs = (1./30)*2.0**np.array([-4,-3,-2,-1,0,1,2,3])
    # Start below any possible score so the first grid point always becomes
    # the initial best; bestC/bestg are therefore always bound when printed.
    zmax = -np.inf
    bestC = bestg = None
    for C in Cs:
        for g in gs:
            z = np.zeros(m)
            for k, (x_train, y_train, x_test, y_test) in enumerate(folds):
                z[k] = run(x_train, y_train, x_test, y_test, SVC(C=C,gamma=g,kernel="rbf"))
            if (z.mean() > zmax):
                zmax = z.mean()
                bestC = C
                bestg = g
    print("best C     = %0.5f" % bestC)
    print("     gamma = %0.5f" % bestg)
    print("   accuracy= %0.5f" % zmax)

main()

best C     = 10.00000
     gamma = 0.00208
   accuracy= 0.98053


# MNIST