In [1]:
import sklearn as sk
import numpy as np
import pandas as pd

In [2]:
from sklearn import svm

In [3]:
from sklearn.metrics import confusion_matrix

In [4]:
import os

# Single place to repoint the notebook at a different copy of the data.
DATA_DIR = r'D:\Learning\Cancer_Project\Data\Final X And Y Matrices'

# X holds the feature matrix and Y the labels used by every model below.
X = np.load(os.path.join(DATA_DIR, 'final_matrix.npy'))
Y = np.load(os.path.join(DATA_DIR, 'sample_images_Y.npy'))

In [5]:
print("X matrix with shape {} has been loaded".format(X.shape))

X matrix with shape (3408, 4096) has been loaded


In [6]:
print("Y matrix with shape {} has been loaded".format(Y.shape))

Y matrix with shape (3408,) has been loaded


In [7]:
# Cast both arrays to the default integer type. This rebinds X and Y, and the
# dtype checks in the next two cells report int32 for both.
X = X.astype(int)
Y = Y.astype(int)

In [8]:
X.dtype

dtype('int32')

In [9]:
Y.dtype

dtype('int32')

In [10]:
def get_train_test_data(np_matrix, test_size=0.2, seed=2000):
    """Split an array row-wise into a training part and a test part.

    The split is seeded, so two arrays with the same number of rows (for
    example a feature matrix and its label vector) that are split by separate
    calls with the same ``test_size`` and ``seed`` get the same row partition
    and therefore stay aligned.

    Parameters
    ----------
    np_matrix : array-like
        Data to split along its first axis. A 1-D input is treated as a
        single column, so both returned arrays are 2-D in that case.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    seed : int, default 2000
        Forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train, test)``.
    """
    from sklearn.model_selection import train_test_split

    data = np.asarray(np_matrix)
    if data.ndim == 1:
        # Callers rely on the (n_rows, 1) layout that a 1-D input used to get
        # from being wrapped in a DataFrame, so keep producing a column.
        data = data.reshape(-1, 1)
    train, test = train_test_split(data, test_size=test_size, random_state=seed)
    return train, test

In [11]:
train_x,test_x = get_train_test_data(X)

In [12]:
train_x.shape

(2726, 4096)

In [13]:
test_x.shape

(682, 4096)

In [14]:
print('Train and test data created for X matrix created with dimensions {} and {} respectively'.format(train_x.shape,test_x.shape))

Train and test data created for X matrix created with dimensions (2726, 4096) and (682, 4096) respectively


In [15]:
train_y,test_y = get_train_test_data(Y)

In [16]:
train_y.shape

(2726, 1)

In [17]:
test_y.shape

(682, 1)

In [18]:
print('Train and test data created for Y matrix created with dimensions {} and {} respectively'.format(train_y.shape,test_y.shape))

Train and test data created for Y matrix created with dimensions (2726, 1) and (682, 1) respectively


In [19]:
def reshape_label_matrix(train_y):
    """Flatten a two-dimensional label array into a one-dimensional vector.

    ``train_y`` must be 2-D. The result has one entry per row, so the reshape
    only succeeds when the array holds exactly one value per row (a single
    column).
    """
    n_rows, _ = train_y.shape
    return train_y.reshape((n_rows,))

In [20]:
train_y = reshape_label_matrix(train_y)

In [21]:
train_y.shape

(2726,)

In [22]:
train_y.dtype

dtype('int32')

### Run a simple rbf svm classifier

In [53]:
# SVC with the RBF kernel and otherwise default settings.
# fit() returns the fitted estimator, so construction and training are chained.
clf = svm.SVC(kernel='rbf').fit(train_x, train_y)
pred = clf.predict(test_x)
# Tabulate held-out labels against the predictions.
cm = confusion_matrix(test_y, pred)
print('Confusion matrix is \n {}'.format(cm))

Confusion matrix is 
 [[447   5]
 [176  54]]


### Run a linear svm classifier

In [54]:
# SVC with the linear kernel and otherwise default settings.
# fit() returns the fitted estimator, so construction and training are chained.
clf = svm.SVC(kernel='linear').fit(train_x, train_y)
pred = clf.predict(test_x)
# Tabulate held-out labels against the predictions.
cm = confusion_matrix(test_y, pred)
print('Confusion matrix is \n {}'.format(cm))

Confusion matrix is 
 [[404  48]
 [ 89 141]]


### Run a poly svm classifier

In [None]:
# SVC with the polynomial kernel and otherwise default settings.
# fit() returns the fitted estimator, so construction and training are chained.
clf = svm.SVC(kernel='poly').fit(train_x, train_y)
pred = clf.predict(test_x)
# Tabulate held-out labels against the predictions.
cm = confusion_matrix(test_y, pred)
print('Confusion matrix is \n {}'.format(cm))

### Run a sigmoid svm classifier

In [24]:
# SVC with the sigmoid kernel and otherwise default settings.
# fit() returns the fitted estimator, so construction and training are chained.
clf = svm.SVC(kernel='sigmoid').fit(train_x, train_y)
pred = clf.predict(test_x)
# Tabulate held-out labels against the predictions.
cm = confusion_matrix(test_y, pred)
print('Confusion matrix is \n {}'.format(cm))

Confusion matrix is 
 [[452   0]
 [230   0]]


### Run an SVM classifier with a precomputed kernel

In [26]:
train_x.shape

(2726, 4096)

In [27]:
train_y.shape

(2726,)

In [None]:
### Sample code showing how an SVM with a precomputed kernel works

In [45]:
from sklearn.datasets import load_digits
from sklearn.utils import shuffle
from sklearn.svm import SVC

In [36]:
digits = load_digits()
# Seed the shuffle so the train/test partition below is the same on every run.
a, b = shuffle(digits.data, digits.target, random_state=0)
# The first n_train shuffled samples train the model; the rest are held out.
n_train = 1000
a_train, a_test = a[:n_train, :], a[n_train:, :]
b_train, b_test = b[:n_train], b[n_train:]

In [37]:
a_train.shape

(1000, 64)

In [38]:
a_train.T.shape

(64, 1000)

In [39]:
# Linear kernel: pairwise dot products between all training rows.
kernel_train = a_train.dot(a_train.T)
# The estimator is told it will receive kernel values instead of raw features.
svc = SVC(kernel='precomputed')

In [41]:
svc.fit(kernel_train, b_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto',
  kernel='precomputed', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [49]:
# Kernel between each test row and every training row, using the same linear
# kernel that was used to build the training matrix.
kernel_test = a_test.dot(a_train.T)
b_pred = svc.predict(kernel_test)
cm = confusion_matrix(b_test, b_pred)
print('Confusion matrix is \n {}'.format(cm))

Confusion matrix is 
 [[78  0  0  0  0  0  0  0  0  0]
 [ 0 80  0  0  0  0  0  0  2  0]
 [ 0  0 74  0  0  0  0  0  0  0]
 [ 0  0  1 87  0  1  0  1  0  1]
 [ 0  0  0  0 82  0  0  0  0  3]
 [ 0  0  0  1  0 73  0  0  0  0]
 [ 0  1  0  0  0  0 76  0  0  0]
 [ 0  0  0  0  0  0  0 81  0  1]
 [ 0  0  0  2  0  0  0  0 71  1]
 [ 0  0  0  1  0  3  0  0  3 73]]


In [51]:
##https://github.com/jeff1evesque/machine-learning/issues/2562

## Create a function to run all kinds of kernels

In [129]:
kernels = ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']

In [130]:
kernels

['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']

In [None]:
def run_svm():
    """Fit one ``svm.SVC`` per kernel type and print each confusion matrix.

    Reads the notebook-level ``train_x``, ``train_y``, ``test_x`` and
    ``test_y`` arrays. For the ``'precomputed'`` kernel the estimator expects
    kernel (Gram) matrices rather than raw features, so a linear kernel is
    computed explicitly for that case, as in the digits sample earlier in
    this notebook.
    """
    # Declare a list of all possible kernels in svm.SVC
    kernels = ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']
    for kernel in kernels:
        print('Running {} kernel'.format(kernel))
        if kernel == 'precomputed':
            # fit() needs an (n_train, n_train) matrix and predict() an
            # (n_test, n_train) matrix; the raw feature matrix is rejected.
            # Cast to float first so the dot products cannot overflow an
            # integer dtype.
            train_feats = np.asarray(train_x, dtype=np.float64)
            test_feats = np.asarray(test_x, dtype=np.float64)
            fit_input = np.dot(train_feats, train_feats.T)
            predict_input = np.dot(test_feats, train_feats.T)
        else:
            fit_input, predict_input = train_x, test_x
        clf = svm.SVC(kernel=kernel)
        clf.fit(fit_input, train_y)
        pred = clf.predict(predict_input)
        cm = confusion_matrix(test_y, pred)
        print('Confusion matrix for {} is \n {}'.format(kernel,cm))
    
def main():
    """Entry point: run the SVM kernel comparison."""
    run_svm()


if __name__ == "__main__":
    main()

## Function to run a k-Nearest Neighbors model

In [49]:
from sklearn import neighbors
from sklearn.model_selection import cross_val_score
knn=neighbors.KNeighborsClassifier()

In [50]:
def run_knn():
    """Fit the notebook-level ``knn`` classifier and print its confusion matrix.

    Trains on ``train_x`` / ``train_y``, predicts ``test_x`` and compares the
    predictions with ``test_y``. The shared ``knn`` object is fitted in place.
    """
    # fit() returns the estimator itself, so the prediction can be chained.
    predictions = knn.fit(train_x, train_y).predict(test_x)
    matrix = confusion_matrix(test_y, predictions)
    print('Confusion matrix is \n {}'.format(matrix))

In [93]:
# If cross-validation fails with a "too many indices for array" error, check the shape of the label array (it may need to be 1-D); see this solution:
#https://stackoverflow.com/questions/31995175/scikit-learn-cross-val-score-too-many-indices-for-array

In [52]:
run_knn()

Confusion matrix is 
 [[414  38]
 [ 96 134]]


### Cross Validation of SVM Models

In [123]:
def svm_cv(x=None, y=None, cv=10):
    """Cross-validate a default ``svm.SVC`` and return the per-fold scores.

    Parameters
    ----------
    x, y : array-like, optional
        Features and labels to cross-validate on. When omitted, the
        notebook-level ``test_x`` and ``test_y`` are used, which is what this
        function always did before these parameters existed.
    cv : int, default 10
        Forwarded to ``cross_val_score``.

    Returns
    -------
    numpy.ndarray
        The scores from ``cross_val_score``. They are also printed, so
        existing call sites that ignore the return value look the same.
    """
    if x is None:
        x = test_x
    if y is None:
        y = test_y
    # The notebook keeps test labels as an (n, 1) column; flatten them so the
    # estimator receives a 1-D label vector.
    labels = np.ravel(y)
    clf = svm.SVC()
    scores = cross_val_score(clf, x, labels, cv=cv)
    print(scores)
    return scores

In [124]:
type(scores)

NoneType

In [130]:
# Time one run of svm_cv(), logging wall-clock timestamps around it.
import timeit
from datetime import datetime 

# The timer reading is used for the duration; the datetime is for the log line.
start = timeit.default_timer()
start_time = datetime.now()
print('Execution started at {}'.format(start_time))

svm_cv()

stop = timeit.default_timer()
stop_time = datetime.now()
print('Execution ended at {}'.format(stop_time))
# Difference of the two default_timer() readings, i.e. elapsed seconds.
print('Total Execution time : {}'.format(stop - start))

Execution started at 2017-05-30 11:20:11.328251
[ 0.72463768  0.62318841  0.73529412  0.70588235  0.77941176  0.72058824
  0.73529412  0.73529412  0.73529412  0.73529412]
Execution ended at 2017-05-30 11:20:44.529439
Total Execution time : 33.20130227605705


### Random Forest

In [39]:
train_x.shape

(2726, 4096)

In [40]:
train_y.shape

(2726,)

In [41]:
test_x.shape

(682, 4096)

In [42]:
test_y.shape

(682, 1)

In [47]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so repeated runs build the same trees and give the same
# confusion matrix; 2000 is the seed already used for the train/test split.
clf = RandomForestClassifier(n_estimators=500, random_state=2000)
clf = clf.fit(train_x, train_y)

In [48]:
# Evaluate whichever classifier is currently bound to `clf` on the held-out
# split and print the confusion matrix of true labels against predictions.
pred = clf.predict(test_x)
cm = confusion_matrix(test_y, pred)
print('Confusion matrix is \n {}'.format(cm))

Confusion matrix is 
 [[445   7]
 [ 67 163]]


In [31]:
train_x.shape

(2726, 4096)

In [32]:
train_y.shape

(2726,)

In [47]:
n_estimators = 30
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Boost depth-3 decision trees. The number of boosting rounds comes from the
# variable above instead of a repeated literal, and the ensemble is seeded so
# reruns give the same model (2000 is the seed used for the train/test split).
# The base estimator stays positional, as in the original call.
clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=3),
    n_estimators=n_estimators,
    random_state=2000,
)
clf = clf.fit(train_x, train_y)

In [48]:
# Evaluate whichever classifier is currently bound to `clf` on the held-out
# split and print the confusion matrix of true labels against predictions.
pred = clf.predict(test_x)
cm = confusion_matrix(test_y, pred)
print('Confusion matrix is \n {}'.format(cm))

Confusion matrix is 
 [[413  39]
 [ 69 161]]


## Decision Trees

In [23]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Imported here so the cell does not depend on an earlier cell having run.
from sklearn.tree import DecisionTreeClassifier

# `n_estimators` and `algorithm` are ensemble options, not decision-tree
# options; DecisionTreeClassifier does not accept them, so they are dropped.
clf = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0)
# The label array loaded at the top of the notebook is `Y`; a lowercase `y`
# is not defined anywhere in the visible notebook.
scores = cross_val_score(clf, X, Y)
scores.mean()