In [3]:
# Required Imports
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split


# ML Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [21]:
# Load the data and split into features and targets, X and y
cols = np.arange(2,32)
X = np.loadtxt("./data/wdbc.data",usecols=cols,delimiter=',') 
y = np.loadtxt("./data/wdbc.data",dtype=str,usecols=1,delimiter=',')

# Assign 'M' (Malignant) a value 1, and 'B' (Benign) a value 0
for i in range(len(y)):
    if y[i] == 'M':
        y[i] = 1
    else: 
        y[i] = 0

In [25]:
# Split the data
X_trn, X_tst, y_trn, y_tst = sklearn.model_selection.train_test_split(X, y, test_size=0.3, random_state=0)

In [26]:
# Scale the data
scaler = sklearn.preprocessing.StandardScaler()
X_trn = scaler.fit_transform(X_trn)
X_tst = scaler.fit_transform(X_tst)

## Logistic Regression (for classification)

In [27]:
# Test a few different hyperparameters for Logistic Regression: C, solver
c_vals_lr = [0.0001,0.001,0.01,0.1,1.0,10.0]
s_vals_lr = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

In [28]:
# Train different logistic regression models, using grid search and cross validation to find best hyperparameters.
lr = sklearn.linear_model.LogisticRegression(max_iter=10000,random_state=0)
param_grid={'C': c_vals_lr, 'solver' : s_vals_lr}
gs_lr = sklearn.model_selection.GridSearchCV(lr,param_grid,verbose=1,cv=3).fit(X_trn,y_trn) 
print(gs_lr.best_estimator_)
print(f'{gs_lr.best_estimator_.score(X_trn,y_trn)} train accuracy') 
print(f'{gs_lr.best_estimator_.score(X_tst,y_tst)} test accuracy')

Fitting 3 folds for each of 30 candidates, totalling 90 fits
LogisticRegression(max_iter=10000, random_state=0, solver='newton-cg')
0.9899497487437185 train accuracy
0.9649122807017544 test accuracy


## Support Vector Classification

In [29]:
# Test a few different hyperparameters for SVM: C, gamma
c_vals_svm = [0.01,0.1,1.0,10.0,100.0,1000.0]
g_vals_svm = [0.001,0.01,0.1,1.0,10.0]

In [31]:
# Train different svm models, using grid search and cross validation to find best hyperparameters.
svm = sklearn.svm.SVC(kernel='rbf',random_state=0)
param_grid={'C': c_vals_svm, 'gamma' : g_vals_svm}
gs_svm = sklearn.model_selection.GridSearchCV(svm,param_grid,verbose=1,cv=3).fit(X_trn,y_trn)
print(gs_svm.best_estimator_)
print(f'{gs_svm.best_estimator_.score(X_trn,y_trn)} train accuracy') 
print(f'{gs_svm.best_estimator_.score(X_tst,y_tst)} test accuracy')

Fitting 3 folds for each of 30 candidates, totalling 90 fits
SVC(C=1000.0, gamma=0.001, random_state=0)
0.9899497487437185 train accuracy
0.935672514619883 test accuracy


## Decision Tree Classification

In [32]:
# Test a few different hyperparameters for Decision Tree: criterion, max_depth, min_samples_leaf
crit_vals_dt = ['gini', 'entropy']
dep_vals_dt = [5, 10, 25, 50, 100]
samp_leaf_dt = [5, 10, 25, 50, 100]

In [33]:
# Train different DecisionTree models, using grid search and cross validation to find best hyperparameters.
dt = sklearn.tree.DecisionTreeClassifier(random_state=0)
param_grid={'criterion': crit_vals_dt, 'max_depth' : dep_vals_dt, 'min_samples_leaf' : samp_leaf_dt}
gs_dt = sklearn.model_selection.GridSearchCV(dt,param_grid,verbose=1,cv=3).fit(X_trn,y_trn)
print(gs_dt.best_estimator_)
print(f'{gs_dt.best_estimator_.score(X_trn,y_trn)} train accuracy') 
print(f'{gs_dt.best_estimator_.score(X_tst,y_tst)} test accuracy')

Fitting 3 folds for each of 50 candidates, totalling 150 fits
DecisionTreeClassifier(max_depth=5, min_samples_leaf=5, random_state=0)
0.9623115577889447 train accuracy
0.9415204678362573 test accuracy


## Random Forest Classification

In [34]:
# Test a few different hyperparameters for Random Forest: max_depth, n_estimators
dep_vals_rf = [5, 10, 25, 50, 100]
est_vals_rf = [1,5,25,50,100,250,500]

In [35]:
# Train different RandomForest models, using grid search and cross validation to find best hyperparameters.
rf = sklearn.ensemble.RandomForestClassifier(random_state=0)
param_grid={'max_depth': dep_vals_rf, 'n_estimators' : est_vals_rf}
gs_rf = sklearn.model_selection.GridSearchCV(rf,param_grid,verbose=1,cv=3).fit(X_trn,y_trn)
print(gs_rf.best_estimator_)
print(f'{gs_rf.best_estimator_.score(X_trn,y_trn)} train accuracy') 
print(f'{gs_rf.best_estimator_.score(X_tst,y_tst)} test accuracy')

Fitting 3 folds for each of 35 candidates, totalling 105 fits
RandomForestClassifier(max_depth=10, n_estimators=250, random_state=0)
1.0 train accuracy
0.9707602339181286 test accuracy


## K-nearest Neighbours Classification

In [36]:
# Test a few different hyperparameters for K-nearest neighbours: n_neighbors, weights, algorithm
n_vals_k = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29]
w_vals_k = ['uniform', 'distance']
m_vals_k = ['euclidean', 'manhattan', 'minkowski']

In [37]:
# Train different K-neighbors models, using grid search and cross validation to find best hyperparameters.
kn = sklearn.neighbors.KNeighborsClassifier()
param_grid={'n_neighbors': n_vals_k, 'weights' : w_vals_k, 'metric' : m_vals_k}
gs_kn = sklearn.model_selection.GridSearchCV(kn,param_grid,verbose=1,cv=3).fit(X_trn,y_trn)
print(gs_kn.best_estimator_)
print(f'{gs_kn.best_estimator_.score(X_trn,y_trn)} train accuracy') 
print(f'{gs_kn.best_estimator_.score(X_tst,y_tst)} test accuracy')

Fitting 3 folds for each of 90 candidates, totalling 270 fits
KNeighborsClassifier(metric='manhattan', n_neighbors=7)
0.9698492462311558 train accuracy
0.9532163742690059 test accuracy


## AdaBoost Classification

In [38]:
# Test a few different hyperparameters for AdaBoost: n_estimators, learning_rate, algorithm
est_vals_ada = [1,5,10,25,50,80,85,100]
learn_vals_ada = [0.1, 0.5, 1.0, 1.5, 2.0]
a_vals_ada = ['SAMME', 'SAMME.R',]

In [39]:
# Train different AdaBoost models, using grid search and cross validation to find best hyperparameters.
ada = sklearn.ensemble.AdaBoostClassifier(random_state=0)
param_grid={'n_estimators': est_vals_ada, 'learning_rate' : learn_vals_ada, 'algorithm' : a_vals_ada}
gs_ada = sklearn.model_selection.GridSearchCV(ada,param_grid,verbose=1,cv=3).fit(X_trn,y_trn)
print(gs_ada.best_estimator_)
print(f'{gs_ada.best_estimator_.score(X_trn,y_trn)} train accuracy') 
print(f'{gs_ada.best_estimator_.score(X_tst,y_tst)} test accuracy')

Fitting 3 folds for each of 80 candidates, totalling 240 fits
AdaBoostClassifier(learning_rate=1.5, random_state=0)
1.0 train accuracy
0.9824561403508771 test accuracy


## Gaussian Naive Bayes Classification

## Neural Network Classification

In [40]:
# Test a few different hyperparameters for Neural Networks: hidden_layer_sizes, activation, solver
hl_vals = [(),(10,),(50,),(100,),(10,10),(50,50),(100,50),(100,100)]
solv_vals = ['sgd','adam']

In [41]:
# Train different Neural Network models, using grid search and cross validation to find best hyperparameters.
nn = sklearn.neural_network.MLPClassifier(activation='relu',max_iter=1000,batch_size=100,learning_rate_init=0.01,random_state=0)
param_grid={'hidden_layer_sizes' : hl_vals, 'solver' : solv_vals}
gs_nn = sklearn.model_selection.GridSearchCV(nn,param_grid,verbose=1,cv=3).fit(X_trn,y_trn)
print(gs_nn.best_estimator_)
print(f'{gs_nn.best_estimator_.score(X_trn,y_trn)} train accuracy') 
print(f'{gs_nn.best_estimator_.score(X_tst,y_tst)} test accuracy')

Fitting 3 folds for each of 16 candidates, totalling 48 fits
MLPClassifier(batch_size=100, hidden_layer_sizes=(), learning_rate_init=0.01,
              max_iter=1000, random_state=0)
0.9899497487437185 train accuracy
0.9473684210526315 test accuracy
