In [1]:
from sklearn.model_selection import train_test_split
import csv
import numpy as np

from classifiers import *
from utils import zScoreNormalization

# Accumulators: `samples` collects one list of floats per parsed CSV row;
# `results` is passed to every classify() call in the cells below.
samples = []
results = []
sample_labels = []

# CSV layout relied on by the parser (0-based row / column positions).
LABEL_ROW = 2        # row whose cells from column 1 onward are the class labels
FIRST_DATA_ROW = 6   # rows 0-5 are never parsed as numeric values
FIRST_VALUE_COL = 3  # numeric values start at column 3 of each data row
NEGATIVE_CLASSES = ('normal', 'adjacent-non_tumor')  # encoded as 0, everything else as 1

print('Reading dataset...')
# newline='' is the mode the csv module documents for files given to csv.reader.
with open('../data/Final_GSE25097_Matrix.csv', newline='') as file:
    for row_no, row in enumerate(csv.reader(file)):
        if row_no == LABEL_ROW:
            sample_labels = row[1:]
        elif row_no >= FIRST_DATA_ROW:
            samples.append([float(x) for x in row[FIRST_VALUE_COL:]])

# samples = zScoreNormalization(samples)
# Transpose so that axis 0 indexes samples and axis 1 indexes features,
# which is how the two prints below (and train_test_split) interpret it.
samples = np.array(samples).T
print('Total Samples:', samples.shape[0])
print('Total Features:', samples.shape[1])

# Fail early, with a readable message, if the label row and the value columns
# do not line up; otherwise the mismatch only shows up inside train_test_split.
if len(sample_labels) != samples.shape[0]:
    raise ValueError(
        f'Found {len(sample_labels)} labels for {samples.shape[0]} samples; '
        'check the label row and value column offsets.'
    )

# Binary encoding built as a fresh list of ints (no in-place str -> int mutation).
sample_labels = [0 if label in NEGATIVE_CLASSES else 1 for label in sample_labels]
pos_samples = sum(sample_labels)
print('Positive Samples:', pos_samples)
print('Negative Samples:', samples.shape[0]-pos_samples)

# 70% training and 30% test
print('Splitting dataset into 70% training and 30% test..')
X_train, X_test, y_train, y_test = train_test_split(samples, sample_labels, test_size=0.3, random_state=109)

Reading dataset...
Total Samples: 557
Total Features: 37582
Positive Samples: 308
Negative Samples: 249
Splitting dataset into 70% training and 30% test..
CPU times: user 6.45 s, sys: 1.79 s, total: 8.25 s
Wall time: 6.58 s


1. No feature selection

In [2]:
%%time
# Baseline run on the full, unselected feature matrix. `results` is passed in so
# this run can appear in the final table (presumably classify() appends to it —
# confirm in classifiers.py).
classify(X_train, X_test, y_train, y_test, results=results)


╒══════════════════════════════╤══════════╕
│ SVM                          │ 0.988095 │
├──────────────────────────────┼──────────┤
│ Decision Tree                │ 0.934524 │
├──────────────────────────────┼──────────┤
│ Extra Trees                  │ 0.964286 │
├──────────────────────────────┼──────────┤
│ Linear Discriminant Analysis │ 0.970238 │
├──────────────────────────────┼──────────┤
│ kNN                          │ 0.934524 │
├──────────────────────────────┼──────────┤
│ Gaussian Naive Bayes         │ 0.922619 │
├──────────────────────────────┼──────────┤
│ Multi-layer Perceptron       │ 0.982143 │
├──────────────────────────────┼──────────┤
│ Random Forest                │ 0.958333 │
╘══════════════════════════════╧══════════╛
CPU times: user 42.2 s, sys: 18.5 s, total: 1min
Wall time: 13.5 s


2. mRMR

In [3]:
%%time
from skfeature.function.information_theoretical_based import MRMR

# Number of top-ranked features to keep.
num_fea = 100

# Rank features on the training split only. Ranking on the full `samples`
# matrix lets the held-out test rows influence which features are kept,
# which leaks test information and inflates the reported accuracies.
idx, _, _ = MRMR.mrmr(X_train, y_train, n_selected_features=num_fea)
selected = idx[0:num_fea]
_X_train = X_train[:, selected]
_X_test = X_test[:, selected]

print('=> Features Selected:', len(idx))
classify(_X_train, _X_test, y_train, y_test, results=results, fs='mRMR')

KeyboardInterrupt: 

3. Mutual Information (MIFS)

In [4]:
%%time
from skfeature.function.information_theoretical_based import MIFS

# Number of top-ranked features to keep.
num_fea = 100

# Rank features on the training split only. Ranking on the full `samples`
# matrix lets the held-out test rows influence which features are kept,
# which leaks test information and inflates the reported accuracies.
idx, _, _ = MIFS.mifs(X_train, y_train, n_selected_features=num_fea)
selected = idx[0:num_fea]
_X_train = X_train[:, selected]
_X_test = X_test[:, selected]

print('=> Features Selected:', len(idx))
classify(_X_train, _X_test, y_train, y_test, results=results, fs='Mutual Information')

KeyboardInterrupt: 

4. Interaction Capping (ICAP)

In [5]:
%%time
from skfeature.function.information_theoretical_based import ICAP

# Number of top-ranked features to keep.
num_fea = 100

# Rank features on the training split only. Ranking on the full `samples`
# matrix lets the held-out test rows influence which features are kept,
# which leaks test information and inflates the reported accuracies.
idx, _, _ = ICAP.icap(X_train, y_train, n_selected_features=num_fea)
selected = idx[0:num_fea]
_X_train = X_train[:, selected]
_X_test = X_test[:, selected]

print('=> Features Selected:', len(idx))
classify(_X_train, _X_test, y_train, y_test, results=results, fs='Interaction Capping')

KeyboardInterrupt: 

5. Joint Mutual Information (JMI)

In [6]:
%%time
from skfeature.function.information_theoretical_based import JMI

# Number of top-ranked features to keep.
num_fea = 100

# Rank features on the training split only. Ranking on the full `samples`
# matrix lets the held-out test rows influence which features are kept,
# which leaks test information and inflates the reported accuracies.
idx, _, _ = JMI.jmi(X_train, y_train, n_selected_features=num_fea)
selected = idx[0:num_fea]
_X_train = X_train[:, selected]
_X_test = X_test[:, selected]

print('=> Features Selected:', len(idx))
classify(_X_train, _X_test, y_train, y_test, results=results, fs='Joint Mutual Information')


KeyboardInterrupt



6. Binary BAT Algorithm (BBA)

In [None]:
%%time
from Py_FS.wrapper.nature_inspired import BBA

# Search on the training split only, so the held-out test rows cannot influence
# which features are chosen. The first two positional arguments are kept as in
# the original call (presumably agent count and iteration count — confirm
# against the Py_FS signature).
sol = BBA(150, 10, X_train, y_train, weight_acc=0.7, val_size=30)

# Column indices whose bit is set in the optimiser's best agent.
idx = [i for i, bit in enumerate(sol.best_agent) if bit == 1]

# Use the whole subset the optimiser settled on. The earlier `idx[0:num_fea]`
# slice depended on a variable leaked from a previous cell and kept only the
# 100 lowest column indices, which is not the subset reported below.
_X_train = X_train[:, idx]
_X_test = X_test[:, idx]

print('=> Features Selected:', len(idx))
classify(_X_train, _X_test, y_train, y_test, results=results, fs='Binary BAT Algorithm')

7. Mayfly Algorithm (MA)

In [None]:
%%time
from Py_FS.wrapper.nature_inspired import MA

# Search on the training split only, so the held-out test rows cannot influence
# which features are chosen. The first two positional arguments are kept as in
# the original call (presumably agent count and iteration count — confirm
# against the Py_FS signature).
sol = MA(150, 10, X_train, y_train, weight_acc=0.7, val_size=30)

# Column indices whose bit is set in the optimiser's best agent.
idx = [i for i, bit in enumerate(sol.best_agent) if bit == 1]

# Use the whole subset the optimiser settled on. The earlier `idx[0:num_fea]`
# slice depended on a variable leaked from a previous cell and kept only the
# 100 lowest column indices, which is not the subset reported below.
_X_train = X_train[:, idx]
_X_test = X_test[:, idx]

print('=> Features Selected:', len(idx))
# Label spelling corrected from 'Mafly Algorithm' so the results table reads correctly.
classify(_X_train, _X_test, y_train, y_test, results=results, fs='Mayfly Algorithm')

8. Particle Swarm Optimization (PSO)

In [None]:
%%time
from Py_FS.wrapper.nature_inspired import PSO

# Search on the training split only, so the held-out test rows cannot influence
# which features are chosen. The first two positional arguments are kept as in
# the original call (presumably agent count and iteration count — confirm
# against the Py_FS signature).
sol = PSO(150, 10, X_train, y_train, weight_acc=0.7, val_size=30)

# Column indices whose bit is set in the optimiser's best agent.
idx = [i for i, bit in enumerate(sol.best_agent) if bit == 1]

# Use the whole subset the optimiser settled on. The earlier `idx[0:num_fea]`
# slice depended on a variable leaked from a previous cell and kept only the
# 100 lowest column indices, which is not the subset reported below.
_X_train = X_train[:, idx]
_X_test = X_test[:, idx]

print('=> Features Selected:', len(idx))
classify(_X_train, _X_test, y_train, y_test, results=results, fs='Particle Swarm Optimization')

9. Grey Wolf Optimizer (GWO)

In [None]:
%%time
from Py_FS.wrapper.nature_inspired import GWO

# Search on the training split only, so the held-out test rows cannot influence
# which features are chosen. The first two positional arguments are kept as in
# the original call (presumably agent count and iteration count — confirm
# against the Py_FS signature).
sol = GWO(150, 10, X_train, y_train, weight_acc=0.7, val_size=30)

# Column indices whose bit is set in the optimiser's best agent.
idx = [i for i, bit in enumerate(sol.best_agent) if bit == 1]

# Use the whole subset the optimiser settled on. The earlier `idx[0:num_fea]`
# slice depended on a variable leaked from a previous cell and kept only the
# 100 lowest column indices, which is not the subset reported below.
_X_train = X_train[:, idx]
_X_test = X_test[:, idx]

print('=> Features Selected:', len(idx))
classify(_X_train, _X_test, y_train, y_test, results=results, fs='Grey Wolf Optimiser')

10. Harmony Search (HS)
%%time

In [None]:
from Py_FS.wrapper.nature_inspired import HS

# Search on the training split only, so the held-out test rows cannot influence
# which features are chosen. The first two positional arguments are kept as in
# the original call (presumably agent count and iteration count — confirm
# against the Py_FS signature).
sol = HS(150, 10, X_train, y_train, weight_acc=0.7, val_size=30)

# Column indices whose bit is set in the optimiser's best agent.
idx = [i for i, bit in enumerate(sol.best_agent) if bit == 1]

# Use the whole subset the optimiser settled on. The earlier `idx[0:num_fea]`
# slice depended on a variable leaked from a previous cell and kept only the
# 100 lowest column indices, which is not the subset reported below.
_X_train = X_train[:, idx]
_X_test = X_test[:, idx]

print('=> Features Selected:', len(idx))
classify(_X_train, _X_test, y_train, y_test, results=results, fs='Harmony Search')

### Final Result

In [None]:
# Column titles for the summary: the feature-selection label first, then one
# column per classifier.
column_titles = [
    'Feature Selection',
    'SVM',
    'Decision Tree',
    'Extra Trees',
    'Linear DA',
    'kNN',
    'GaussianNB',
    'MLP',
    'Random Forest',
]

# Render every row accumulated in `results` as a single grid and print it.
summary_table = tabulate(results, headers=column_titles, tablefmt='fancy_grid')
print(summary_table)