In [1]:
import numpy as np
import pandas as pd
from scipy import stats

In [2]:
# Read the header-less CSV of selected genes and flatten it to a 1-D array;
# the array is used further down to pick columns out of the dataset.
fileName = 'selectedGenes/selectedGene_frequency.csv'
geneTable = pd.read_csv(fileName, header=None)
selectedGenes = geneTable.to_numpy().flatten()
print(selectedGenes)

['FOXO1' 'IQGAP1' 'GLCE']


In [3]:
# Load the sample dataset into a DataFrame and preview its first three rows.
dataPath = 'data/sampleDataset.csv'
df = pd.read_csv(filepath_or_buffer=dataPath)
df.head(n=3)

Unnamed: 0,Label,IQGAP1,FOXO1,CDC42EP4,BE501966,AF130091,CRISPLD2,HOXD11,C15orf59,GLCE,...,EPCAM,PLCL1,PA2G4,MAMDC2,LOC441204,AI652919,LGALS3,LINC00665,SLIT1,AKIP1
0,1,1390.707367,232.596033,391.878533,1459.72,113.274,1050.67715,38.9582,30.03575,223.72,...,2513.31,55.754,628.225,95.9092,56.441375,32.6741,545.386,189.880288,123.0753,182.5157
1,1,1698.328333,243.188433,436.153733,790.416,152.258,1471.971,25.853,22.785,295.9,...,2652.9,90.26735,487.815,78.958,30.65075,15.5884,1122.09215,124.043707,58.3115,153.99935
2,1,1538.764633,196.422933,406.8924,842.691,136.974,983.0693,28.8775,27.96905,328.334,...,3425.47,40.9415,582.937,77.8575,64.9531,25.0948,632.1472,174.704315,93.50215,198.0187


In [4]:
# Feature matrix: the selected-gene columns, z-scored per column (axis 0).
# NOTE(review): the z-score statistics are computed over every sample, including
# the ones that the cross-validation loops below hold out for testing -- confirm
# that this is intended.
geneValues = df.loc[:, selectedGenes].to_numpy()
X = stats.zscore(geneValues, axis=0)

# Targets: first column of the frame's value array, shifted by +1.
# NOTE(review): the column is picked by position, not by name -- presumably it
# holds the class label; verify against the CSV layout.
y = df.to_numpy()[:, 0] + 1

print(X.shape)
print(y.shape, y)

(21, 3)
(21,) [2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 1. 1. 1. 1. 1. 1. 1. 1.]


In [5]:
def permanceMetrics(Y, pred, pred_prob):
    """Return accuracy and ROC AUC as a length-2 NumPy array.

    Parameters
    ----------
    Y : true labels.
    pred : predicted labels, scored against ``Y`` with ``accuracy_score``.
    pred_prob : 2-D array of class probabilities; column 1 is the score passed
        to ``roc_auc_score``.

    Returns
    -------
    numpy.ndarray
        ``[accuracy, roc_auc]``.

    ``accuracy_score`` and ``roc_auc_score`` are looked up when the function is
    called, so they must have been imported by then.
    """
    scores = pred_prob[:, 1]
    return np.array([accuracy_score(Y, pred), roc_auc_score(Y, scores)])

In [6]:
def saveres(sr, rr, path='validationResult.csv'):
    """Write two metric vectors to a CSV file, one vector per row.

    Parameters
    ----------
    sr : array-like
        Metrics written as the first row (the callers pass the SVM results).
    rr : array-like
        Metrics written as the second row (the callers pass the RF results).
        Must contain as many values as ``sr``.
    path : str, optional
        Destination file. Defaults to ``'validationResult.csv'``, the file name
        that used to be hard-coded; pass a different name to keep the results
        of separate experiments from overwriting each other.

    Any existing file at ``path`` is replaced.
    """
    # np.asarray lets tuples/lists through as well as ndarrays.
    first_row = np.asarray(sr).reshape(1, -1)
    second_row = np.asarray(rr).reshape(1, -1)
    table = np.append(first_row, second_row, axis=0)
    np.savetxt(path, table, delimiter=',', fmt='%s')

In [7]:
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.svm import SVC
# Leave-one-out cross-validation of a linear SVM and a random forest on (X, y).
loocv = model_selection.LeaveOneOut()

SEED = 22
n_tree = 300

# One entry is appended per fold.
ts = []      # held-out labels
ps = []      # SVM predicted labels
preds = []   # SVM class probabilities
pr = []      # RF predicted labels
predr = []   # RF class probabilities

print(X.shape)
for train_index, test_index in loocv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = y[train_index], y[test_index]
    ts.append(Y_test)

    # probability=True fits an internally cross-validated calibration that uses
    # random shuffling; seeding it keeps predict_proba repeatable between runs.
    svm = SVC(kernel='linear', probability=True, random_state=SEED)
    rf = RF(n_estimators=n_tree, max_depth=5, criterion='entropy', random_state=SEED)

    svm.fit(X_train, Y_train)
    pred = svm.predict(X_test)
    pred_prob = svm.predict_proba(X_test)
    ps.append(pred)
    preds.append(pred_prob)

    rf.fit(X_train, Y_train)
    pred = rf.predict(X_test)
    pred_prob = rf.predict_proba(X_test)
    pr.append(pred)
    predr.append(pred_prob)

# Stack the per-fold outputs: one row per held-out sample.
t = np.array(ts)
pre_prob = np.asarray(preds).reshape(-1, 2)
pre_r = np.asarray(predr).reshape(-1, 2)

# Labels and predictions are flattened to 1-D before scoring instead of being
# passed as (n, 1) stacks. np.asarray keeps the results usable by saveres even
# if permanceMetrics hands back a tuple rather than an array.
sr = np.asarray(permanceMetrics(t.ravel(), np.ravel(ps), pre_prob))
rr = np.asarray(permanceMetrics(t.ravel(), np.ravel(pr), pre_r))
saveres(sr, rr)

print('SVM: Acc. AUROC')
print(sr)
print('RF: Acc. AUROC')
print(rr)

(21, 3)
SVM: Acc. AUROC
[0.95238095 1.        ]
RF: Acc. AUROC
[1. 1.]


# K-fold cross-validation

In [8]:
def permanceMetrics(Y, pred, pred_prob):
    """Return ``(accuracy, roc_auc)`` as a plain tuple.

    ``pred`` is scored against ``Y`` with ``accuracy_score``; column 1 of
    ``pred_prob`` is the score passed to ``roc_auc_score``.

    NOTE(review): this reuses the name of an earlier function that returns a
    NumPy array instead of a tuple; once this cell runs, the earlier version is
    shadowed.
    """
    scores = pred_prob[:, 1]
    accuracy = accuracy_score(Y, pred)
    auroc = roc_auc_score(Y, scores)
    return accuracy, auroc

In [9]:
def avgMetric(met):
    """Average a flat, interleaved sequence of metric pairs.

    ``met`` is laid out as ``[a0, b0, a1, b1, ...]``. The even-indexed values
    and the odd-indexed values are averaged separately and returned as a
    length-2 NumPy array ``[mean(a), mean(b)]``.
    """
    values = np.array(met)
    evens, odds = values[0::2], values[1::2]
    return np.array([evens.mean(), odds.mean()])

In [10]:
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.svm import SVC

import warnings
warnings.filterwarnings('ignore')

# Stratified 5-fold cross-validation of a linear SVM and a random forest on (X, y).
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
SEED = 22
n_tree = 300

# Flat, interleaved per-fold metrics: [acc, auroc, acc, auroc, ...].
sm = []   # SVM
rm = []   # RF

print(X.shape)
for train_index, test_index in kfold.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = y[train_index], y[test_index]

    # probability=True fits an internally cross-validated calibration that uses
    # random shuffling; seeding it keeps predict_proba repeatable between runs.
    svm = SVC(kernel='linear', probability=True, random_state=SEED)
    rf = RF(n_estimators=n_tree, max_depth=5, criterion='entropy', random_state=SEED)

    svm.fit(X_train, Y_train)
    pred = svm.predict(X_test)
    pred_prob = svm.predict_proba(X_test)
    acc, acrc = permanceMetrics(Y_test, pred, pred_prob)
    sm += [acc, acrc]

    rf.fit(X_train, Y_train)
    pred = rf.predict(X_test)
    pred_prob = rf.predict_proba(X_test)
    acc, acrc = permanceMetrics(Y_test, pred, pred_prob)
    rm += [acc, acrc]

# Average each metric over the folds once and reuse the result.
svm_avg = avgMetric(sm)
rf_avg = avgMetric(rm)

print('SVM: Acc. AUROC')
print(svm_avg)
print('RF: Acc. AUROC')
print(rf_avg)

# NOTE(review): saveres is called here exactly as in the leave-one-out cell, so
# both cells write to the same file and this call replaces the earlier results
# -- confirm that is intended.
saveres(svm_avg, rf_avg)

(21, 3)
SVM: Acc. AUROC
[0.96 1.  ]
RF: Acc. AUROC
[1. 1.]
