In [1]:
import numpy as np
import pandas as pd
import scipy.io

In [2]:
# Load the tabular dataset; later cells index its columns by position and
# read column 0 as the label vector.
df = pd.read_csv('data/sampleDataset.csv')
print(df.shape)

# Load the MATLAB file and pull out the variable stored under the key 'aa'.
mat = scipy.io.loadmat('data/sampleDataset.mat')
res = mat['aa']
# Reshape to a single column so each row holds exactly one entry.
# Per the recorded output each entry is itself a small 2-D array; only its
# first row is consumed downstream.
# NOTE(review): the meaning of the remaining rows is not visible here - confirm
# against whatever produced the .mat file.
data = res.reshape(-1,1)
print(data.shape)
# Preview the first two entries only.
print(data[:2,:])

(21, 111)
(10, 1)
[[array([[2.        , 9.        ],
       [9.        , 2.        ],
       [0.91148636, 0.83555504]])]
 [array([[1.        , 2.        ],
       [9.        , 2.        ],
       [0.87822097, 0.79807125]])]]


In [3]:
# Flatten the per-row selections into one list of integers.
# Every row of `data` is walked cell by cell; each cell holds a 2-D array and
# only the values of its first row are collected (cast to int).
idxes = [
    int(value)
    for row in data
    for cell in row
    for value in cell[0]
]
final = np.asarray(idxes)
print(final.shape)
print(final)

(20,)
[2 9 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2]


In [4]:
from collections import Counter
# Count how often each index occurs in `final`.
cnt = Counter(final)
# most_common() returns (index, count) pairs sorted by descending count, so
# `mfi` is a 2-column array: column 0 = index, column 1 = frequency.
mfi = np.array(cnt.most_common())
print(mfi)
# Map the indices to column names of `df`, in frequency order.
# NOTE(review): the indices come from a MATLAB file and are used here directly
# as 0-based column positions - presumably this lines up because column 0 of
# `df` is the label column; confirm against the code that produced the file.
genes = df.columns[mfi[:,0]]
print(genes)

[[ 2 10]
 [ 1  9]
 [ 9  1]]
Index(['FOXO1', 'IQGAP1', 'GLCE'], dtype='object')


In [5]:
# Gene selection based on frequency: take the column names addressed by the
# indices in `mfi[:, 0]` (ordered most-frequent first) and write them out.
selectedGene = np.array(df.columns[mfi[:,0]])
# fmt='%s' writes each name as plain text, one per line.
# NOTE(review): the 'selectedGenes/' directory is not created here, so it
# presumably has to exist before this cell runs - confirm.
np.savetxt('selectedGenes/selectedGene_frequency.csv',selectedGene, delimiter=',', fmt='%s')

In [6]:
# Gene selection based on Random-Forest feature importance.
# X: only the columns addressed by the frequency-ranked indices, in the same
# order as `genes`, so column j of X corresponds to genes[j].
X = df.iloc[:,mfi[:,0]].values
# y: the first column of the frame, used as the fit target further down.
y = df.values[:,0]
print(X.shape)
print(y.shape, y)

(21, 3)
(21,) [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0.]


In [7]:
from sklearn.ensemble import RandomForestClassifier as RF
import numpy as np
# Fixed seed, used both for NumPy's global RNG and for the forest's
# random_state.
SEED = 0
np.random.seed(SEED)
# Number of trees in the forest.
n_tree = 300
# NOTE(review): this instance is rebuilt with identical arguments in cell
# In [8] before it is ever fitted, so this assignment looks redundant.
rf = RF(n_estimators = n_tree, max_depth = 5, criterion='entropy', random_state = SEED)
def get_vectors(X, y, rf):
    """Fit ``rf`` on ``(X, y)`` and return its per-tree votes and importances.

    This is the "train" entry point: it fits the forest in place and then
    delegates to :func:`get_output_vectors`, which performs the extraction on
    an already-fitted forest.

    Parameters
    ----------
    X : array-like
        Sample matrix; ``X.shape[0]`` is the number of samples.
    y : array-like
        Targets passed to ``rf.fit``.
    rf : forest-like estimator
        Object exposing ``fit``, ``apply`` and ``estimators_``. It is fitted
        in place.

    Returns
    -------
    vectors, feature_importance
        See :func:`get_output_vectors`.
    """
    rf.fit(X, y)
    return get_output_vectors(X, y, rf)


def get_output_vectors(X, y, rf):
    """Extract per-tree vote vectors and importances from a fitted forest.

    For each sample and each tree, the leaf the sample lands in is looked up
    and the leaf's value row is turned into a 0/1 indicator marking the
    position(s) of its maximum. Ties produce several ones in the same row.

    Parameters
    ----------
    X : array-like
        Sample matrix passed to ``rf.apply``.
    y : array-like
        Unused; kept so the signature matches :func:`get_vectors` and
        existing callers keep working.
    rf : forest-like estimator
        Already fitted; must expose ``apply`` and ``estimators_`` (each
        estimator providing ``tree_.value`` and ``feature_importances_``).

    Returns
    -------
    vectors : numpy.ndarray
        Integer array of shape ``(n_samples, n_trees, n_outputs)`` where
        ``n_outputs`` is the size of the last axis of ``tree_.value``.
    feature_importance : numpy.ndarray
        One row per tree holding that tree's ``feature_importances_``.
    """
    # leaves[i, t] is the node index sample i ends up in for tree t.
    leaves = rf.apply(X)
    votes = []
    for ind, estimator in enumerate(rf.estimators_):
        # Value rows of the reached leaves: (n_samples, n_outputs).
        values = estimator.tree_.value[leaves[:, ind]][:, 0, :]
        # keepdims keeps the row maxima as a column for broadcasting.
        maxes = values.max(axis=1, keepdims=True)
        votes.append((values == maxes).astype(int))
    # Stacked as (n_trees, n_samples, n_outputs); put samples first.
    vectors = np.array(votes).transpose(1, 0, 2)
    feature_importance = np.array(
        [estimator.feature_importances_ for estimator in rf.estimators_]
    )
    return vectors, feature_importance

In [8]:
# Build a fresh forest (same hyper-parameters and seed as the config cell) so
# re-running this cell always starts from an unfitted estimator.
rf = RF(n_estimators = n_tree, max_depth = 5, criterion='entropy', random_state = SEED)
# get_vectors fits `rf` in place on (X, y) and returns the per-tree vote
# vectors together with one row of feature importances per tree.
vector, featureImportance = get_vectors(X,y,rf)

In [9]:
# Rank the candidate genes by Random-Forest importance and save the top ones.
a = featureImportance
# Sum each feature's importance over all trees (axis 0 runs over trees).
fs = np.sum(a, axis =0)
# Summed importances in descending order, kept for inspection.
c = sorted(fs, reverse=True)
# argsort is ascending: b[0] is the LEAST important feature, b[-1] the most.
b = np.argsort(fs)
# Maximum number of genes to export.
N_TOP_GENES = 101
# Take the tail of the ascending order (the most important features) and flip
# it so the output is most-important-first. Slicing the head (b[0:101]) would
# export the least important genes once there are more than 101 candidates;
# with 101 or fewer candidates the result is unchanged.
f = np.flip(genes[b[-N_TOP_GENES:]])
np.savetxt("selectedGenes/selectedGene_RF.csv", f, delimiter=",", fmt='%s')