In [1]:
import numpy as np
import pandas as pd
import umap
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split
import sys
from multiprocessing import Manager, Lock, Pool, cpu_count
import time

In [2]:
def progress(count, total, status=''):
    """Draw a single-line text progress bar on stdout.

    The line starts with a carriage return so that successive calls overwrite
    each other in place.

    Parameters
    ----------
    count : int or float
        Number of work items finished so far.
    total : int or float
        Number of work items overall. ``0`` is rendered as an empty bar at
        0.0% instead of raising ``ZeroDivisionError``.
    status : str, optional
        Free text appended after the percentage.
    """
    bar_len = 40

    if total:
        filled_len = int(round(bar_len * count / float(total)))
        percents = round(100.0 * count / float(total), 1)
    else:
        # Nothing to do: show an empty bar rather than dividing by zero.
        filled_len = 0
        percents = 0.0

    # Clamp only the drawn bar so it always stays exactly bar_len cells wide;
    # the percentage is left untouched so an overshoot remains visible.
    filled_len = max(0, min(bar_len, filled_len))
    bar = '█' * filled_len + '░' * (bar_len - filled_len)

    sys.stdout.write(f'\r|{bar}| {percents}% ... {status}')
    sys.stdout.flush()

In [3]:
# Notebook-wide configuration: column names, class labels and input paths.

# Name of the CSV column that identifies each sample.
INDEX_COLUMN_NAME = 'sample'
# Name of the label column that holds the tumor class.
TUMOR_COLUMN_NAME = 'tumor'
# Values stored in the tumor column for the two classes.
HGG_ROW_VALUE = 'hgg'
LGG_ROW_VALUE = 'lgg'
# NOTE(review): both paths below are the same placeholder string; point each
# one at its own file before running the notebook.
HGG_CSV = '/path/to/file.csv'        # CSV produced by RadiomicsFeatures (HGG)
LGG_CSV = '/path/to/file.csv'        # CSV produced by RadiomicsFeatures (LGG)

In [4]:
# Read the two radiomics tables and index each one by its sample identifier.
hgg = pd.read_csv(filepath_or_buffer=HGG_CSV).set_index([INDEX_COLUMN_NAME])
lgg = pd.read_csv(filepath_or_buffer=LGG_CSV).set_index([INDEX_COLUMN_NAME])

# Keep the 'surv' column aside before the frames are trimmed below.
surv_hgg = hgg.loc[:, 'surv']
surv_lgg = lgg.loc[:, 'surv']

# Column count of the untrimmed HGG frame (kept in case a later cell reads it).
tot_cols = hgg.shape[1]

# Drop the first column and the last two columns of *each* frame.
# A negative stop is resolved against the frame being sliced, so LGG is
# trimmed by its own width instead of by HGG's column count.
# NOTE(review): presumably those positions hold non-feature columns — confirm
# against the CSV schema.
hgg = hgg.iloc[:, 1:-2]
lgg = lgg.iloc[:, 1:-2]

In [5]:
# Tag every row with its tumor class. The label values and the column name
# come from the configuration constants so they are defined in one place.
tumor_hgg = [HGG_ROW_VALUE] * hgg.shape[0]
tumor_lgg = [LGG_ROW_VALUE] * lgg.shape[0]
hgg = hgg.assign(**{TUMOR_COLUMN_NAME: tumor_hgg})
lgg = lgg.assign(**{TUMOR_COLUMN_NAME: tumor_lgg})

In [6]:
# Stack the HGG and LGG tables and report the dimensions of the cleaned result.
alldata = pd.concat([hgg,lgg])
# Turn both +inf and -inf (e.g. from a division by zero) into NaN so the
# column-wise dropna below removes every column with a non-finite value.
alldata = alldata.replace([np.inf, -np.inf], np.nan)
alldata = alldata.dropna(axis=1)
print(alldata.shape)

(267, 5521)


In [8]:
# Lock shared by every worker process. It is created once at module level,
# before the Pool exists, so fork-started workers inherit this very lock.
# A lock constructed inside the function would be private to each call and
# would therefore synchronise nothing.
# NOTE(review): this relies on the 'fork' start method, which the notebook
# already needs in order to share its globals with the workers.
_progress_lock = Lock()


def calc_mcc(exps):
    """Run one train/test experiment per entry of ``exps`` and record its
    Matthews Correlation Coefficient.

    For each experiment the data is split, standardised with the *training*
    mean and standard deviation, fitted with ``LinearSVC(C=0.001)`` and scored
    on the held-out part.

    Reads the module-level names ``alldata``, ``TUMOR_COLUMN_NAME``,
    ``test_size``, ``mccs_man``, ``count`` and ``num_exps``; appends each
    score to ``mccs_man``, increments ``count`` and redraws the progress bar.

    Parameters
    ----------
    exps : iterable of int
        Experiment identifiers. Each one is also used as the random seed of
        its split and classifier, so every experiment is distinct and
        reproducible.
    """
    # Loop-invariant: every column but the last is a feature, the tumor
    # column is the target.
    features = alldata.iloc[:, :-1]
    labels = alldata[TUMOR_COLUMN_NAME]

    for exp in exps:
        # random_state=None would draw from NumPy's global RNG, whose state
        # fork-started workers inherit unchanged from the parent; without a
        # per-experiment seed different workers would replay the same splits.
        x_train, x_test, y_train, y_test = train_test_split(
            features, labels, test_size=test_size, random_state=exp)

        # Standardise both parts with the training statistics only, dividing
        # by the standard deviation (not the variance).
        mean = x_train.mean(axis=0)
        std = x_train.std(axis=0)
        # Constant (or undefined-spread) columns are left unscaled instead of
        # producing inf/NaN through a division by zero.
        std = std.where(std > 0, 1.0)
        x_train = (x_train - mean) / std
        x_test = (x_test - mean) / std

        classifier = LinearSVC(C=0.001, random_state=exp)
        classifier.fit(x_train, y_train)
        y_pred = classifier.predict(x_test)
        mcc = matthews_corrcoef(y_test, y_pred)

        # The read-modify-write on the shared counter and the progress redraw
        # must not interleave between workers.
        with _progress_lock:
            mccs_man.append(mcc)
            count.value += 1
            progress(count.value, num_exps)

In [9]:
#Function to segment the array in order to utilize multiprocessing
def segment(array, parts):
    """Yield ``parts`` contiguous, non-overlapping slices that together cover
    ``array``.

    Slice ``i`` spans ``array[floor(i*n/parts):floor((i+1)*n/parts)]`` with
    ``n = len(array)``. The boundaries are computed with integer arithmetic,
    so exactly ``parts`` slices are produced; accumulating a float step could
    drift below ``n`` and emit one slice too many.

    Parameters
    ----------
    array : sequence
        Anything supporting ``len()`` and slicing.
    parts : int
        Number of slices to produce; must be positive. When it exceeds
        ``len(array)`` some slices are empty.

    Yields
    ------
    sequence
        The next slice of ``array``. Nothing is yielded for an empty input.

    Raises
    ------
    ValueError
        If ``parts`` is not positive (raised when iteration starts).
    """
    if parts <= 0:
        raise ValueError(f'parts must be a positive integer, got {parts!r}')

    size = len(array)
    if size == 0:
        return

    for i in range(parts):
        yield array[(i * size) // parts:((i + 1) * size) // parts]

In [10]:
num_exps = 2000
test_size = 0.2

with Manager() as manager:
    # Manager-backed proxies that the worker processes write their results to.
    mccs_man = manager.list()
    count = manager.Value('i', 0)

    t0 = time.time()

    with Pool(cpu_count()) as pool:
        # Submit one asynchronous task per chunk of experiment ids.
        tasks = []
        for part in segment(list(range(num_exps)), cpu_count()):
            tasks.append(pool.apply_async(calc_mcc, args=(part,)))

        # Wait for every chunk; .get() re-raises any worker exception here.
        for task in tasks:
            task.get()

        # Copy the scores into a plain list while the manager is still alive.
        mccs = list(mccs_man)

    print(f' ... {round(time.time() - t0, 2)}s')

pd.DataFrame(mccs).describe()

  1%|          | 17/2000 [00:24<55:38,  1.68s/it]Process ForkPoolWorker-11:
Process ForkPoolWorker-24:
Process ForkPoolWorker-19:
Process ForkPoolWorker-5:


KeyboardInterrupt: 