In [1]:
import os
import cv2
import glob
import shutil
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# neural imaging
import nilearn as nl
import nibabel as nib
import nilearn.plotting as nlplt
# !pip install git+https://github.com/miykael/gif_your_nifti # nifti to gif 
# import gif_your_nifti.core as gif2nif


# ml libs
import keras
import keras.backend as K
from keras.callbacks import CSVLogger
from sklearn.metrics import log_loss
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression as MIR
import tensorflow as tf
# from tensorflow.keras.models import *
# from tensorflow.keras.layers import *
# from tensorflow.keras.optimizers import *
# from tensorflow.keras.utils import plot_model, to_categorical   
# from tensorflow.keras.layers.experimental import preprocessing
# from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping, TensorBoard


# Make numpy printouts easier to read.
np.set_printoptions(precision=3, suppress=True)


2024-01-15 19:06:19.452467: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/surajit/anaconda3/lib/python3.9/site-packages/cv2/../../lib64:
2024-01-15 19:06:19.452488: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# DEFINE seg-areas  
SEGMENT_CLASSES = {
    0 : 'NOT tumor',
    1 : 'NECROTIC/CORE', 
    2 : 'EDEMA',
    3 : 'ENHANCING' # original 4 -> converted into 3 later
}

# # days start interval
# SURVIVAL_CATEGORIES= {
#     'SHORT' : 0 , # 0-300
#     'MEDIUM' : 250,  # 300-450
#     'LONG' : 450, # 450 and more
# }

# there are 155 slices per volume
# to start at 5 and use 145 slices means we will skip the first 5 and last 5 
VOLUME_SLICES = 100 
VOLUME_START_AT = 22 # first slice of volume that we will include
IMG_SIZE=128
TRAIN_DATASET_PATH='./MICCAI_BraTS2020_TrainingData/'
VAL_DATASET_PATH='./MICCAI_BraTS2020_ValidationData/'

In [3]:
# lists of directories with studies
train_and_val_directories = [f.path for f in os.scandir(TRAIN_DATASET_PATH) if f.is_dir()]
train_and_val_directories+= [f.path for f in os.scandir(VAL_DATASET_PATH) if f.is_dir()]
# file BraTS20_Training_355 has ill formatted name for for seg.nii file
train_and_val_directories.remove(TRAIN_DATASET_PATH+'BraTS20_Training_355')


def pathListIntoIds(dirList):
    x = []
    for i in range(0,len(dirList)):
        x.append(dirList[i][dirList[i].rfind('/')+1:])
    return x

train_and_test_ids = pathListIntoIds(train_and_val_directories)

In [4]:
print(train_and_val_directories)

['./MICCAI_BraTS2020_TrainingData/BraTS20_Training_168', './MICCAI_BraTS2020_TrainingData/BraTS20_Training_051', './MICCAI_BraTS2020_TrainingData/BraTS20_Training_036', './MICCAI_BraTS2020_TrainingData/BraTS20_Training_322', './MICCAI_BraTS2020_TrainingData/BraTS20_Training_327', './MICCAI_BraTS2020_TrainingData/BraTS20_Training_049', './MICCAI_BraTS2020_TrainingData/BraTS20_Training_270', './MICCAI_BraTS2020_TrainingData/BraTS20_Training_109', './MICCAI_BraTS2020_TrainingData/BraTS20_Training_228', './MICCAI_BraTS2020_TrainingData/BraTS20_Training_297', './MICCAI_BraTS2020_TrainingData/BraTS20_Training_069', './MICCAI_BraTS2020_TrainingData/BraTS20_Training_095', './MICCAI_BraTS2020_TrainingData/BraTS20_Training_363', './MICCAI_BraTS2020_TrainingData/BraTS20_Training_234', './MICCAI_BraTS2020_TrainingData/BraTS20_Training_369', './MICCAI_BraTS2020_TrainingData/BraTS20_Training_082', './MICCAI_BraTS2020_TrainingData/BraTS20_Training_204', './MICCAI_BraTS2020_TrainingData/BraTS20_Traini

In [5]:
import csv

csvs = [r'MICCAI_BraTS2020_TrainingData\survival_info.csv', r'MICCAI_BraTS2020_ValidationData\survival_evaluation.csv']

age_dict = {}
days_dict = {}
at_line = 0
category_short = 0
category_medium = 0
category_long = 0
max_days = 0
for csv_path in csvs:
    with open(csv_path, mode='r') as csv_file:
        csv_reader = csv.reader(csv_file,delimiter = ',')
    #  row_count = sum(1 for row in csv_reader)
    #   print(f'total rows: {row_count} .')
        for row in csv_reader:
            if at_line == 0:
                # print(f'Column names are {", ".join(row)}')
                at_line += 1
            else:
                if (row[-1] != "GTR"):
                    continue
                # print(row)
                key = row[0]
                age = row[1]
                days = row[2]
                age_dict[key] = float(age)
                if (not days.isnumeric()):
                    continue
                days_dict[key] = int(days)
                max_days = max(max_days,int(days))
                if int(days) < 250:
                    category_short += 1
                elif (int(days) >= 250 and int(days) <= 450):
                    category_medium += 1
                else:
                    category_long += 1
                at_line+=1

    print(f'Processed {at_line} lines.')
    print(category_short,category_medium,category_long)
    print(max_days)

FileNotFoundError: [Errno 2] No such file or directory: 'MICCAI_BraTS2020_TrainingData\\survival_info.csv'

In [None]:
import csv

def col_and_eval(ch_ft = []):
    id_list2 = []
    csv_filenames = [r'.\results.csv',r'.\results_val.csv']
    dt =[]
    jj = [] #key_list
    id_tp = 0
    row_cc = 0
    row2 = {}
    id_check = {}
    for csv_filename in csv_filenames:
        with open(csv_filename) as f:
            reader = csv.DictReader(f)
            for row in reader:
                # if row_cc > 20:
                #     break
                # row_cc += 1
                dt_tp = row
                if row['Patient_ID'] not in id_check:
                    id_check[row['Patient_ID']] = 0
                id_check[row['Patient_ID']] += 1
                if len(id_list2) != 0:
                    if id_list2[-1] == row["Patient_ID"]:
                        id_tp += 1
                    else:
                        dt.append(row2)
                        row2 = {}
                        id_tp = 0
                if id_tp == 0:
                    id_list2.append(row["Patient_ID"])
                if len(id_list2) == 1:
                    for keysj in row:
                        if keysj != None:
                            if 'diagnostic' in keysj or 'Modality' in keysj or 'Patient_ID' in keysj:
                                continue
                            # print(type(keysj),keysj, type(row[keysj]),row[keysj])
                            row[keysj] = eval(row[keysj])
                            if isinstance(row[keysj],tuple):
                                ct = 0
                                for it in row[keysj]:
                                    keysjn = keysj + str(ct)
                                    keysjn += row["Modality"][row["Modality"].find('_',19,-1):row["Modality"].find('.')]
                                    if (keysjn in ch_ft) or (len(ch_ft) == 0):
                                        jj.append(keysjn)
                                    ct += 1
                                continue
                            keysj += row["Modality"][row["Modality"].find('_',19,-1):row["Modality"].find('.')]
                            if (keysj in ch_ft) or (len(ch_ft) == 0):
                                jj.append(keysj)
                for keysj in row:
                    if keysj != None:
                        if 'diagnostic' in keysj or 'Modality' in keysj or 'Patient_ID' in keysj:
                            continue
                        # print(type(keysj),keysj, type(row[keysj]),row[keysj])
                        row[keysj] = eval(str(row[keysj]))
                        keys2j = keysj + row["Modality"][row["Modality"].find('_',19,-1):row["Modality"].find('.')]
                        if isinstance(row[keysj],tuple):
                            ct = 0
                            for it in row[keysj]:
                                keys2j =  keysj + str(ct) + row["Modality"][row["Modality"].find('_',19,-1):row["Modality"].find('.')]
                                row2[keys2j] = it
                                ct += 1
                            continue
                        row2[keys2j] = row[keysj]
                row2['Patient_ID'] = row['Patient_ID']
    dt.append(row2)    
    return dt, jj

# print(jj)
# print(dt)
# create only age: category data


In [None]:
# id: age, categories
def getListAgeDays(id_list, dt, jj, ch_ft = []):
    x_val = []
    y_val = []
    x_test = []
    for i in dt:
        if ((i["Patient_ID"] not in days_dict) and ("Validation" not in i["Patient_ID"])):
            # print(i["Patient_ID"])
            continue
        # if (i not in id_list2):
        #     continue
        # masks = getMaskSizesForVolume(nib.load(TRAIN_DATASET_PATH + f'BraTS20_Training_{i[-3:]}/BraTS20_Training_{i[-3:]}_seg.nii.gz').get_fdata())
        # brain_vol = getBrainSizeForVolume(nib.load(TRAIN_DATASET_PATH + f'BraTS20_Training_{i[-3:]}/BraTS20_Training_{i[-3:]}_t1.nii.gz').get_fdata())
        # masks[1] = masks[1]/brain_vol
        # masks[2] = masks[2]/brain_vol
        # masks[3] = masks[3]/brain_vol
        # merged=[age_dict[i],masks[1],masks[2],masks[3]] ## add segments
        # ft = dt[]
        if (i["Patient_ID"] not in age_dict):
            continue
        j = [age_dict[i["Patient_ID"]]]
        for keysj in i:
            if keysj != None:
                if 'diagnostic' in keysj or 'Modality' in keysj or 'Patient_ID' in keysj:
                    continue
                if (keysj not in ch_ft) and (len(ch_ft) != 0):
                    continue
                # if(i[keysj][0]=='('):
                #     i[keysj] = tuple(i[keysj])
                # else:
                # print(i[keysj])
                # i[keysj] = eval(i[keysj])
                # if isinstance(i[keysj],tuple):
                #     ct = 0
                #     for it in i[keysj]:
                #         keysjn = keysj + str(ct)
                #         j.append(it)
                #         ct += 1
                #     continue
                j.append(i[keysj])
        # print(j)
        if "Validation" in i["Patient_ID"]:
            # print("vvv"+i["Patient_ID"])
            x_test.append(j)
        else:
            x_val.append(j) 
            y_val.append(days_dict[i["Patient_ID"]])
        # if (days_dict[i["Patient_ID"]] < 250):
        #     y_val.append(0)
        # elif (days_dict[i["Patient_ID"]] >= 250 and days_dict[i["Patient_ID"]] < 450):
        #     y_val.append(1)
        # else:
        #     y_val.append(2)
            
    return np.array(x_val), np.array(y_val), np.array(x_test)

dt, jj = col_and_eval()
X_all, y_all, x_val = getListAgeDays(train_and_test_ids,dt,jj)
print(len(train_and_test_ids))
print(X_all.shape)
print(y_all.shape)
print(x_val.shape)
print(f'X_all: {X_all.shape}')
columns = ["age"]+jj+["prediction"]
print(columns)
dfn = np.concatenate((X_all, y_all[:,None]), axis=1)
print(dfn.shape,len(columns))
df = pd.DataFrame(dfn, columns = columns)
df2 = pd.DataFrame(x_val,columns = ["age"]+jj)
df2.head()

In [None]:
scaler = MinMaxScaler()
v = X_all
v_scaled = scaler.fit_transform(v)
X_all = v_scaled
X_val = scaler.fit_transform(x_val)

df = pd.DataFrame(X_all, columns = ["age"]+jj)
# display(df)
df2 = pd.DataFrame(X_val,columns = ["age"]+jj)
df2.head()

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X_all,y_all,test_size = 0.2, random_state = 42, shuffle = True)
X_train = X_all
y_train = y_all
print("x_train shape:",X_train.shape)
# print("x_test shape:", X_test.shape)
print("y_train shape:",y_train.shape)
# print("y_test shape:", y_test.shape)

In [None]:
from skfeature.function.similarity_based import fisher_score

print(X_train.shape,y_train.shape)
_ranks = fisher_score.fisher_score(X_train,y_train)

In [None]:
# ft_sc = {}
# ii = 0
# columns = ["age"]+jj
# for it in columns[:-1]:
#     ft_sc[it] = _ranks[ii]
#     ii += 1

# sorted_ft_sc = sorted(ft_sc.items(), key=lambda x:x[1], reverse=True)
# sorted_ft_sc = dict(sorted_ft_sc)
# print(sorted_ft_sc)
# print(_ranks)
# print(columns[_ranks[0]])

In [None]:
# ij = 0
# chosen_ft = {}
# for it in sorted_ft_sc:
#     chosen_ft[it] = sorted_ft_sc[it]
#     ij += 1
#     if ij>=400:
#         break

# print(chosen_ft)

In [None]:
columns = ["age"]+jj
colnew = []
def prune2(X):
    xx = []
    for row in X:
        iter = 0
        xxtp = []
        for col in row:
            if _ranks[iter]<400:
                xxtp.append(col)
                if columns[iter] not in colnew:
                    colnew.append(columns[iter])
            iter += 1
        xx.append(xxtp)
    return xx

x_train = np.array(prune2(X_train))
x_test = np.array(prune2(X_val))

print(x_train.shape)
print(x_test.shape)
df = pd.DataFrame(x_train, columns = colnew)
df2 = pd.DataFrame(x_test,columns = colnew)
# df2.head()
df.head()

In [None]:
# dt, jj = col_and_eval(chosen_ft)

In [None]:
# ch_X_all, ch_y_all, ch_x_test = getListAgeDays(train_and_test_ids,dt,jj,ch_ft=chosen_ft)
# print(len(train_and_test_ids))
# print(ch_X_all.shape)
# print(ch_x_test.shape)
# print(ch_y_all.shape)
# columns = ["age"]+jj+["prediction"]
# print(columns)
# ch_dfn = np.concatenate((ch_X_all, ch_y_all[:,None]), axis=1)
# print(ch_dfn.shape,len(columns))
# df = pd.DataFrame(ch_dfn, columns = columns)
# df2 = pd.DataFrame(ch_x_test,columns = ["age"]+jj)
# df2.head()
# df.head()

In [None]:
scaler = MinMaxScaler()
v = x_train
v_scaled = scaler.fit_transform(v)
x_train = v_scaled
v = x_test
v_scaled = scaler.fit_transform(v)
x_test = v_scaled

df = pd.DataFrame(x_train, columns = colnew)
display(df)

In [8]:
# X_train, X_test, y_train, y_test = train_test_split(X_all,y_all,test_size = 0.2, random_state = 42, shuffle = True)
# x_train = ch_X_all
# y_train = ch_y_all
# x_test = ch_x_test
print("x_train shape:",x_train.shape)
# print("x_test shape:", X_test.shape)
print("y_train shape:",y_train.shape)
# print("y_test shape:", y_test.shape)

NameError: name 'x_train' is not defined

**# mrmr implementation**

In [None]:
def rank(X,y,nbSolutions,I_xy):
    f_mir = []
    for i in range(int(X.shape[1])):
        f_mir.append([I_xy[i],i])
    f_mir.sort(reverse=True)
    S=[]
    si = []
    for i in range(nbSolutions):
        si.append(f_mir[i][1])
        S.append(si)
        si = []
    return S

In [None]:
def score(f,X,y,Si,I_xy,I_xx):
    if f == -1:
        return float("-INF")
    mi_temp = 0
    qj = I_xy[f]
    for xi in Si:
        mi_temp += I_xx[xi][f]
    mi_temp/=len(Si)
    qj += mi_temp
    return qj

In [None]:
#Mutual Implementation

I_xx = np.zeros((x_train.shape[1],x_train.shape[1]))
I_xy = np.zeros(x_train.shape[1])
print(I_xx.shape,I_xy.shape)

i1=0
for f1 in x_train.T:
    # print(res)
    # I_xx[i1][i2] = (-0.5)*math.log(1-(res*res))
    I_xx[i1] = MIR(x_train,list(f1))
    i1 += 1
    print(f"Processed MI with {i1} features so far...")

#each iteration taking ~30s and 10702 such iterations. thus, IMPOSSIBLE

In [None]:
I_xy = MIR(x_train,list(y_train))

In [None]:
S= rank(x_train,y_train,10,I_xy)

lenSol = 25
nbSol = 10
l = 1

while l<lenSol:
    i = 0
    # print(S)
    while i< nbSol:
        c = -1
        for f in range(int(x_train.shape[1])):
            if f in S[i]:
                continue
            if score(f,x_train,y_train,S[i],I_xy,I_xx)>=score(c,x_train,y_train,S[i],I_xy,I_xx):
                flag = 1
                for j in range(nbSol):
                    if j==i:
                        continue
                    if f not in S[j]:
                        flag = 0
                    for k in S[i]:
                        if k not in S[j]:
                            flag = 0
                if flag:
                    continue
                c = f
        S[i].append(c)
        i += 1
    l += 1
print(len(S),len(S[0]))


In [None]:
def prune(X,Si):
    xx = []
    for row in X:
        iter = 0
        xxtp = []
        for col in row:
            if iter in Si:
                xxtp.append(col)
            iter += 1
        xx.append(xxtp)
    return xx

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'bootstrap': bootstrap}
print(random_grid)

In [None]:
from sklearn.metrics import mean_squared_error
import statistics
from scipy import stats

def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    print(predictions)
    med_arr = [(yt-yp)*(yt-yp) for yt,yp in zip(predictions,test_labels)]
    return mean_squared_error(test_labels,predictions),statistics.median(med_arr),statistics.stdev(med_arr),stats.spearmanr(test_labels,predictions),predictions


In [None]:
def classif(y_train,y_pred):  
    y_train_v = []
    y_pred_v = []
    for yi in y_train:
        if (yi < 300):
            y_train_v.append(0)
        elif (yi >= 300 and yi < 450):
            y_train_v.append(1)
        else:
            y_train_v.append(2)

    for yi in y_pred:
        if (yi < 300):
            y_pred_v.append(0)
        elif (yi >= 300 and yi < 450):
            y_pred_v.append(1)
        else:
            y_pred_v.append(2)
    return y_train_v,y_pred_v

In [None]:
from sklearn.metrics import confusion_matrix

for Si in S:
    X_train_ = np.array(prune(x_train,Si))
    X_test_ = np.array(prune(x_test,Si))

    print(X_train_.shape)
    print(X_test_.shape)
    rf = RandomForestRegressor()
    # Random search of parameters, using 3 fold cross validation, 
    # search across 100 different combinations, and use all available cores
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=3, random_state=42, n_jobs = -1)
    # Fit the random search model
    print(X_train_.shape)
    rf_random.fit(X_train_, y_train)
    rf_random.best_params_
    best_grid = rf_random.best_estimator_
    meanSE, medianSE, stdSE, spearmanR, y_pred = evaluate(best_grid, X_train_, y_train)
    print(meanSE, medianSE, stdSE, spearmanR)
    y_pred2=best_grid.predict(X_test_)
    print(y_pred2)
    for hh in y_pred2:
        print(hh)
    y_train_v, y_pred_v = classif(y_train,y_pred)
    cm = confusion_matrix(y_train_v, y_pred_v)

    sns.set(font_scale=1.2) 
    sns.heatmap(cm, annot=True, annot_kws={"size": 12}) 
    plt.show()


In [None]:
# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import GridSearchCV
# from sklearn.feature_selection import RFE
# from sklearn.model_selection import cross_val_score
# from sklearn.metrics import accuracy_score

# # Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# # Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# max_depth.append(None)
# # Minimum number of samples required to split a node
# min_samples_split = [2, 5, 10]
# # Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2, 4]
# # Method of selecting samples for training each tree
# bootstrap = [True, False]
# # Create the random grid
# random_grid = {'n_estimators': n_estimators,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap}
# print(random_grid)

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune


In [None]:
rf_random.best_params_


In [None]:
# # Create the parameter grid based on the results of random search 
# param_grid = {
#     'min_samples_leaf': [2, 4, 6],
#     'min_samples_split': [2, 5, 6],
#     'max_depth': [30,20,10],
#     'n_estimators': [200, 300, 100],
#     'bootstrap': [True]
# }
# # Create a based model
# rf = RandomForestRegressor()
# # Instantiate the grid search model
# grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
#                           cv = 3, n_jobs = -1, verbose = 4)
# grid_search.fit(X_train_, y_train)

In [None]:
# grid_search.best_params_


In [None]:
# # grid_search.best_params_
# # best_grid = grid_search.best_estimator_
# # meanSE, medianSE, stdSE, spearmanR, y_pred = evaluate(best_grid, X_test, y_test)
# # print(meanSE, medianSE, stdSE, spearmanR)

# grid_search.best_params_
# best_grid = grid_search.best_estimator_
# meanSE, medianSE, stdSE, spearmanR, y_pred = evaluate(best_grid, X_train_, y_train)
# print(meanSE, medianSE, stdSE, spearmanR)

In [None]:
y_pred2=best_grid.predict(X_test_)
print(y_pred2)
for hh in y_pred2:
    print(hh)

In [6]:
       
y_test_v = []
y_pred_v = []
for yi in y_train:
    if (yi < 300):
        y_test_v.append(0)
    elif (yi >= 300 and yi < 450):
        y_test_v.append(1)
    else:
        y_test_v.append(2)

for yi in y_pred:
    if (yi < 300):
        y_pred_v.append(0)
    elif (yi >= 300 and yi < 450):
        y_pred_v.append(1)
    else:
        y_pred_v.append(2)
        

NameError: name 'y_train' is not defined

In [7]:
from sklearn.metrics import confusion_matrix


cm = confusion_matrix(y_test_v, y_pred_v)

sns.set(font_scale=1.2) 
sns.heatmap(cm, annot=True, annot_kws={"size": 12}) 

plt.show()

ValueError: zero-size array to reduction operation fmin which has no identity

In [36]:
from sklearn.metrics import classification_report

print(classification_report(y_test_v, y_pred_v))

              precision    recall  f1-score   support

           0       1.00      0.32      0.48        41
           1       0.42      0.73      0.53        30
           2       0.75      0.83      0.78        46

    accuracy                           0.62       117
   macro avg       0.72      0.63      0.60       117
weighted avg       0.75      0.62      0.61       117

