In [1]:
# Enable IPython's autoreload extension so edited project modules are
# re-imported before each cell runs (mode 2 = reload all modules).
%load_ext autoreload
%autoreload 2
# NOTE(review): reloading the extension right after loading it looks redundant — confirm it is needed.
%reload_ext autoreload

In [2]:
import sys
sys.path.append('../../')

import os
import pandas as pd
import numpy as np
import config
import utils
import torch
import pickle
import train_impf_MLP

from Model import Impf_DNAMLP
from torch.nn.functional import softmax
from utils import impf_make_ndarray_from_csv, get_int_label
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from Dataset import CNS

In [3]:
# Print the `low_perf_groups` list exposed by the project `utils` module.
print(utils.low_perf_groups)

['Glio-neuronal', 'Other glioma']


In [4]:
# Fold selection. The outer fold id is derived from the inner one:
# the text before the first '.' followed by '.0' (e.g. '3.4' -> '3.0').
inner_fold = '3.4'
outer_fold = inner_fold.partition('.')[0] + '.0'

# Group and algorithm names; later cells use them as path components.
group, alg = 'Sella', 'MLP'

print(outer_fold)

3.0


In [5]:
# Pull the three configuration dictionaries out of the project `config` module.
dat_cfg, clf_cfg, impf_cfg = (
    config.data_config,
    config.classifier_config,
    config.impf_config,
)

TRAIN_CSV_DIR = dat_cfg['TRAIN_CSV_DIR']
TEST_CSV_DIR = dat_cfg['TEST_CSV_DIR']

# Both fold CSVs are looked up under the TEST directory of the selected group.
group_test_dir = os.path.join(TEST_CSV_DIR, group)
INNER_FOLD_CSV = os.path.join(group_test_dir, f'{inner_fold}_test.csv')
OUTER_FOLD_CSV = os.path.join(group_test_dir, f'{outer_fold}_test.csv')

# Device string used for the model and input tensors further down.
device = impf_cfg['device']
device

'cpu'

In [6]:
# Path of the pickled important-feature collection for this algorithm / group / outer fold.
impf_pkl_path = os.path.join(
    impf_cfg['IMPORTANT_FEATURES_DIR'], alg, group, f'{outer_fold}_combined.pkl'
)

# NOTE(review): pickle.load can execute arbitrary code — only open files produced by this project.
with open(impf_pkl_path, 'rb') as handle:
    impf = pickle.load(handle)

# Number of important features that were loaded.
print(len(impf))

95


In [7]:
# Display the loaded feature collection (the recorded output is a set of 'cg…' identifier strings).
impf

{'cg00074145',
 'cg00110654',
 'cg00257271',
 'cg00875805',
 'cg01663018',
 'cg01730970',
 'cg01990910',
 'cg02046552',
 'cg02053678',
 'cg02380531',
 'cg02846841',
 'cg03035213',
 'cg03126579',
 'cg03184290',
 'cg03287940',
 'cg03531247',
 'cg03673965',
 'cg04046669',
 'cg04138112',
 'cg04193970',
 'cg04224064',
 'cg04883026',
 'cg05021743',
 'cg05634637',
 'cg06952416',
 'cg07197230',
 'cg07377422',
 'cg07921371',
 'cg07924363',
 'cg07961887',
 'cg08409113',
 'cg08465307',
 'cg08490663',
 'cg09059945',
 'cg09140778',
 'cg09235583',
 'cg09599130',
 'cg09671951',
 'cg09803262',
 'cg09851951',
 'cg09954385',
 'cg10200388',
 'cg10585371',
 'cg11122009',
 'cg11667086',
 'cg11904906',
 'cg12177743',
 'cg12414557',
 'cg12749132',
 'cg13356117',
 'cg13711457',
 'cg14210607',
 'cg14786652',
 'cg14826683',
 'cg14839134',
 'cg14859874',
 'cg14866595',
 'cg15030449',
 'cg15316843',
 'cg16810605',
 'cg17170948',
 'cg17298239',
 'cg17521583',
 'cg17602481',
 'cg17898329',
 'cg18082337',
 'cg181831

In [8]:
# Read the inner- and outer-fold CSVs identically: the first column becomes the index.
df_inner, df_outer = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (INNER_FOLD_CSV, OUTER_FOLD_CSV)
)

# Quick look at the (rows, columns) of both frames.
df_inner.shape, df_outer.shape

((42, 10002), (59, 10002))

In [9]:
# Class balance of the outer fold, counted over the last column of the frame.
# Reuses the already-loaded `df_outer` instead of parsing OUTER_FOLD_CSV a
# second time; `.iloc[:, -1]` is already a Series, so no extra wrapping is needed.
df_outer.iloc[:, -1].value_counts()

Sella      34
Control    25
Name: class_group, dtype: int64

## Load random test sample and predict

In [10]:
# Pick one row of the outer-fold frame at random.
# A fixed seed makes "Restart & Run All" select the same row every time;
# change SEED to inspect a different sample.
SEED = 0
rng = np.random.default_rng(SEED)
rand = int(rng.integers(df_outer.shape[0]))  # plain Python int in [0, n_rows)

# Full row (all columns, including the trailing label columns) for inspection.
sample = df_outer.iloc[rand, :]
rand, sample

(16,
 cg22054918     0.003023
 cg16476975     0.015417
 cg25570913     0.052471
 cg03940848     0.218212
 cg02809746     0.010131
                  ...   
 cg22513691     0.885517
 cg00743629     0.887552
 cg13621317     0.571651
 label          CPH, ADM
 class_group       Sella
 Name: 9305651095_R06C02, Length: 10002, dtype: object)

In [11]:
# Keep only the important-feature columns, in sorted name order, then take row `rand`.
# Columns are selected before the row so the result stays numeric (float64 in the
# recorded output) instead of inheriting the mixed dtype of the full row.
# NOTE(review): presumably the sorted order matches the order used at training time — confirm.
impf_columns = sorted(impf)
sample = df_outer[impf_columns].iloc[rand]

# Integer label obtained from the row's last column via the project helper.
label = get_int_label(df_outer.iloc[rand, -1], group)

sample, sample.shape, label

(cg00074145    0.073633
 cg00110654    0.832168
 cg00257271    0.143880
 cg00875805    0.630745
 cg01663018    0.017201
                 ...   
 cg26147845    0.037444
 cg26282566    0.101960
 cg26718878    0.958998
 cg26855801    0.367435
 cg27025079    0.167244
 Name: 9305651095_R06C02, Length: 95, dtype: float64,
 (95,),
 0)

In [12]:
# Convert the selected row to a NumPy array and prepend an axis of length 1,
# giving a 2-D array with a single row.
sample_features = np.array(sample)[np.newaxis, :]
sample_features

array([[0.07363336, 0.83216783, 0.14387974, 0.63074546, 0.01720063,
        0.87658444, 0.98564828, 0.93190735, 0.06974955, 0.01777707,
        0.1530839 , 0.06359347, 0.55286491, 0.00652664, 0.51600815,
        0.09093915, 0.13349749, 0.81183473, 0.02090292, 0.75543074,
        0.06429633, 0.07984135, 0.89097782, 0.51151387, 0.31831456,
        0.09743342, 0.93598179, 0.04297773, 0.58911355, 0.04677368,
        0.92447153, 0.77980952, 0.02185792, 0.378664  , 0.0106032 ,
        0.01094969, 0.13618722, 0.50108125, 0.84347241, 0.86602984,
        0.03011065, 0.06064844, 0.64188756, 0.19855645, 0.03433018,
        0.78489913, 0.01469992, 0.30805432, 0.98878085, 0.02231413,
        0.0241389 , 0.12427287, 0.98016599, 0.39761088, 0.9345365 ,
        0.45315586, 0.05690367, 0.39417832, 0.07335237, 0.04957181,
        0.79270533, 0.05151639, 0.03044204, 0.84798813, 0.05680191,
        0.05325011, 0.31372689, 0.98036865, 0.01680372, 0.96013527,
        0.02844359, 0.06346109, 0.05894591, 0.09

In [13]:
# Checkpoint file under MLP_BEST_STATES_DIR for this algorithm / group / inner fold.
IMPF_MLP_STATE_FILE = os.path.join(impf_cfg['MLP_BEST_STATES_DIR'], alg, group, f'{inner_fold}.pth')

n_classes = impf_cfg['n_classes']
in_features = len(impf)  # one model input per important feature

model = Impf_DNAMLP(in_features, n_classes)

# Map the stored tensors straight onto the configured device. This replaces the
# CUDA-availability branch, which ignored `device`, and also works for a
# checkpoint saved on a GPU when running on a CPU-only machine.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
state_dict = torch.load(IMPF_MLP_STATE_FILE, map_location=device)
model.load_state_dict(state_dict)

# The notebook only runs inference, so switch to eval mode (affects layers such as
# dropout / batch-norm if the architecture has any). `.eval()` returns the module,
# so the cell still displays the model summary.
model.to(device).eval()

Impf_DNAMLP(
  (densenet): Sequential(
    (0): Linear(in_features=95, out_features=2, bias=True)
  )
)

In [14]:
# Sanity check: a single row with one column per important feature.
sample_features.shape

(1, 95)

In [15]:
# Inference only: make sure the model is in eval mode and skip autograd
# bookkeeping, so no computation graph is built for this forward pass.
model.eval()
with torch.no_grad():
    sample_tensor = torch.Tensor(sample_features).to(device)
    # Softmax over dim 1 turns the per-class logits into probabilities.
    probs = softmax(model(sample_tensor), dim=1)
probs

tensor([[0.8321, 0.1679]], grad_fn=<SoftmaxBackward0>)

In [16]:
# epoch = 1
# test_features, test_labels = impf_make_ndarray_from_csv(group, outer_fold, impf, mode = 'test')
# test_labels_int = np.array([get_int_label(label, group) for label in test_labels])
# test_dataset = CNS(test_features, test_labels_int, mode = 'val')
# test_loader = DataLoader(test_dataset, batch_size = impf_cfg['mlp_val_batch_size'], shuffle = False)
# criterion = CrossEntropyLoss()
# optimizer = Adam(model.parameters(), lr = impf_cfg['mlp_lr'], weight_decay = impf_cfg['mlp_weight_decay'])
# res = train_impf_MLP.impf_val_epoch(epoch, model, test_loader, criterion, impf_cfg['device'])
# test_loss, test_acc, test_me, test_bs, test_auc, test_f1, test_precision, test_recall, test_cfs = res

In [17]:
# res

In [18]:
# train_features, train_labels = impf_make_ndarray_from_csv(group, inner_fold, impf, mode = 'train')
# val_features, val_labels = impf_make_ndarray_from_csv(group, inner_fold, impf, mode = 'test')
# test_features, test_labels = impf_make_ndarray_from_csv(group, outer_fold, impf, mode = 'test')

# train_labels_int = np.array([get_int_label(label, group) for label in train_labels])
# val_labels_int = np.array([get_int_label(label, group) for label in val_labels])
# test_labels_int = np.array([get_int_label(label, group) for label in test_labels])

# train_dataset = CNS(train_features, train_labels_int, mode = 'train')
# val_dataset = CNS(val_features, val_labels_int, mode = 'val')
# test_dataset = CNS(test_features, test_labels_int, mode = 'val')
# train_loader = DataLoader(train_dataset, batch_size = impf_cfg['mlp_train_batch_size'], shuffle = True)
# val_loader = DataLoader(val_dataset, batch_size = impf_cfg['mlp_val_batch_size'], shuffle = False)
# test_loader = DataLoader(test_dataset, batch_size = impf_cfg['mlp_val_batch_size'], shuffle = False)

# # Init model object
# in_features = len(impf)
# if group in utils.low_perf_groups:
#     model = Impf_GlioMLP(in_features, impf_cfg['n_classes'])
# else:
#     model = Impf_DNAMLP(in_features, impf_cfg['n_classes'])

In [19]:
# %%capture
# all_res = {}
# for fold in utils.inner_folds:
#     res = train_impf_MLP.impf_run(group, alg, fold, train_loader, val_loader, test_loader, model, criterion, optimizer, impf_cfg, 'no_save')
#     all_res[f'{fold}'] = res

In [20]:
# all_res