In [20]:
import sys
sys.path.append('../../../src')
import warnings
warnings.filterwarnings("ignore")


import os
import pandas as pd
import numpy as np
import torch
import pyreadr
import config
import Dataset
import time
import train_MLP
import argparse
import utils
import joblib

from Dataset import CNS
from torch.utils.data import DataLoader
# from Model import DNAMLP
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from utils import make_ndarray_from_csv, get_int_label, brier_score_tensor
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score , confusion_matrix
from torch.nn.functional import softmax, one_hot
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.utils.class_weight import compute_class_weight

In [21]:
import torch
import torch.nn as nn

class DNAMLP(nn.Module):
    def __init__ (self, in_features, n_classes):
        super(DNAMLP, self).__init__()
        self.in_features = in_features
        self.n_classes = n_classes
        self.densenet = nn.Sequential(
            nn.Linear(self.in_features, 8),
            nn.Dropout(p = 0.8),
            nn.Linear(8, self.n_classes)
        )
        self.mlp = nn.Sequential(
            nn.Linear(self.in_features, self.n_classes)
        )
    def forward (self, x):
        # x = self.densenet(x)
        x = self.mlp(x)
        return x

In [22]:
def train_epoch(epoch, model, train_loader, criterion, optimizer, device):
    correct = 0
    total = 0
    total_loss = 0
    total_bs = 0
    model.to(device)
    model.train()
    
    # For loop through all batches
    all_labels = []
    all_logits = []
    for features, labels in train_loader:
        # Move tensors to device
        features = features.to(device)
        labels = labels.to(device)
        
        # Zero out gradient
        optimizer.zero_grad()
        
        # Forward pass
        logits = model(features)
        loss = criterion(logits, labels)
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        # Evaluation
        total_loss += loss.item()
        _, predicted = logits.max(1)
        correct += predicted.eq(labels).sum().item()
        total += labels.size(0)
        
        # batch BS
        batch_bs = brier_score_tensor(logits, labels)
        total_bs += batch_bs
        
        # save logits and labels to calculate AUC
        for logit, label in zip(logits, labels):
            all_labels.append(label.item())
            all_logits.append(np.array(logit.detach().cpu().numpy()))
        
    # epoch's avrage LL
    train_loss = total_loss / len(train_loader)
    # epoch's average acc & ME
    train_acc = (correct / total) * 100.
    train_me = 100 - train_acc
    # epoch's average BS
    train_bs = total_bs/len(train_loader)
    all_probs = softmax(torch.Tensor(np.array(all_logits)), dim = 1)
    
    all_preds = [np.argmax(prob) for prob in all_probs]
    # epoch's AUC
    train_auc = roc_auc_score(all_labels, all_preds)
    train_precision = precision_score(all_labels, all_preds)
    train_recall = recall_score(all_labels, all_preds)
    train_cfs = confusion_matrix(all_labels, all_preds)
    
    return train_loss, train_acc, train_me, train_bs, train_auc, train_precision, train_recall, train_cfs

def val_epoch(epoch, model, val_loader, criterion, device):
    correct = 0
    total = 0
    total_loss = 0
    total_bs = 0
    model.to(device)
    # For loop through all batches
    with torch.no_grad():
        # For loop through all batches
        all_labels = []
        all_logits = []
        for features, labels in val_loader:
            # Move tensors to device
            features, labels = features.to(device), labels.to(device)
            
            # Forward pass
            logits = model(features)
            
            # Evaluation and batch loss
            loss = criterion(logits, labels)
            total_loss += loss.item()
            _, predicted = logits.max(1)
            correct += predicted.eq(labels).sum().item()
            total  += labels.size(0)
            
            # batch BS
            batch_bs = brier_score_tensor(logits, labels)
            total_bs += batch_bs
            
            # save logits and labels to calculate AUC
            for logit, label in zip(logits,labels):
                all_labels.append(label.item())
                all_logits.append(np.array(logit.detach().cpu().numpy()))
        
        # epoch's average LL
        val_loss = total_loss / len(val_loader)
        # epoch's average acc & ME
        val_acc = (correct / total) * 100
        val_me = (100 - val_acc)
        # epoch's average BS
        val_bs = total_bs/len(val_loader)

        all_probs = softmax(torch.Tensor(np.array(all_logits)), dim = 1)
        
        all_preds = [np.argmax(prob) for prob in all_probs]
        
        # epoch's AUC 
        val_auc = roc_auc_score(all_labels, all_preds)
        val_precision = precision_score(all_labels, all_preds)
        val_recall = recall_score(all_labels, all_preds)
        val_cfs = confusion_matrix(all_labels, all_preds)
         
    return val_loss, val_acc, val_me, val_bs, val_auc, val_precision, val_recall, val_cfs

def run(class_name, fold, train_loader, val_loader, model, criterion, optimizer, config, save):
    history = {'val_accs': [], 'val_losses': [], 'val_precisions': [], 'val_recalls': [], 'val_aucs': []}
    
    model.to(config['device'])
    n_epochs = config['mlp_n_epochs']
    BEST_STATES_DIR= config['MLP_BEST_STATES_DIR']
    BEST_STATE_PATH = os.path.join(BEST_STATES_DIR, class_name, f'{fold}.pth')
    diff_threshold = config['mlp_diff_threshold']
    max_patience = config['mlp_max_patience']
    patience = 0
    
    for epoch in range(1, n_epochs + 1):
        # print(f'Epoch {epoch}/{n_epochs} of fold {fold}')
        
        train_loss, train_acc, train_me, train_bs, train_auc, train_precision, train_recall, train_cfs = train_epoch(epoch, model, train_loader, criterion, optimizer, config['device'])
        val_loss, val_acc, val_me, val_bs, val_auc, val_precision, val_recall, val_cfs = val_epoch(epoch, model, val_loader, criterion, config['device'])
        
        # history['val_accs'].append(val_acc)
        history['val_losses'].append(val_loss)
        # history['val_precisions'].append(val_precision)
        # history['val_recalls'].append(val_recall)
        # history['val_aucs'].append(val_auc)
        
        # print(f'[{class_name.upper()}] - {fold} - {epoch}/{n_epochs}')
        # print('train_loss: %.5f | train_acc: %.3f | train_precision: %.3f | train_recall: %.3f | train_auc: %.3f' % (train_loss, train_acc, train_precision, train_bs, train_auc))
        # print('val_loss: %.5f | val_acc: %.3f | val_precision: %.3f | val_recall: %.3f | val_auc: %.3f' % (val_loss, val_acc, val_precision, val_recall, val_auc))
        
        if val_loss == min(history['val_losses']):
            # get best epoch's resutls
            best_epoch_results = {'train_loss: ': train_loss, 'train_accs': train_acc, 'train_me': train_me, 'train_bs': train_bs, 'train_precision': train_precision, 'train_recall': train_recall, 'train_auc': train_auc, 'train_cfs': train_cfs, 'val_loss: ': val_loss, 'val_accs': val_acc, 'val_me': val_me, 'val_bs': val_bs, 'val_precision': val_precision, 'val_recall': val_recall, 'val_auc': val_auc, 'val_cfs': val_cfs}
            if save.lower() == 'save':
                # print('Lowest validation loss => saving model weights...')
                torch.save(model.state_dict(), BEST_STATE_PATH)
        if len(history['val_losses']) > 1:
            if abs(history['val_losses'][-2] - val_loss) < diff_threshold or history['val_losses'][-2] < val_loss:
                patience = patience + 1
                # print(f'Patience increased to {patience}')
                if patience == max_patience:
                    # print('Early stopping.')
                    break
            else:
                patience = 0
        # print('---------------------------------------------')
    return best_epoch_results

In [23]:
print("Running mlp classifiers")
clf_cfg = config.classifier_config
device = clf_cfg['device']
save = 'no_save'
print(f'root: {config.root_dir}')
print(f"device: {device}")
print(f'save mode: {save}')

folds = utils.inner_folds
groups = utils.positive_groups

fold = folds[np.random.randint(len(folds))]
print(f'fold: {fold}')

Running mlp classifiers
root: D:\#Study\Thesis\Code\Brain_DNA\src\..
device: cpu
save mode: no_save
fold: 2.2


In [24]:
use_SMOTE_values = [True, False]
use_weights_values = [True, False]
selected_metrics = ['train_precision', 'train_recall', 'train_auc', 'train_cfs', 'val_precision', 'val_recall', 'val_auc', 'val_cfs']

In [25]:
print(f'root: {config.root_dir}')
print(f"device: {device}")
print(f'save mode: {save}')
print(f'fold: {fold}')
for group in groups:
    print (f'\n\n========{group.upper()}========')
    # Read from csv to dataframe
    features, labels = make_ndarray_from_csv(group, fold, mode = 'all')
    # Train test split
    train_features, val_features, train_labels, val_labels = train_test_split(features, labels, test_size=0.8, random_state=42)
    value_counts = pd.Series(train_labels).value_counts()
    print(f'{value_counts}')
    for use_SMOTE in use_SMOTE_values:
        for use_weights in use_weights_values:
            if use_weights == True:
                class_weights = torch.Tensor(compute_class_weight(class_weight='balanced', classes=np.unique(labels), y = labels)).to(device)
            else:
                class_weights = None
            new_train_features, new_train_labels = train_features, train_labels
            if use_SMOTE == True:
                smote = SMOTE(sampling_strategy = "auto", random_state = 42, k_neighbors = max(1, min(value_counts) - 1))
                new_train_features, new_train_labels = smote.fit_resample(train_features, train_labels)

            # Encode the labels
            new_train_labels_int = np.array(
                [get_int_label(label, group) for label in new_train_labels])
            val_labels_int = np.array(
                [get_int_label(label, group) for label in val_labels])

            # Create datasets and Dataloaders
            train_dataset = CNS(new_train_features, new_train_labels_int, mode='train')
            val_dataset = CNS(val_features, val_labels_int, mode='val')
            train_loader = DataLoader(
                train_dataset, batch_size=clf_cfg['mlp_train_batch_size'], shuffle=True)
            val_loader = DataLoader(
                val_dataset, batch_size=clf_cfg['mlp_val_batch_size'], shuffle=False)

            # Init model object
            in_features = clf_cfg['n_features']
            model = DNAMLP(in_features, clf_cfg['n_classes'])
            if clf_cfg['MLP_FIRST_TIME'] == False:
                # Load model based on fold
                BEST_STATE_PATH = os.path.join(
                    clf_cfg['MLP_BEST_STATES_DIR'], group, f'{fold}.pth')
                model.load_state_dict(torch.load(BEST_STATE_PATH))

            # Define training and validating hyperparams
            criterion = CrossEntropyLoss(weight=class_weights)
            optimizer = Adam(model.parameters(
            ), lr=clf_cfg['mlp_lr'], weight_decay=clf_cfg['mlp_weight_decay'])
            # print(f'Running in {save} mode')
            best_epoch_results = run(
                group, fold, train_loader, val_loader, model, criterion, optimizer, clf_cfg, save)
            print(f'--------\n-> SMOTE: {use_SMOTE} - WEIGHTS: {use_weights} | Best epoch: ')
            for key, value in best_epoch_results.items():
                if key in selected_metrics:
                    if 'cfs' in key:
                        print(f'\t{key}:\n{value}')
                    else:
                        print(f'\t{key}: {value}')

root: D:\#Study\Thesis\Code\Brain_DNA\src\..
device: cpu
save mode: no_save
fold: 2.2


Embryonal    99
Control      15
dtype: int64
--------
-> SMOTE: True - WEIGHTS: True | Best epoch: 
	train_precision: 1.0
	train_recall: 1.0
	train_auc: 1.0
	train_cfs:
[[99  0]
 [ 0 99]]
	val_precision: 1.0
	val_recall: 1.0
	val_auc: 1.0
	val_cfs:
[[383   0]
 [  0  77]]
--------
-> SMOTE: True - WEIGHTS: False | Best epoch: 
	train_precision: 1.0
	train_recall: 1.0
	train_auc: 1.0
	train_cfs:
[[99  0]
 [ 0 99]]
	val_precision: 0.9871794871794872
	val_recall: 1.0
	val_auc: 0.9986945169712793
	val_cfs:
[[382   1]
 [  0  77]]
--------
-> SMOTE: False - WEIGHTS: True | Best epoch: 
	train_precision: 1.0
	train_recall: 0.9333333333333333
	train_auc: 0.9666666666666667
	train_cfs:
[[99  0]
 [ 1 14]]
	val_precision: 1.0
	val_recall: 0.974025974025974
	val_auc: 0.987012987012987
	val_cfs:
[[383   0]
 [  2  75]]
--------
-> SMOTE: False - WEIGHTS: False | Best epoch: 
	train_precision: 1.0
	train_recall: 1.