In [1]:
##### base
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns
import os

# matplotlib
%matplotlib inline

# display
from IPython.display import display

# autoreload
%load_ext autoreload
%autoreload 2

# warnings
import warnings
warnings.filterwarnings('ignore')

# fix random seed
from numpy.random import seed as set_random_seed
set_random_seed(42)

# explainability
# import shap, lime #eli5
# shap.initjs()

# debug
# from icecream import ic
# debug = ic
import logging
logg = logging.info

In [2]:
import os, sys, re
import argparse
import time
import random
import logging

from dataclasses import dataclass

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from torch._C import device
from torch.utils.data import DataLoader
import torch

from modules.train import train
from modules.test import test
from modules.loader import CustomBamDataset2
import utils.utils as util
from utils.utils import Parms

In [3]:
parser = argparse.ArgumentParser('Train or test SomaticSerum model.')
parser.add_argument('training_bam_dir', type=str,
                    help='Train data bams directory')
parser.add_argument('--sample_split', required=False, type=str,
                    help='How to split the training data: True - by samples, False - by random on the entire dataset',
                    default = 'True')
parser.add_argument('--model', required=False, type=str, 
                    help='model', default='SimpleCnn')
parser.add_argument('--hidden_size', required=False, type=int,
                    help='The number of hidden units', default=64)
parser.add_argument('--sequence_length', required=False, type=int,
                    help='The length of the sequence', default=200)
parser.add_argument('--batch_size', required=False, type=int,
                    help='The size of each batch', default=512)
parser.add_argument('--learning_rate', required=False, type=float,
                    help='The learning rate value', default=0.00001)
parser.add_argument('--max_epoch', required=False, type=int,
                    help='The maximum epoch', default=100)
parser.add_argument('--lstm_layers', required=False, type=int,
                    help='Num of LSTM layers', default=10)
parser.add_argument('--dropout', required=False, type=float,
                    help='Dropout', default=0.5)
parser.add_argument('--num_workers', required=False, type=int,
                    help='Number of workers', default=1)
parser.add_argument('--out', required=False, type=str,
                    help='Output directory', default='output')
parser.add_argument('--test', required=False, type=str,
                    help='Test directory')
# args = parser.parse_args()

args = parser.parse_known_args()[0]
args.__dict__.update(dict(
#         training_bam_dir="../data/seqmerge/DLbams_rand",
        training_bam_dir='/data/alonwolf/projects/SomaticSerum/data/seqmerge/DLbams_rand',
        sample_split=True,
        model="CnnLinear",
        hidden_size=64,
        batch_size=2,
        learning_rate=0.00001,
        max_epoch=100,
        dropout=0.005,
#         out=MassiveLoop3,
    ))
parms = Parms(
    args
)

root        : INFO     Logger initialized


In [4]:
full_train_dataset  = CustomBamDataset2(parms.BAM_DIR, out = parms.OUT, whichSet = 'train')
train_size          = int(parms.TRAIN_VALIDATION_SPLIT * len(full_train_dataset))
val_size            = len(full_train_dataset) - train_size
train_dataset, valid_dataset = torch.utils.data.random_split(full_train_dataset, [train_size, val_size])


root        : INFO     Dataloading train


File ../results/output/data_train.pkl exists - using as train


root        : INFO     Number of Normal reads: 159534, Number of Somatic reads: 168989. Ratio: 0.944


In [5]:
train_dataloader = DataLoader(train_dataset, batch_size = parms.BATCH_SIZE, shuffle = True, num_workers = parms.NUM_WORKERS)
valid_dataloader = DataLoader(valid_dataset, batch_size = parms.BATCH_SIZE, shuffle = True, num_workers = parms.NUM_WORKERS)

In [6]:
# --- Model setup ----------------------------------------------------
logging.info('Setting up the model...')
nucleotide_model    = parms.return_model()
loss_fn             = torch.nn.CrossEntropyLoss()
model_params        = list(nucleotide_model.parameters())
optimizer           = torch.optim.AdamW(model_params, lr=parms.LEARNING_RATE, eps=1e-08, weight_decay=0.01)


root        : INFO     Setting up the model...


In [7]:
torch.backends.cudnn.enabled = False

In [None]:
# --- Training -------------------------------------------------------
logging.info('Setting up the model...')
if torch.cuda.is_available():
    nucleotide_model = nucleotide_model.cuda()
    device = 'cuda'
else:
    device = 'cpu'
logg(device)

if parms.TEST:
    # Skip training
    logging.info('Skipping train')
    metric = pd.read_csv('{}/metric.csv'.format(parms.TEST))
    name = '{}/{}'.format(parms.TEST, [f for f in os.listdir(parms.TEST) if f.endswith('.txt')][0].split('txt')[0])
    model_path = "{}/{}.pth".format(parms.TEST, type(nucleotide_model).__name__)
    metric_test = test(parms, model_path, test_dataloader, device, loss_fn)
else:
    logging.info('Training...')
    history, name, metric, model_path = train(model = nucleotide_model,
                                optimizer = optimizer,
                                loss_fn = loss_fn,
                                train_dl = train_dataloader,
                                val_dl = valid_dataloader,
                                epochs = parms.MAX_EPOCH,
                                device = device,
                                out = parms.OUT)
    # Test
    metric_test = test(parms, model_path, test_dataloader, device, loss_fn)

# --- Plotting -------------------------------------------------------
logging.info('Plotting...')
util.plot(name, metric, metric_test, parms.OUT)

# acc = history['acc']
# val_acc = history['val_acc']
# loss = history['loss']
# val_loss = history['val_loss']
# epochs = range(1, len(acc) + 1)

logging.info("Done!")

root        : INFO     Setting up the model...
root        : INFO     cuda
root        : INFO     Training...
root        : INFO     train() called: model=CnnLinear, opt=AdamW(lr=0.000010), epochs=100, dropout=0.005000 device=cuda
root        : INFO     Epoch   1/100, train loss: 0.6901, train acc: 0.5276, val loss: 0.6844, val acc: 0.5648
root        : INFO     Epoch   2/100, train loss: 0.6725, train acc: 0.5846, val loss: 0.6576, val acc: 0.6116
root        : INFO     Epoch   3/100, train loss: 0.6298, train acc: 0.6725, val loss: 0.5909, val acc: 0.7285
root        : INFO     Epoch   4/100, train loss: 0.5719, train acc: 0.7397, val loss: 0.5555, val acc: 0.7511
root        : INFO     Epoch   5/100, train loss: 0.5449, train acc: 0.7635, val loss: 0.5399, val acc: 0.7701
root        : INFO     Epoch   6/100, train loss: 0.5387, train acc: 0.7706, val loss: 0.5385, val acc: 0.7677
root        : INFO     Epoch   7/100, train loss: 0.5366, train acc: 0.7728, val loss: 0.5363, val acc:

In [8]:
# torch.backends.cudnn.enabled = False