# Generate and launch ten times repeated 10-fold cross-validation on multiple GPUs

## Imports

In [2]:
from transformers import AutoModel, AutoTokenizer
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

import torch
import torch.nn as nn

import wandb

from tqdm.notebook import tqdm
import time
from collections import Counter
import copy
import random
import pandas as pd
import numpy as np
import os

from development_utils.preprocessing.Get_data_for_model import *
from development_utils.training.Build_data_for_pytorch import *
from development_utils.training.Build_model_pytorch import *
from development_utils.training.PerformanceCalculations import *

In [None]:
# Pick the compute device once for the whole notebook: the first CUDA device
# when CUDA is available, otherwise the CPU.
# NOTE(review): the CPU fallback is the plain string "cpu", not a torch.device.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = "cpu"

# Report the selected device (GPUinfo is not defined in this notebook;
# presumably it comes from the development_utils star imports — confirm).
GPUinfo(device)

GPUs on node: NVIDIA A100-SXM4-40GB
Number of GPUs available: 1
Using cuda:0 device
42.35 Gb free on CUDA


## wandb configuration

In [None]:
# Weights & Biases identifiers for this experiment.
ENTITYNAME = "ecotoxformer"          # wandb entity
PROJECTNAME = "100Fold_CV_RDKit"     # wandb project
# NOTE(review): the agent launch at the end of this notebook uses sweep id
# '6fscgxpi', not this value — confirm which one is current.
SWEEPID = "b9s38isu"

In [None]:
# Authenticate this session with Weights & Biases before any run or sweep
# agent is started.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mstyrbjornkall[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Define utility functions

In [6]:
def GetData(data, config):
    """Run the preprocessing pipeline on ``data`` as configured by ``config``.

    The dataframe is wrapped in ``PreprocessData``, filtered, and then the
    processor's CID / metadata / SMILES / one-hot steps are invoked in a
    fixed order (the processor is stateful, so the order is kept as is).

    Args:
        data: dataframe handed to ``PreprocessData``.
        config: object exposing ``conc_thresh``, ``endpoints``, ``effects``,
            ``species_groups`` and ``concentration_sign``.

    Returns:
        Tuple ``(dataframe, fc1)`` where ``dataframe`` is the processor's
        resulting dataframe and ``fc1`` is the length of the first row's
        ``OneHotEnc_concatenated`` entry.
    """
    prep = PreprocessData(dataframe=data)

    # Collect the filter settings first, then apply them in one call.
    filter_settings = dict(
        concentration_thresh=config.conc_thresh,
        endpoint=config.endpoints,
        effect=config.effects,
        species_groups=config.species_groups,
        log_data=True,
        concentration_sign=config.concentration_sign,
    )
    prep.FilterData(**filter_settings)

    # Identifier / metadata lookups.
    prep.GetPubchemCID()
    prep.GetMetadata(['cmpdname'])
    prep.GetCanonicalSMILES()

    # One-hot encodings, concatenated into a single column at the end.
    prep.GetOneHotEndpoint(config.endpoints)
    prep.GetOneHotEffect(config.effects)
    prep.ConcatenateOneHotEnc()

    processed = prep.dataframe
    first_encoding = processed.OneHotEnc_concatenated.iloc[0]

    return processed, len(first_encoding)

In [None]:
def SetSeed(seed):
    """Seed PyTorch (CPU and all CUDA devices), ``random`` and NumPy with ``seed``.

    Also switches cuDNN to deterministic mode.
    """
    seeders = (
        torch.manual_seed,           # pytorch random seed
        torch.cuda.manual_seed_all,  # every CUDA device
        random.seed,                 # Python's built-in generator
        np.random.seed,              # NumPy's global generator
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True

In [9]:
def GetLayers(config):
    """Return the hidden-layer sizes selected by ``config.n_hidden_layers``.

    For a depth of 1 to 4 the result is ``[config.layer_1, ..., config.layer_<depth>]``;
    only the attributes up to the requested depth are read. Any other depth
    yields ``None``.
    """
    size_attrs = ('layer_1', 'layer_2', 'layer_3', 'layer_4')

    for depth in range(1, len(size_attrs) + 1):
        if config.n_hidden_layers == depth:
            return [getattr(config, attr) for attr in size_attrs[:depth]]

    # Unsupported depth: same implicit result as falling off the end.
    return None

### Training

In [33]:
def RunTrainingEpochs(data, folds, fold_id, config, fc1, global_step):
    """Train and validate one cross-validation fold and track its best epoch.

    Builds the tokenizer, base transformer, data loaders, DNN head, optimizer
    and LR schedule from ``config``; evaluates once before training (logged
    with epoch ``-1`` and ``global_step - 1``); then runs ``config.epochs``
    rounds of ``train`` followed by ``evaluate``. The "best" epoch is the one
    with the lowest normalised median validation loss.

    Args:
        data: dataframe forwarded to ``BuildDataLoader_KFold`` as ``df``.
        folds: fold assignment forwarded to ``BuildDataLoader_KFold``.
        fold_id: which fold to hold out; forwarded to ``BuildDataLoader_KFold``.
        config: run configuration (reads ``base_model``, ``inputs``, ``label``,
            ``batch_size``, ``max_token_length``, ``seed``,
            ``sampling_procedure``, ``n_hidden_layers``, ``dropout``,
            ``n_frozen_layers``, ``freeze_embedding``, ``reinit_n_layers``,
            ``lr``, ``epochs`` and ``loss_fun``, plus the ``layer_*`` sizes
            via ``GetLayers``).
        fc1: passed to ``DNN_module`` as ``one_hot_enc_len``.
        global_step: logging step counter; incremented once per epoch.

    Returns:
        Tuple ``(best_val_loss, best_val_loss_norm,
        best_validation_mean_norm_loss, global_step, best_validation_results)``.
        If no epoch ever improves on the initial ``inf`` normalised loss
        (zero epochs, or NaN losses), the validation table and normalised
        mean loss are those of the pre-training evaluation.
    """
    chemberta = AutoModel.from_pretrained(config.base_model)
    tokenizer = AutoTokenizer.from_pretrained(config.base_model)

    DataLoaders = BuildDataLoader_KFold(
                                    df = data,
                                    folds = folds,
                                    fold_id=fold_id,
                                    variables = config.inputs,
                                    label = config.label,
                                    batch_size = config.batch_size,
                                    max_length = config.max_token_length,
                                    seed = config.seed,
                                    tokenizer = tokenizer)

    train_dataloader = DataLoaders.BuildTrainingLoader(sampler_choice=config.sampling_procedure, num_workers=2, weight_args=['SMILES','COMBINED_effect','COMBINED_endpoint'])
    val_dataloader = DataLoaders.BuildValidationLoader(sampler_choice='SequentialSampler', num_workers=2)
    print('Successfully built dataloader')
    # Report how many distinct SMILES strings occur in both splits.
    print(f'SMILES overlap train/validation: {len(set(DataLoaders.train.SMILES.tolist())&set(DataLoaders.val.SMILES.tolist()))}')

    wandb.log({"Training df": wandb.Table(dataframe=DataLoaders.train)})

    ######## MODEL ########################################################
    dnn_module = DNN_module(
                        one_hot_enc_len=fc1,
                        n_hidden_layers=config.n_hidden_layers,
                        layer_sizes=GetLayers(config),
                        dropout=config.dropout,
                        activation='ReLU')

    model = fishbAIT(roberta=chemberta, dnn=dnn_module)

    model = Modify_architecture(model).FreezeModel(model, config.n_frozen_layers, config.freeze_embedding)
    model = Modify_architecture(model).ReinitializeEncoderLayers(model, reinit_n_layers=config.reinit_n_layers)
    model = model.to(device)

    ######## TRAINING CONFIG ##############################################
    model_parameters = Modify_architecture(model).LLRD(model, init_lr = config.lr)

    optimizer = torch.optim.AdamW(model_parameters, lr=config.lr)
    # Linear schedule: warm up over the first 10% of all optimisation steps.
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.1*config.epochs*len(train_dataloader), num_training_steps=config.epochs*len(train_dataloader))
    print('Successfully built optimizer')

    # Anything other than 'MSELoss' falls back to L1.
    if config.loss_fun == 'MSELoss':
        loss_fun = nn.MSELoss()
    else:
        loss_fun = nn.L1Loss()

    best_val_loss = np.inf
    best_val_loss_norm = np.inf

    # [training batches seen, validation batches seen]; updated by train/evaluate.
    batch_num = [0,0]

    ######## RUN TRAINING #################################################

    # Baseline evaluation of the model before any training step.
    avg_loss, avg_loss_norm, median_loss, median_loss_norm, _, batch_num, val_results = evaluate(model, val_dataloader, DataLoaders.val, loss_fun, batch_num, -1, global_step-1)

    if median_loss < best_val_loss:
        best_val_loss = median_loss

    # Bind the "best" table and normalised mean loss from the baseline so they
    # are always defined. Previously they were only assigned inside the
    # improvement branch below, so zero epochs or a NaN normalised loss raised
    # UnboundLocalError at the first wandb.log / the return statement.
    # best_val_loss_norm stays at inf, so epoch selection is unchanged.
    best_validation_results = val_results
    best_validation_mean_norm_loss = avg_loss_norm

    print("\nRunning epochs...")
    for epoch in tqdm(range(config.epochs)):

        avg_loss, median_loss, total_preds, total_labels, batch_num = train(config, model, train_dataloader, optimizer, scheduler, loss_fun, batch_num, epoch, global_step)

        avg_loss, avg_loss_norm, median_loss, median_loss_norm, _, batch_num, val_results = evaluate(model, val_dataloader, DataLoaders.val, loss_fun, batch_num, epoch, global_step)

        # Model selection is driven by the normalised median loss only.
        if median_loss_norm < best_val_loss_norm:
            best_val_loss = median_loss
            best_val_loss_norm = median_loss_norm
            best_validation_results = val_results
            best_validation_mean_norm_loss = avg_loss_norm

        wandb.log({'Best Validation Median Loss': best_val_loss,
                    'Best Validation Median Loss Normalized': best_val_loss_norm,
                    'Best Validation Mean Loss Normalized': best_validation_mean_norm_loss,
                    'global_step': global_step})

        global_step += 1

    wandb.log({"Best Validation Results": wandb.Table(dataframe=best_validation_results)})

    # Drop local references to the large objects before returning.
    del model
    del optimizer
    del loss_fun
    del chemberta
    del tokenizer

    return best_val_loss, best_val_loss_norm, best_validation_mean_norm_loss, global_step, best_validation_results

In [12]:
# function to train the model
def train(args, model, dataloader, optimizer, scheduler, loss_fun, batch_num, epoch, global_step):
    """Run one training epoch over ``dataloader`` and log metrics to wandb.

    Each batch is a mapping whose values are, in order, token ids, attention
    mask, duration, one-hot encoding and labels. Every step zeroes the
    gradients, runs the forward and backward pass, clips the gradient norm to
    1.0 and advances both optimizer and scheduler. ``batch_num[0]`` is
    incremented in place once per batch. ``args`` is accepted but not used.

    Returns:
        Tuple ``(avg_loss, median_loss, total_preds, total_labels, batch_num)``:
        mean of the per-batch loss values, median absolute error over the
        epoch, the concatenated predictions and labels as NumPy arrays, and
        the (mutated) batch counter list.
    """
    model.train()

    print("\nTraining...")
    running_loss = 0
    pred_chunks = []
    label_chunks = []

    for batch in tqdm(dataloader):
        # Move every tensor of the batch to the module-level device.
        sent_id, mask, duration, onehot, labels = [t.to(device) for t in batch.values()]

        optimizer.zero_grad()

        preds = model(sent_id, mask, duration, onehot)
        loss = loss_fun(preds, labels)
        batch_loss = loss.item()
        running_loss += batch_loss

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        # Keep detached CPU copies for the epoch-level metrics.
        pred_chunks.append(preds.detach().cpu().numpy())
        label_chunks.append(labels.detach().cpu().numpy())

        # The learning rate is read after scheduler.step().
        wandb.log({
            "Training Batch Loss": batch_loss,
            "Learning Rate": optimizer.param_groups[0]["lr"],
            'training batch': batch_num[0]
        })
        batch_num[0] += 1

    # compute the training loss of the epoch
    avg_loss = running_loss / len(dataloader)
    total_preds = np.concatenate(pred_chunks, axis=0)
    total_labels = np.concatenate(label_chunks, axis=0)
    abs_error = abs(total_preds - total_labels)
    median_loss = np.median(abs_error)

    epoch_metrics = {
        "Training Loss function": avg_loss,
        "Training Mean Loss": np.mean(abs_error),
        'training epoch': epoch,
        "Training Median Loss": median_loss,
        "Training RMSE Loss": np.sqrt(np.mean((total_labels-total_preds)**2)),
        'global_step': global_step}
    wandb.log(epoch_metrics)

    return avg_loss, median_loss, total_preds, total_labels, batch_num

In [13]:
# function for evaluating the model
def evaluate(model, dataloader, dataset, loss_fun, batch_num, epoch, global_step):
    """Evaluate ``model`` on ``dataloader`` and log validation metrics to wandb.

    Predictions are computed without gradients and under
    ``torch.cuda.amp.autocast()``. A copy of ``dataset`` is extended with
    ``labels``, ``preds``, ``residuals`` and ``L1Error`` columns, and the
    normalised metrics are derived from ``CalculateWeightedAverage`` applied
    to that copy. ``batch_num[1]`` is incremented in place once per batch.

    Returns:
        Tuple ``(avg_loss, avg_loss_norm, median_loss, median_loss_norm,
        total_preds, batch_num, val_results)``.
    """
    from tqdm.notebook import tqdm

    print("\nEvaluating...")
    model.eval()
    pred_chunks = []
    label_chunks = []
    running_loss = 0

    val_results = dataset.copy()
    for batch in tqdm(dataloader):
        # Batch values arrive as: token ids, mask, duration, one-hot, labels.
        sent_id, mask, duration, onehot, labels = [t.to(device) for t in batch.values()]

        with torch.no_grad():
            # Forward pass and loss run under autocast; bookkeeping does not.
            with torch.cuda.amp.autocast():
                preds = model(sent_id, mask, duration, onehot)
                loss = loss_fun(preds, labels)
            running_loss += loss.item()

            pred_chunks.append(preds.detach().cpu().numpy())
            label_chunks.append(labels.detach().cpu().numpy())
        batch_num[1] += 1

    # compute the validation loss of the epoch
    avg_loss = running_loss/len(dataloader)
    total_preds = np.concatenate(pred_chunks, axis=0)
    total_labels = np.concatenate(label_chunks, axis=0)

    # Per-sample results table (row order assumed to match the loader order —
    # TODO confirm the loader is sequential for every caller).
    val_results['labels'] = total_labels
    val_results['preds'] = total_preds
    val_results['residuals'] = val_results['labels'] - val_results['preds']
    val_results['L1Error'] = abs(total_labels - total_preds)
    median_loss = val_results['L1Error'].median()

    val_results_normalized = CalculateWeightedAverage(val_results)
    abs_norm_residuals = abs(val_results_normalized.residuals)
    median_loss_norm = abs_norm_residuals.median()
    avg_loss_norm = abs_norm_residuals.mean()

    validation_metrics = {
        "Validation Loss function": avg_loss,
        "Validation Mean Loss": val_results['L1Error'].mean(),
        "Validation Median Loss": median_loss,
        "Validation Loss Normalized": median_loss_norm,
        "Validation Mean Loss Normalized": avg_loss_norm,
        "Validation RMSE Loss Normalized": np.sqrt(((val_results_normalized.labels - val_results_normalized.preds)**2).mean()),
        'validation epoch': epoch,
        'global_step': global_step
        }
    wandb.log(validation_metrics)

    return avg_loss, avg_loss_norm, median_loss, median_loss_norm, total_preds, batch_num, val_results

## Define Trainer function

In [34]:
def trainer(config=None):
    """Run one wandb run: load the data, build the folds and train one fold.

    Intended to be passed to ``wandb.agent``; in that case the sweep
    controller supplies the run configuration, which is read back through
    ``wandb.config`` after ``wandb.init``.

    Args:
        config: optional configuration forwarded to ``wandb.init``.

    Returns:
        None. All results are logged to wandb by ``RunTrainingEpochs``.
    """
    # Set random seeds and deterministic pytorch for reproducibility
    # NOTE(review): the global RNG seed is fixed at 42, while the fold split
    # and data loaders use sweepconfig.seed — confirm this is intended.
    SetSeed(42)

    # Initialize a new wandb run
    with wandb.init(config=config):

        # If called by wandb.agent, as below, this config will be set by Sweep Controller
        sweepconfig = wandb.config

        datadir = '../data/'

        ######## DATA #####################################################
        data = pd.read_csv(datadir+'MSc_Thesis_Cleaned.csv', encoding='windows-1252')
        data, fc1 = GetData(data, sweepconfig)
        print('Successfully loaded data')

        # Folds are built on the first configured input column.
        folds = Make_KFolds().Split(data[sweepconfig.inputs[0]], k_folds=sweepconfig.k_folds, seed=sweepconfig.seed)
        print('Successfully built folds')

        print(f'\n Running fold {sweepconfig.fold_id} using seed {sweepconfig.seed}')

        # The step counter starts at 0 for every run. The returned metrics are
        # not needed here because RunTrainingEpochs logs them to wandb itself.
        RunTrainingEpochs(data, folds, sweepconfig.fold_id, sweepconfig, fc1, 0)

## Train the model

In [35]:
# Start a sweep agent that runs `trainer` for the given sweep.
# NOTE(review): this sweep id differs from SWEEPID defined in the wandb
# configuration cell — confirm which one is current.
sweep_id = '6fscgxpi'
# Build the sweep path from the shared entity/project constants instead of
# repeating their literal values here.
wandb.agent(f'{ENTITYNAME}/{PROJECTNAME}/{sweep_id}', trainer)

[34m[1mwandb[0m: Agent Starting Run: o3hvtaok with config:
[34m[1mwandb[0m: 	architecture: ChemBERTa+DNN
[34m[1mwandb[0m: 	base_model: ['seyonec/PubChem10M_SMILES_BPE_450k']
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	conc_thresh: 500
[34m[1mwandb[0m: 	concentration_sign: =
[34m[1mwandb[0m: 	dataset: large
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	effects: ['MOR', 'DVP', 'POP', 'MPH', 'ITX', 'REP', 'GRO']
[34m[1mwandb[0m: 	endpoints: ['EC50', 'EC10', 'NOEC']
[34m[1mwandb[0m: 	epochs: 40
[34m[1mwandb[0m: 	fold_id: 1
[34m[1mwandb[0m: 	freeze_embedding: False
[34m[1mwandb[0m: 	inputs: ['SMILES', 'COMBINED_Duration_Value', 'OneHotEnc_concatenated']
[34m[1mwandb[0m: 	k_folds: 10
[34m[1mwandb[0m: 	label: COMBINED_mgperL
[34m[1mwandb[0m: 	layer_1: 700
[34m[1mwandb[0m: 	layer_2: 480
[34m[1mwandb[0m: 	layer_3: 300
[34m[1mwandb[0m: 	loss_fun: L1Loss
[34m[1mwandb[0m: 	lr: 0.0003
[34m[1mwandb[0m: 	max_token_length: 10

Dropped 221 entries from dataframe due to SMILES not having CID
Renamed EC10 *NOEC* in 128 positions
Did not return onehotencoding for Species groups. Why? You specified only one species group.
Did not return onehotencoding for Species classes. Why? No specified classes.
Successfully loaded data
Successfully built folds

 Running fold 1 using seed 41


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
Some weights of the model checkpoint at seyonec/PubChem10M_SMILES_BPE_450k were not used when initializing RobertaModel: ['lm_head.decoder.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Built training dataloader with 61520 samples
Built validation dataloader with 8747 samples
Successfully built dataloader
SMILES overlap train/validation: 7
Successfully built optimizer

Evaluating...


  0%|          | 0/18 [00:00<?, ?it/s]


Running epochs...


  0%|          | 0/40 [00:00<?, ?it/s]


Training...


  0%|          | 0/121 [00:00<?, ?it/s]

[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.
