In [1]:
# Mount Google Drive inside the Colab VM at /content/gdrive so that the
# project directory stored on Drive can be used as the working directory.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
%cd gdrive/MyDrive/TransferLearningToolkit/

/content/gdrive/MyDrive/TransferLearningToolkit


In [3]:
!pip install transformers
!pip install torchmetrics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 27.6 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 55.5 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 73.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 tokenizers-0.13.2 transformers-4.24.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchmetrics
  Downloading torchmetrics-0.10.3-py3-none-any.whl (529 kB)
[K     |████████████████████████████████| 529 

In [8]:
import torch
from torch.utils.tensorboard import SummaryWriter
import argparse
import yaml
import os
import time, datetime
from tqdm import tqdm
import transformers
import random
import sys
from lib.evaluate import evaluate
from lib.select_model import select_model
from lib.utils import *
from parameter_efficient.parallel_adapter import Model_with_parallel_adapter
from fine_tuning.ULMFiT import Model_with_ULMFiT



class _NullWriter:
    """No-op stand-in for ``SummaryWriter`` used when ``write_logs`` is False."""

    def add_scalar(self, *args, **kwargs):
        """Discard the scalar instead of logging it."""

    def close(self):
        """Nothing to release."""


def train(
    model_obj = None,
    train_loader = None,
    val_loader = None,
    actual_batch_size = 15,
    epochs = 20,
    base_lr = 0.001,
    weight_decay = 0.001,
    scheduler = None,
    warmup_steps = 5000,
    model_save_name = None,
    save_model_freq = 300,
    val_freq = 100,
    dropout = 0.1,
    write_logs = True,
    logs_folder = 'runs',
    delete_logs = True,
    load_existing_model = True,
    ):
    """Train a model with gradient accumulation, periodic checkpoints and validation.

    Every item yielded by ``train_loader`` is run through the model on its own
    and its gradients are accumulated; the optimizer and scheduler step once
    every ``actual_batch_size`` items.

    Args:
        model_obj: Object exposing ``technique``. If it has a ``model``
            attribute that attribute is trained, otherwise ``model_obj`` itself
            is used as the model.
        train_loader: Iterable of training batches (each must support ``.to``).
        val_loader: Loader handed to ``validate`` every ``val_freq`` items.
        actual_batch_size: Number of items accumulated per optimizer step.
        epochs: Number of passes over ``train_loader``.
        base_lr: Learning rate forwarded to ``get_optimizer``.
        weight_decay: Weight decay forwarded to ``get_optimizer``.
        scheduler: Scheduler selector forwarded to ``select_scheduler``.
        warmup_steps: Warm-up steps forwarded to ``select_scheduler``.
        model_save_name: Checkpoint file path (loaded from and saved to).
        save_model_freq: Save a checkpoint every this many items.
        val_freq: Run validation every this many items.
        dropout: Accepted for interface compatibility; not used here.
        write_logs: Log scalars to TensorBoard when True.
        logs_folder: TensorBoard log directory.
        delete_logs: Remove every file under ``logs_folder`` before training.
        load_existing_model: Resume from ``model_save_name`` when that file exists.

    Raises:
        Exception: If ``model_obj``, a loader or ``model_save_name`` is missing.
    """
    if model_obj is None:
        raise Exception("Model object was not provided.")
    if train_loader is None or val_loader is None:
        raise Exception("train_loader or val_loader were not provided.")
    if model_save_name is None:
        raise Exception("Path to directory for saving model was not provided.")

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Decide once whether model_obj wraps the model, so that a bare model is
    # never assigned as an attribute of itself further down.
    wraps_model = hasattr(model_obj, 'model')
    model = model_obj.model if wraps_model else model_obj

    # Old logs must be removed BEFORE the writer is created; otherwise the
    # event file the writer has just opened is deleted as well.
    if delete_logs:
        for root, dirs, files in os.walk(logs_folder):
            for file in files:
                os.remove(os.path.join(root, file))
        print("Logs deleted")

    # A no-op writer keeps the calls below (and validate) unconditional.
    writer = SummaryWriter(logs_folder) if write_logs else _NullWriter()

    try:
        if load_existing_model:
            if os.path.isfile(model_save_name):
                # map_location lets a checkpoint saved on GPU load on a CPU-only host.
                state_dict = torch.load(model_save_name, map_location=device)
                model.load_state_dict(state_dict)
                print("Existing model loaded")
            else:
                print(f"No checkpoint found at {model_save_name}; starting from the current weights")

        # Make sure the checkpoint directory exists before the first save.
        save_dir = os.path.dirname(model_save_name)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)

        model = model.to(device)

        optimizer = get_optimizer(model, model_obj, base_lr, weight_decay)
        scheduler = select_scheduler(scheduler, model_obj.technique, optimizer, len(train_loader), epochs, actual_batch_size, warmup_steps)

        model.train()
        if wraps_model:
            model_obj.model = model

        seq_count = 0
        start = time.time()

        for epoch in range(epochs):

            print(f"EPOCH {epoch} started" + '=' * 30)

            for train_batch in train_loader:

                tokens = train_batch.to(device)
                tokens = process_tokens(tokens, device, model_obj.technique)
                outputs = model(tokens, labels=tokens)
                loss = outputs[0]
                # NOTE(review): the loss is not divided by actual_batch_size, so
                # accumulated gradients are summed rather than averaged -- confirm
                # this is intended.
                loss.backward()
                seq_count += 1

                loss_value = loss.item()
                print('[%d, %5d] train loss: %.5f' % (epoch + 1, seq_count, loss_value))
                writer.add_scalar("train_loss", loss_value, seq_count)

                # Resorting to this approach as processing input sequences in
                # parallel would often cause 'CUDA out of memory'
                if seq_count % actual_batch_size == 0:
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    model.zero_grad()
                    model = batch_routine(model, model_obj)

                if seq_count % save_model_freq == 0:
                    torch.save(model.state_dict(), model_save_name)

                if seq_count % val_freq == 0:
                    validate(model, model_obj, val_loader, device, seq_count // val_freq, writer)
                    print("Time elapsed:", str(datetime.timedelta(seconds=time.time() - start)))
                    # validate is expected to switch to eval mode; restore training mode.
                    model.train()

                if wraps_model:
                    model_obj.model = model

                model = modify(model, model_obj, seq_count)

        # Persist the final weights; without this, everything after the last
        # multiple of save_model_freq would be lost.
        torch.save(model.state_dict(), model_save_name)
    finally:
        # Flush and release the TensorBoard writer even if training is interrupted.
        writer.close()
            

            


def validate(model, model_obj, val_loader, device, batch_count, writer):
    """Compute the mean loss over ``val_loader`` and report it.

    Args:
        model: Model called as ``model(tokens, labels=tokens)``; the first
            element of its output is taken as the loss.
        model_obj: Object whose ``technique`` is forwarded to ``process_tokens``.
        val_loader: Sized iterable of validation batches (each must support ``.to``).
        device: Device the batches are moved to.
        batch_count: Step index printed and used as the TensorBoard step.
        writer: Object with ``add_scalar``; pass ``None`` to skip logging.

    Returns:
        The mean validation loss as a float, or ``None`` when ``val_loader``
        is empty. The model is left in eval mode.
    """
    model.eval()

    num_batches = len(val_loader)
    if num_batches == 0:
        # Avoid a division by zero on an empty validation set.
        print("Validation skipped: val_loader is empty")
        return None

    running_loss = 0.0
    with torch.no_grad():
        for val_batch in tqdm(val_loader):
            tokens = val_batch.to(device)
            tokens = process_tokens(tokens, device, model_obj.technique)
            outputs = model(tokens, labels=tokens)
            # .item() accumulates a plain float instead of a device tensor.
            running_loss += outputs[0].item()

    mean_loss = running_loss / num_batches
    print('[%d       ] validation loss: %.5f' % (batch_count, mean_loss))
    if writer is not None:
        writer.add_scalar('val loss', mean_loss, batch_count)

    return mean_loss






def _str2bool(value):
    """Convert a command-line token such as 'true'/'False'/'1'/'0' to a bool."""
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def arg_parser(config_path='config.yml'):
    """Build the command-line parser, taking every default from a YAML config.

    Args:
        config_path: Path of the YAML file supplying the defaults. It must
            contain a key for every option registered below.

    Returns:
        argparse.ArgumentParser: The configured (not yet parsed) parser.

    Raises:
        OSError: If ``config_path`` cannot be opened.
        KeyError: If the config file lacks one of the expected keys.
    """
    with open(config_path, 'r') as file:
        args = yaml.safe_load(file)

    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=args['batch_size'], help='Batch size for dataloaders')
    parser.add_argument('--actual_batch_len', type=int, default=args['actual_batch_len'], help='Actual batch size')
    parser.add_argument('--num_workers', type=int, default=args['num_workers'], help='Number of workers')
    parser.add_argument('--warmup_steps', type=int, default=args['warmup_steps'], help='Warmup steps')
    # Real-valued hyper-parameters must be parsed as float: type=int rejects e.g. "0.001".
    parser.add_argument('--lr', type=float, default=args['base_lr'], help='Learning rate')
    # Flags use _str2bool: with type=bool any non-empty string, including "False", is True.
    parser.add_argument('--reset_tb', type=_str2bool, default=args['reset_tb'], help='Reset tensorboard')
    parser.add_argument('--write_logs', type=_str2bool, default=args['write_logs'], help='Write logs to tensorboard')
    parser.add_argument('--epochs', type=int, default=args['epochs'], help='Number of epochs of training')
    parser.add_argument('--unfreeze_qty', type=int, default=args['unfreeze_qty'], help='Number of layers to unfreeze at a time')
    parser.add_argument('--dlr_factor', type=float, default=args['dlr_factor'], help='Discriminative learning rate decay factor')
    parser.add_argument('--base_model', type=str, default=args['base_model'], help='Base model')
    parser.add_argument('--model', type=str, default=args['model'], help='Model name')
    parser.add_argument('--dataset_path', type=str, default=args['dataset_path'], help='Relative path to the dataset')
    parser.add_argument('--model_size', type=str, default=args['model_size'], help="Model size options: '', 'medium', 'large'")
    parser.add_argument('--weight_decay', type=float, default=args['weight_decay'], help='Weight decay coefficient')
    parser.add_argument('--dropout', type=float, default=args['dropout'], help='Dropout rate')
    parser.add_argument('--unfreeze_freq', type=int, default=args['unfreeze_freq'], help='Frequency of unfreezing model layers')
    parser.add_argument('--scheduler', type=str, default=args['scheduler'], help='Name of the scheduler')
    parser.add_argument('--gradual_unfreezing', type=_str2bool, default=args['gradual_unfreezing'], help='Gradual unfreezing switch')
    parser.add_argument('--chain_thaw', type=_str2bool, default=args['chain_thaw'], help='Chain thaw switch')
    parser.add_argument('--apply_dlr', type=_str2bool, default=args['apply_dlr'], help='Apply discriminative learning rate')
    parser.add_argument('--biases_only', type=_str2bool, default=args['biases_only'], help='Fine tune bias parameters only')
    parser.add_argument('--block_size', type=int, default=args['block_size'], help='Tokenizer block size')
    parser.add_argument('--freeze_init_layers', type=int, default=args['freeze_init_layers'], help='Freeze first n layers')
    parser.add_argument('--freeze_nth_layer', type=int, default=args['freeze_nth_layer'], help='Freeze nth layer')
    parser.add_argument('--val_freq', type=int, default=args['val_freq'], help='Number of training steps between validation')
    parser.add_argument('--eval_model_freq', type=int, default=args['eval_model_freq'], help='Evaluate model frequency')
    parser.add_argument('--save_model_freq', type=int, default=args['save_model_freq'], help='Number of training steps between model checkpoints')
    parser.add_argument('--gen_max_len', type=int, default=args['gen_max_len'], help='Maximum length of output generated')
    # Still accepts the former 0/1 spelling, but now yields a real bool.
    parser.add_argument('--load_existing_model', type=_str2bool, default=args['load_existing_model'], help='(bool) load the existing model')
    parser.add_argument('--percent_of_val', type=int, default=args['percent_of_val'],
                        help='percentage of the validation set that will '
                             'be evaluated, type int')
    parser.add_argument('--model_save_name', type=str, default=args['model_save_name'], help='Saved model name')
    parser.add_argument('--logs_base_run', type=str, default=args['logs_base_run'], help='Tensorboard logs base folder name')
    parser.add_argument('--test_split', type=float, default=args['test_split'], help='Test split ratio (from val set)')
    parser.add_argument('--train_split', type=float, default=args['train_split'], help='Train split ratio')
    parser.add_argument('--test_split_token', type=str, default=args['test_split_token'], help='Token to split input into input and output for test set')
    parser.add_argument('--test_end_token', type=str, default=args['test_end_token'], help='Token at the end of input/output pair')

    return parser



if __name__ == "__main__":

    # One checkpoint location shared by the training and evaluation steps.
    checkpoint_path = 'model/model.pth'

    # Build the adapter model and split the corpus into the three loaders.
    model_obj = Model_with_parallel_adapter()
    loaders = load_dataloaders(dataset_path = 'data/SAWorks.txt')
    train_loader, val_loader, test_loader = loaders

    # Train with the default hyper-parameters, then evaluate the saved checkpoint.
    train(
        model_obj = model_obj,
        train_loader = train_loader,
        val_loader = val_loader,
        model_save_name = checkpoint_path,
    )
    metrics = evaluate(test_loader, model_obj, None, None, checkpoint_path, 100)





Logs deleted
Existing model loaded
[1,     1] train loss: 6.80513
[1,     2] train loss: 6.14021
[1,     3] train loss: 6.45855
[1,     4] train loss: 6.61598
[1,     5] train loss: 6.48633
[1,     6] train loss: 6.54489
[1,     7] train loss: 6.24736
[1,     8] train loss: 6.17829
[1,     9] train loss: 6.12566
[1,    10] train loss: 6.17078
[1,    11] train loss: 6.29748
[1,    12] train loss: 6.31184
[1,    13] train loss: 6.29710
[1,    14] train loss: 7.23446
[1,    15] train loss: 6.38933
[1,    16] train loss: 6.09216
[1,    17] train loss: 6.63264
[1,    18] train loss: 7.10049
[1,    19] train loss: 6.36003
[1,    20] train loss: 7.40306
[1,    21] train loss: 5.92455
[1,    22] train loss: 6.33087
[1,    23] train loss: 7.02718
[1,    24] train loss: 6.50044
[1,    25] train loss: 6.14208
[1,    26] train loss: 7.14556
[1,    27] train loss: 6.16325
[1,    28] train loss: 6.31783
[1,    29] train loss: 6.25117
[1,    30] train loss: 6.54094
[1,    31] train loss: 6.29799
[1, 

100%|██████████| 90/90 [00:10<00:00,  8.30it/s]


[1       ] validation loss: 6.39170
Time elapsed: 0:00:39.274829
[1,   101] train loss: 6.08183
[1,   102] train loss: 6.43237
[1,   103] train loss: 6.14569
[1,   104] train loss: 5.96955
[1,   105] train loss: 7.21087
[1,   106] train loss: 6.77217
[1,   107] train loss: 7.26424
[1,   108] train loss: 6.13122
[1,   109] train loss: 6.29244
[1,   110] train loss: 6.35110
[1,   111] train loss: 6.13658
[1,   112] train loss: 6.28882
[1,   113] train loss: 6.88324
[1,   114] train loss: 7.12477
[1,   115] train loss: 6.65692
[1,   116] train loss: 7.36114
[1,   117] train loss: 6.38256
[1,   118] train loss: 6.03209
[1,   119] train loss: 6.56385
[1,   120] train loss: 6.37020
[1,   121] train loss: 7.01341
[1,   122] train loss: 6.41341
[1,   123] train loss: 6.29044
[1,   124] train loss: 6.23307
[1,   125] train loss: 6.08246
[1,   126] train loss: 7.41094
[1,   127] train loss: 6.43902
[1,   128] train loss: 6.77297
[1,   129] train loss: 6.07988
[1,   130] train loss: 6.29509
[1,  

100%|██████████| 90/90 [00:10<00:00,  8.58it/s]


[2       ] validation loss: 6.38378
Time elapsed: 0:01:18.733142
[1,   201] train loss: 6.15344
[1,   202] train loss: 7.19584
[1,   203] train loss: 6.38815
[1,   204] train loss: 6.27020
[1,   205] train loss: 6.11737
[1,   206] train loss: 6.14426
[1,   207] train loss: 6.25988
[1,   208] train loss: 6.42872
[1,   209] train loss: 6.50953
[1,   210] train loss: 6.03532
[1,   211] train loss: 7.10961
[1,   212] train loss: 6.04222
[1,   213] train loss: 6.01698
[1,   214] train loss: 6.31205
[1,   215] train loss: 6.63651
[1,   216] train loss: 7.54745
[1,   217] train loss: 5.91494
[1,   218] train loss: 6.31993
[1,   219] train loss: 6.21073
[1,   220] train loss: 6.14137
[1,   221] train loss: 6.21005
[1,   222] train loss: 6.33918
[1,   223] train loss: 6.26132
[1,   224] train loss: 6.08982
[1,   225] train loss: 6.68834
[1,   226] train loss: 6.11282
[1,   227] train loss: 6.69171
[1,   228] train loss: 6.41035
[1,   229] train loss: 6.98597
[1,   230] train loss: 6.22328
[1,  

100%|██████████| 90/90 [00:10<00:00,  8.40it/s]


[3       ] validation loss: 6.36931
Time elapsed: 0:01:59.992380
[1,   301] train loss: 7.34285
[1,   302] train loss: 6.29071
[1,   303] train loss: 6.63292
[1,   304] train loss: 6.42075
[1,   305] train loss: 6.61776
[1,   306] train loss: 6.29215
[1,   307] train loss: 6.49406
[1,   308] train loss: 6.20427
[1,   309] train loss: 5.99798
[1,   310] train loss: 7.16083
[1,   311] train loss: 6.11954
[1,   312] train loss: 6.32473
[1,   313] train loss: 6.37439
[1,   314] train loss: 6.40355
[1,   315] train loss: 6.90515
[1,   316] train loss: 6.43295
[1,   317] train loss: 6.10808
[1,   318] train loss: 7.18850
[1,   319] train loss: 6.16620
[1,   320] train loss: 6.01378
[1,   321] train loss: 6.19963
[1,   322] train loss: 6.27670
[1,   323] train loss: 6.30661
[1,   324] train loss: 6.62276
[1,   325] train loss: 6.23211
[1,   326] train loss: 7.05656
[1,   327] train loss: 7.43256
[1,   328] train loss: 6.30725
[1,   329] train loss: 6.25284
[1,   330] train loss: 6.04700
[1,  

KeyboardInterrupt: ignored

In [None]:
!git init

Initialized empty Git repository in /content/gdrive/MyDrive/TransferLearningToolkit/.git/


In [None]:
!git config --global user.email "vibhud04@gmail.com"
!git config --global user.name "Vibhu04"

In [None]:
!git status

On branch master
nothing to commit, working tree clean


In [None]:
!git add -A

In [None]:
!git commit -m "Added comments, cleaned up code"

[master (root-commit) c5584a6] Added comments, cleaned up code
 98 files changed, 7131 insertions(+)
 create mode 100644 .idea/.gitignore
 create mode 100644 .idea/TransferLearningToolkit.iml
 create mode 100644 .idea/inspectionProfiles/profiles_settings.xml
 create mode 100644 .idea/misc.xml
 create mode 100644 .idea/modules.xml
 create mode 100644 __pycache__/gpt2_with_adapter.cpython-38.pyc
 create mode 100644 __pycache__/main.cpython-38.pyc
 create mode 100644 __pycache__/select_model.cpython-37.pyc
 create mode 100644 __pycache__/utils.cpython-37.pyc
 create mode 100644 config.yml
 create mode 100644 data/SAWorks.txt
 create mode 100644 data/__pycache__/dataset.cpython-37.pyc
 create mode 100644 data/__pycache__/dataset.cpython-38.pyc
 create mode 100644 data/dataset.py
 create mode 100644 fine_tuning/__pycache__/stlr.cpython-37.pyc
 create mode 100644 fine_tuning/__pycache__/utils.cpython-37.pyc
 create mode 100644 fine_tuning/stlr.py
 create mode 100644 fine_tuning/utils.py
 cre

In [None]:
!git remote add origin https://Vibhu04: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@github.com/Vibhu04/PLM-Transfer-Learning-Toolkit.git

In [None]:
!git push origin master --force

Counting objects: 112, done.
Delta compression using up to 2 threads.
Compressing objects: 100% (103/103), done.
Writing objects: 100% (112/112), 3.86 MiB | 2.91 MiB/s, done.
Total 112 (delta 15), reused 0 (delta 0)
remote: Resolving deltas: 100% (15/15), done.[K
To https://github.com/Vibhu04/PLM-Transfer-Learning-Toolkit.git
 + 9054834...c5584a6 master -> master (forced update)


In [None]:
!git pull origin master

remote: Enumerating objects: 58, done.[K
remote: Counting objects:   1% (1/58)[Kremote: Counting objects:   3% (2/58)[Kremote: Counting objects:   5% (3/58)[Kremote: Counting objects:   6% (4/58)[Kremote: Counting objects:   8% (5/58)[Kremote: Counting objects:  10% (6/58)[Kremote: Counting objects:  12% (7/58)[Kremote: Counting objects:  13% (8/58)[Kremote: Counting objects:  15% (9/58)[Kremote: Counting objects:  17% (10/58)[Kremote: Counting objects:  18% (11/58)[Kremote: Counting objects:  20% (12/58)[Kremote: Counting objects:  22% (13/58)[Kremote: Counting objects:  24% (14/58)[Kremote: Counting objects:  25% (15/58)[Kremote: Counting objects:  27% (16/58)[Kremote: Counting objects:  29% (17/58)[Kremote: Counting objects:  31% (18/58)[Kremote: Counting objects:  32% (19/58)[Kremote: Counting objects:  34% (20/58)[Kremote: Counting objects:  36% (21/58)[Kremote: Counting objects:  37% (22/58)[Kremote: Counting objects:  39% (23/58)[Kr