In [1]:
import os
import torch
import random
import gc
import bitsandbytes
import logging
import numpy as np
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from managers import SystemMonitor



In [2]:
torch.cuda.empty_cache()
gc.collect()

28

In [3]:
env_vars = {
    "CUDA_VISIBLE_DEVICES": "0",
    "TRANSFORMERS_NO_ADVISORY_WARNINGS": "true",
    "TORCHDYNAMO_DISABLE": "1",
    "TOKENIZERS_PARALLELISM": "false",
}
os.environ.update(env_vars)

In [4]:
torch.manual_seed(100)
random.seed(100)
np.random.seed(100)

In [5]:
monitor = SystemMonitor()
f"Baseline usage: {monitor.get_gpu_utilization()} GB of GPU"

'Baseline usage: 0 GB of GPU'

In [6]:
# Configurations
class Configuration:
    def __init__(self, **kwargs):
        self.keep_fraction = kwargs.get("keep_fraction", 0.99)
        self.test_fraction = kwargs.get("test_fraction", 0.2)
        self.scratch_path = kwargs.get("scratch_path", "/scratch/vgn2004")
        self.dataset_path = kwargs.get("dataset_path", os.path.join(
            self.scratch_path, "fine_tuning", "datasets", "disaster_tweets.csv"
        ))
        self.num_workers = kwargs.get("num_workers", 14)
        self.num_virtual_tokens = kwargs.get("num_virtual_tokens", 16)
        self.batch_size = kwargs.get("batch_size", 128)
        self.lr = kwargs.get("lr", 3e-4)
        self.num_epochs = kwargs.get("num_epochs", 5)
        self.max_length = kwargs.get("max_length", 128)
        self.device = kwargs.get("device", "cuda")
        
        self.model_name_or_path = kwargs.get("model_name_or_path", "NousResearch/Llama-2-7b-hf")
        
        self.r = kwargs.get("r", 64)
        self.lora_alpha = kwargs.get("lora_alpha", 128)
        self.lora_dropout = kwargs.get("lora_dropout", 0.2)
        self.lora_bias = kwargs.get("lora_bias", "none")
        self.is_gradient_checkpointing_enabled = kwargs.get("is_gradient_checkpointing_enabled", True)
        
        self.is_quantized = kwargs.get("is_quantized", False)

config = Configuration() #model_name_or_path="facebook/opt-1.3b")
config.keep_fraction

0.99

In [7]:
from transformers import LlamaForSequenceClassification, LlamaTokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, get_linear_schedule_with_warmup, DataCollatorWithPadding
from transformers import BitsAndBytesConfig
from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, prepare_model_for_kbit_training, PromptTuningInit, PromptTuningConfig, LoraConfig, TaskType
if 'LLama' in config.model_name_or_path:
    tokenizer = LlamaTokenizer.from_pretrained(config.model_name_or_path)
    tokenizer.padding_side = 'right'
    tokenizer.model_max_length = config.max_length
    tokenizer.pad_token = tokenizer.unk_token
else:
    tokenizer = AutoTokenizer.from_pretrained(config.model_name_or_path)
    tokenizer.padding_side = 'left'
    tokenizer.pad_token = tokenizer.eos_token

In [8]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="bfloat16",
    bnb_4bit_use_double_quant=True
)

if config.is_quantized:
    if 'LLama' in config.model_name_or_path:
        model = LlamaForSequenceClassification.from_pretrained(
            config.model_name_or_path,
            device_map="auto",
            quantization_config=quantization_config
        )
    else:
        model = AutoModelForSequenceClassification.from_pretrained(
            config.model_name_or_path,
            device_map="auto",
            quantization_config=quantization_config
        )
else:
    if 'LLama' in config.model_name_or_path:
        model = LlamaForSequenceClassification.from_pretrained(
            config.model_name_or_path
        )
    else:
        model = AutoModelForSequenceClassification.from_pretrained(
            config.model_name_or_path
        )
    
model.config.pad_token_id = tokenizer.pad_token_id

if config.is_gradient_checkpointing_enabled:
    model.gradient_checkpointing_enable()
    model.enable_input_require_grads()

model.config.use_cache = False
model.config.pretraining_tp = 1


def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names:  # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

peft_config = LoraConfig(
    target_modules = find_all_linear_names(model),
    task_type=TaskType.SEQ_CLS, 
    inference_mode=False, 
    r=config.r, 
    lora_alpha=config.lora_alpha,
    lora_dropout=config.lora_dropout,
    bias=config.lora_bias
)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at NousResearch/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 33,570,816 || all params: 6,640,914,432 || trainable%: 0.5055149609854211


In [9]:
model.config

LlamaConfig {
  "_name_or_path": "NousResearch/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.33.2",
  "use_cache": false,
  "vocab_size": 32000
}

In [10]:
dataset = load_dataset('csv', data_files=config.dataset_path)
dataset = dataset['train'].train_test_split(test_size=config.test_fraction)
dataset['train'][0]

{'id': 9941,
 'keyword': 'trouble',
 'location': None,
 'text': "The worst  voice I can ever hear is the 'Nikki your in trouble' voice from my mom",
 'target': 0}

In [11]:
def preprocess_function(examples):
    model_inputs = tokenizer(examples['text'], max_length=config.max_length, truncation=True)
    model_inputs["labels"] = examples['target']
    return model_inputs

processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=config.num_workers,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

Running tokenizer on dataset (num_proc=14):   0%|          | 0/6090 [00:00<?, ? examples/s]

Running tokenizer on dataset (num_proc=14):   0%|          | 0/1523 [00:00<?, ? examples/s]

In [12]:
data = processed_datasets['train'][33]
ids = data['input_ids']
print(len(ids))
tokenizer.decode(ids, skip_special_tokens=True), data['labels']

33


('A sinkhole grows in Brooklyn: six-meter crater swallows street http://t.co/gkPrvzQ6lk',
 1)

In [13]:
training_dataloader = torch.utils.data.DataLoader(processed_datasets['train'], sampler=torch.utils.data.RandomSampler(processed_datasets['train']), batch_size=config.batch_size, num_workers=config.num_workers, collate_fn=DataCollatorWithPadding(tokenizer=tokenizer, padding="longest"), pin_memory=True)
validation_dataloader = torch.utils.data.DataLoader(processed_datasets['test'], sampler=torch.utils.data.SequentialSampler(processed_datasets['test']), batch_size=config.batch_size, num_workers=config.num_workers, collate_fn=DataCollatorWithPadding(tokenizer=tokenizer, padding="longest"), pin_memory=True)

In [14]:
optimizer = torch.optim.AdamW(model.parameters(), lr=config.lr) if not config.is_quantized else bitsandbytes.optim.AdamW(model.parameters(), lr=config.lr, is_paged=True, optim_bits=8)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(training_dataloader) * config.num_epochs),
)

In [15]:
# Function to calculate metrics
def calculate_metrics(preds, labels):
    precision = precision_score(labels, preds, average='macro')
    recall = recall_score(labels, preds, average='macro')
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='macro')
    return precision, recall, accuracy, f1

# Evaluate a dataloader
def evaluate(dataloader):
    model.eval()
    all_preds = []
    all_labels = []

    eval_loss = 0.0
    with torch.no_grad():
        for data in tqdm(dataloader):
            batch = {k: v.to(config.device) for k, v in data.items()}
            outputs = model(**batch)
            loss = outputs.loss
            eval_loss += loss.detach().float()
            preds = torch.argmax(torch.softmax(outputs.logits, dim=1), dim=1)
            labels = batch['labels']

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    precision, recall, accuracy, f1 = calculate_metrics(all_preds, all_labels)
    return precision, recall, accuracy, f1, eval_loss

In [None]:
if not config.is_quantized:
    model.to(config.device)

exit = False
for epoch in range(config.num_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(training_dataloader)):
        if epoch==0 and step < 5:
            print(f"Usage: {monitor.get_gpu_utilization()} GB of GPU")
        optimizer.zero_grad()
        batch = {k: v.to(config.device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        if torch.isnan(loss):
            print(f"NaN loss detected at Epoch {epoch}, Step {step}")
            exit = True
            break
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

    if exit:
        break

    model.eval()
    precision_val, recall_val, accuracy_val, f1_val, eval_loss = evaluate(validation_dataloader)
    print(f"Validation Data - Precision: {precision_val}, Recall: {recall_val}, Accuracy: {accuracy_val}, F1: {f1_val}")
    eval_epoch_loss = eval_loss / len(validation_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(training_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

  0%|          | 0/48 [00:00<?, ?it/s]

Usage: 25 GB of GPU


  2%|▏         | 1/48 [00:35<27:37, 35.27s/it]

Usage: 38 GB of GPU


  4%|▍         | 2/48 [01:06<25:16, 32.97s/it]

Usage: 38 GB of GPU


  6%|▋         | 3/48 [01:36<23:39, 31.55s/it]

Usage: 38 GB of GPU


  8%|▊         | 4/48 [02:04<22:10, 30.24s/it]

Usage: 38 GB of GPU


 98%|█████████▊| 47/48 [26:49<00:35, 35.68s/it]

In [None]:
#experiments 128 sequence length
#LLama 7b - 83% accuracy, 42 GB on GPU, 6? minutes per epoch, 2 epochs
#OPT 1.3b - 68% accuracy, 12 GB on GPU, 2.5 minutes per epoch, 2 epochs
#OPT 1.3b - 77% accuracy, 12 GB on GPU, 2.5 minutes per epoch, 10 epochs, 16 batch size,  3e-3
#OPT 1.3b - 72% accuracy, 26 GB on GPU, 3 minutes per epoch, 2 epochs, 64 batch size, 3e-3
#OPT 1.3b - 83.8% accuracy, 26 GB on GPU, 3 minutes per epoch, 10 epochs, 64 batch size, 3e-4,
#OPT 1.3b - 79.9% accuracy,  21 GB on GPU, 3 minutes per epoch, 2 epochs, 64 batch size, 3e-4, BIGGER r + alpha
#OPT 1.3b - 79.18% accuracy, 21 GB on GPU, 3 minutes per epoch, 2 epochs, 64 batch size, 6e-4, BIGGER r + alpha
#OPT 1.3b - 83% accuracy, 12 GB on GPU, 3 minutes per epoch, 5 epochs, 32 batch size, 3e-4, r=64, alpha = 128, dropout=0.2
#OPT 1.3b - 83% accuracy, 6 GB on GPU, 4.25 minutes per epoch, 5 epochs, 32 batch size, 3e-4, r=64, alpha = 128, dropout=0.2, grad checkpointing
#OPT 1.3b - 83.9% accuracy, 2 GB on GPU, 6.5 minutes per epoch, 5 epochs, 32 batch size, 3e-4, r=64, alpha = 128, dropout=0.2, grad checkpointing, quantized-4-bit
#OPT 1.3b - 83.9% accuracy, 4 GB on GPU, 7 minutes per epoch, 5 epochs, 128 batch size, 3e-4, r=64, alpha = 128, dropout=0.2, grad checkpointing, quantized-4-bit

#Llama 7B with best settings from above -  accuracy, 6 GB on GPU, 40 minutes per epoch
#Llama 7B with best settings from above -  accuracy,  GB on GPU, 40 minutes per epoch

#Llama 30B with best settings from above -  accuracy,  GB on GPU,   minutes per epoch

# Implement Llama - 
# NousResearch/Llama-2-13b-hf
# NousResearch/Llama-2-70b-hf
# Prompting inference on instruct-tuned llamma
 

#OPT 1.3b -  accuracy,  GB on GPU,   minutes per epoch, base prompting


# with torch.no_grad():
#     for i, data in enumerate(training_dataloader):
#         batch = {k: v.to(config.device) for k, v in data.items()}
#         print([(t[0].item(),t[1].item()) for t  in list(zip(torch.argmax(torch.softmax(model(**batch).logits, dim=1),dim=1), batch['labels']))])
#         if i>20:
#             break

In [None]:
# Calculating metrics for training data
# precision_train, recall_train, accuracy_train, f1_train = evaluate(training_dataloader)
# print(f"Training Data - Precision: {precision_train}, Recall: {recall_train}, Accuracy: {accuracy_train}, F1: {f1_train}")

# Calculating metrics for validation data