In [1]:
import os
import torch
import random
import gc
import logging
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from managers import SystemMonitor



In [2]:
torch.cuda.empty_cache()
gc.collect()

28

In [3]:
env_vars = {
    "CUDA_VISIBLE_DEVICES": "0",
    "TRANSFORMERS_NO_ADVISORY_WARNINGS": "true",
    "TORCHDYNAMO_DISABLE": "1",
    "TOKENIZERS_PARALLELISM": "false",
}
os.environ.update(env_vars)

In [4]:
torch.manual_seed(100)
random.seed(100)
np.random.seed(100)

In [5]:
monitor = SystemMonitor()
f"Baseline usage: {monitor.get_gpu_utilization()} GB of GPU"

'Baseline usage: 0 GB of GPU'

In [6]:
# Configurations
class Configuration:
    def __init__(self, **kwargs):
        self.keep_fraction = kwargs.get("keep_fraction", 0.9)
        self.test_fraction = kwargs.get("test_fraction", 0.2)
        self.scratch_path = kwargs.get("scratch_path", "/scratch/vgn2004")
        self.dataset_path = kwargs.get(
            "dataset_path",
            os.path.join(
                self.scratch_path, "fine_tuning", "datasets", "disaster_tweets.csv"
            ),
        )
        self.num_workers = kwargs.get("num_workers", 16)
        self.num_virtual_tokens = kwargs.get("num_virtual_tokens", 16)
        self.batch_size = kwargs.get("batch_size", 16)
        self.lr = kwargs.get("lr", 3e-4)
        self.num_epochs = kwargs.get("num_epochs", 5)
        self.max_length = kwargs.get("max_length", 128)
        self.device = kwargs.get("device", "cuda")

        self.model_name_or_path = kwargs.get("model_name_or_path", "facebook/opt-1.3b")

In [7]:
config = Configuration()
config.keep_fraction

0.9

In [8]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    default_data_collator,
    get_linear_schedule_with_warmup,
)
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    PromptTuningInit,
    PromptTuningConfig,
    TaskType,
)

tokenizer = AutoTokenizer.from_pretrained(config.model_name_or_path)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoModelForCausalLM.from_pretrained(
    config.model_name_or_path, trust_remote_code=True
)
peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.TEXT,
    num_virtual_tokens=config.num_virtual_tokens,
    prompt_tuning_init_text="Classify if the Tweet is about a natural disaster or not:",
    tokenizer_name_or_path=config.model_name_or_path,
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 16,384 || all params: 1,315,774,464 || trainable%: 0.0012451982044241893


In [9]:
from datasets import load_dataset

dataset = load_dataset("csv", data_files=config.dataset_path)
dataset = dataset["train"].train_test_split(test_size=config.test_fraction)

In [10]:
def preprocess_function(examples):
    batch_size = len(examples["text"])
    cls = {0: "normal", 1: "disaster"}
    inputs = [f"Tweet : {x} Label : " for x in examples["text"]]
    targets = [cls[x] for x in examples["target"]]
    model_inputs = tokenizer(inputs)
    labels = tokenizer(targets)

    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i] + [tokenizer.pad_token_id]
        model_inputs["input_ids"][i] = sample_input_ids + label_input_ids
        labels["input_ids"][i] = [-100] * len(sample_input_ids) + label_input_ids
        model_inputs["attention_mask"][i] = [1] * len(model_inputs["input_ids"][i])

    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i]
        model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
            config.max_length - len(sample_input_ids)
        ) + sample_input_ids
        model_inputs["attention_mask"][i] = [0] * (
            config.max_length - len(sample_input_ids)
        ) + model_inputs["attention_mask"][i]
        labels["input_ids"][i] = [-100] * (
            config.max_length - len(sample_input_ids)
        ) + label_input_ids
        model_inputs["input_ids"][i] = torch.tensor(
            model_inputs["input_ids"][i][: config.max_length]
        )
        model_inputs["attention_mask"][i] = torch.tensor(
            model_inputs["attention_mask"][i][: config.max_length]
        )
        labels["input_ids"][i] = torch.tensor(
            labels["input_ids"][i][: config.max_length]
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=config.num_workers,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

training_dataloader = torch.utils.data.DataLoader(
    processed_datasets["train"],
    sampler=torch.utils.data.RandomSampler(processed_datasets["train"]),
    batch_size=config.batch_size,
    num_workers=config.num_workers,
    collate_fn=default_data_collator,
    pin_memory=True,
)
validation_dataloader = torch.utils.data.DataLoader(
    processed_datasets["test"],
    sampler=torch.utils.data.SequentialSampler(processed_datasets["test"]),
    batch_size=config.batch_size,
    num_workers=config.num_workers,
    collate_fn=default_data_collator,
    pin_memory=True,
)

Running tokenizer on dataset (num_proc=16):   0%|          | 0/6090 [00:00<?, ? examples/s]

Running tokenizer on dataset (num_proc=16):   0%|          | 0/1523 [00:00<?, ? examples/s]



In [11]:
optimizer = torch.optim.AdamW(model.parameters(), lr=config.lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(training_dataloader) * config.num_epochs),
)

In [15]:
model.to(config.device)
for epoch in range(config.num_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(training_dataloader)):
        batch = {k: v.to(config.device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(validation_dataloader)):
        batch = {k: v.to(config.device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        eval_preds.extend(
            tokenizer.batch_decode(
                torch.argmax(outputs.logits, -1).detach().cpu().numpy(),
                skip_special_tokens=True,
            )
        )

    eval_epoch_loss = eval_loss / len(validation_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(training_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

100%|██████████| 381/381 [06:49<00:00,  1.07s/it]
100%|██████████| 96/96 [00:50<00:00,  1.91it/s]


epoch=0: train_ppl=tensor(1820.6173, device='cuda:0') train_epoch_loss=tensor(7.5069, device='cuda:0') eval_ppl=tensor(11.6449, device='cuda:0') eval_epoch_loss=tensor(2.4549, device='cuda:0')


100%|██████████| 381/381 [06:43<00:00,  1.06s/it]
100%|██████████| 96/96 [00:50<00:00,  1.92it/s]


epoch=1: train_ppl=tensor(12.8489, device='cuda:0') train_epoch_loss=tensor(2.5533, device='cuda:0') eval_ppl=tensor(5.1494, device='cuda:0') eval_epoch_loss=tensor(1.6389, device='cuda:0')


100%|██████████| 381/381 [06:43<00:00,  1.06s/it]
100%|██████████| 96/96 [00:50<00:00,  1.91it/s]


epoch=2: train_ppl=tensor(6.6534, device='cuda:0') train_epoch_loss=tensor(1.8951, device='cuda:0') eval_ppl=tensor(4.4703, device='cuda:0') eval_epoch_loss=tensor(1.4975, device='cuda:0')


100%|██████████| 381/381 [06:43<00:00,  1.06s/it]
100%|██████████| 96/96 [00:50<00:00,  1.90it/s]


epoch=3: train_ppl=tensor(5.6010, device='cuda:0') train_epoch_loss=tensor(1.7230, device='cuda:0') eval_ppl=tensor(4.0246, device='cuda:0') eval_epoch_loss=tensor(1.3924, device='cuda:0')


100%|██████████| 381/381 [06:43<00:00,  1.06s/it]
100%|██████████| 96/96 [00:50<00:00,  1.91it/s]

epoch=4: train_ppl=tensor(5.2083, device='cuda:0') train_epoch_loss=tensor(1.6503, device='cuda:0') eval_ppl=tensor(3.9274, device='cuda:0') eval_epoch_loss=tensor(1.3680, device='cuda:0')





In [22]:
inputs = tokenizer(
    f'Tweet : {"Wreckage everywhere, we are fucked"} Label : ',
    return_tensors="pt",
)
model.to(config.device)

with torch.inference_mode():
    inputs = {k: v.to(config.device) for k, v in inputs.items()}
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=10,
        eos_token_id=3,
    )
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy()))

['</s>Tweet : Wreckage everywhere, we are fucked Label : </s>normal<pad><pad><pad><pad><pad><pad><pad><pad>']
