## Imports

In [1]:
import argparse
import logging
import os
import torch

import numpy as np
from datasets import load_from_disk
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    BitsAndBytesConfig,
    AutoModel,
    AutoConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


## Set up CometKiwi base model

In [2]:
from comet import download_model, load_from_checkpoint

# Import CometKiwi Model
model_path = download_model("Unbabel/wmt22-cometkiwi-da")
model = load_from_checkpoint(model_path)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

/fs/classhomes/fall2024/cmsc723/c7230021/miniconda3/lib/python3.12/site-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [3]:
print_trainable_parameters(model)

trainable params: 6296603 || all params: 565137435 || trainable%: 1.114171988978221


## Finetuning Multilingual Uncased Bert Model

In [4]:
from datasets import load_dataset
from datasets import Dataset
import pandas as pd

train_dataset = pd.read_csv('csv_data/train.csv')
#train_dataset = train_dataset.iloc[:6992] # EN to DE
train_dataset = train_dataset.iloc[6992:13964] # EN to ZH
#train_dataset = train_dataset.iloc[13964:20960] # ET to EN
#train_dataset = train_dataset.iloc[20960:27124] # NP to EN
#train_dataset = train_dataset.iloc[27124:33898] # RO to EN
#train_dataset = train_dataset.iloc[33898:] # RU to EN
train_dataset = Dataset.from_pandas(train_dataset)

validation_dataset = pd.read_csv('csv_data/validation.csv')
#validation_dataset = validation_dataset.iloc[:1000] # EN to DE
validation_dataset = validation_dataset.iloc[1000:1996] # EN to ZH
#validation_dataset = validation_dataset.iloc[1996:2995] # ET to EN
#validation_dataset = validation_dataset.iloc[2995:3937] # NP to EN
#validation_dataset = validation_dataset.iloc[3937:4842] # RO to EN
#validation_dataset = validation_dataset.iloc[4842:] # RU to EN
validation_dataset = Dataset.from_pandas(validation_dataset)

test_dataset = pd.read_csv('csv_data/test.csv')
#test_dataset = test_dataset.iloc[:998] # EN to DE
#test_dataset = test_dataset.iloc[998:1995] # EN to ZH
#test_dataset = test_dataset.iloc[1995:2994] # ET to EN
#test_dataset = test_dataset.iloc[2994:3993] # NP to EN
#test_dataset = test_dataset.iloc[3993:4993] # RO to EN
#test_dataset = test_dataset.iloc[4993:] # RU to EN
test_dataset = Dataset.from_pandas(test_dataset)

### Tokenize Dataloaders

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-multilingual-uncased")

def tokenize_function(examples):
    return tokenizer(examples["original"], 
        examples["translation"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
validation_dataset = validation_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)
print('Vocab size: ',tokenizer.vocab_size)

Map:   0%|          | 0/6972 [00:00<?, ? examples/s]

Map:   0%|          | 0/996 [00:00<?, ? examples/s]

Map:   0%|          | 0/5993 [00:00<?, ? examples/s]

Vocab size:  105879


In [6]:
# Remove irrelvant columns
train_dataset = train_dataset.remove_columns(['Unnamed: 0','original','translation'])
validation_dataset = validation_dataset.remove_columns(['Unnamed: 0','original','translation'])
test_dataset = test_dataset.remove_columns(['Unnamed: 0','original','translation'])

# Rename column names to correct format
train_dataset = train_dataset.rename_column("mean", "labels")
validation_dataset = validation_dataset.rename_column("mean", "labels")
test_dataset = test_dataset.rename_column("mean", "labels")

# Format lists to torch tensors
train_dataset.set_format("torch")
validation_dataset.set_format("torch")
test_dataset.set_format("torch")

# small subset of entire training data for testing the training process

train_dataset = train_dataset.shuffle(seed=42).select(range(1000))
validation_dataset = validation_dataset.shuffle(seed=42).select(range(100))
#test_dataset = test_dataset.shuffle(seed=42).select(range(100))

### Create Torch Dataloaders

In [7]:
from torch.utils.data import DataLoader
batch_size = 16

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
eval_dataloader = DataLoader(validation_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

### Load Bert Model

In [8]:
from transformers import AutoModelForSequenceClassification

# bnb_config sets up Quantization
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
)
# lora_config sets up LoRA
lora_config = LoraConfig(
    r=16, 
    target_modules = ['query','key','value'],
    inference_mode=False
)

#bert_model = torch.load("finetuned_Bert.pth", weights_only=False)
bert_model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-multilingual-uncased", num_labels=1, quantization_config=bnb_config)
bert_model.gradient_checkpointing_enable()
print_trainable_parameters(bert_model)

# ADDS Q
bert_model = prepare_model_for_kbit_training(bert_model)

# ADDS LORA
bert_model = get_peft_model(bert_model, lora_config)

print_trainable_parameters(bert_model)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
`low_cpu_mem_usage` was None, now default to True since model is quantized.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 81748993 || all params: 167357185 || trainable%: 48.84701723442588
trainable params: 884736 || all params: 168241921 || trainable%: 0.5258713136067913


In [9]:
from torch.optim import AdamW

optimizer = AdamW(bert_model.parameters(), lr=1e-4)

In [10]:
from transformers import get_scheduler

num_epochs = 20
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [11]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
bert_model.to(device);

### Bert Model Training Loop

In [13]:
from tqdm.auto import tqdm
import csv

with open('log.csv', 'w', newline='') as f:
   writer = csv.writer(f)
   writer.writerow(['epoch', 'train_loss', 'valid_loss'])

for epoch in range(num_epochs):
    train_losses = []
    valid_losses = []
    
    bert_model.train()
    progress_bar = tqdm(range(len(train_dataloader)+len(eval_dataloader)))
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        
        outputs = bert_model(**batch)
        loss = outputs.loss
        train_losses.append(loss)
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    bert_model.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = bert_model(**batch)
            loss = outputs.loss
            valid_losses.append(loss)
            progress_bar.update(1)
            
    train_loss = sum(train_losses)/len(train_losses)
    valid_loss = sum(valid_losses)/len(valid_losses)
    print(f'epoch {epoch+1}: training loss: {train_loss} validation loss: {valid_loss}')
    with open('log.csv', 'a', newline='') as f:
       writer = csv.writer(f)
       writer.writerow([epoch+1, train_loss.item(), valid_loss.item()])
        

  0%|          | 0/70 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [21]:
# save model
torch.save(bert_model, 'Bert_QLoRA_EN_ZH.pth')

### Plot train and validation loss graph

In [14]:
import pandas as pd
import matplotlib.pyplot as plt
train_log = pd.read_csv('log.csv')
train_log = train_log[['train_loss','valid_loss']]
train_log.plot()

TypeError: no numeric data to plot

## Run both models on `test.csv`

### CometKiwi model

In [61]:
test_data = pd.read_csv('csv_data/test.csv')
test_data["src"] = test_data["original"]
test_data["mt"] = test_data["translation"]
test_data = list(test_data[["src", "mt"]].iloc[i].to_dict() for i in range(test_data.shape[0]))
result = model.predict(test_data, batch_size=batch_size, gpus=1);
comet_kiwi_test = torch.Tensor(result.scores)

/fs/classhomes/fall2024/cmsc723/c7230021/miniconda3/lib/python3.12/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /fs/classhomes/fall2024/cmsc723/c7230021/miniconda3/ ...
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenize

### Bert Model

In [62]:
result = []
bert_model.eval()
for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        result.append(bert_model(**batch))
bert_test = torch.cat([i.logits for i in result]).T.squeeze().cpu()



In [81]:
mean_squared_test_error = ((bert_test - comet_kiwi_test) ** 2).mean()