## Imports

In [1]:
import argparse
import logging
import os
import torch

import numpy as np
from datasets import load_from_disk
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    BitsAndBytesConfig,
    AutoModel,
    AutoConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: any object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` provides
            ``numel()`` and a ``requires_grad`` flag.

    Returns:
        None. A single summary line is written to stdout.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # A model without any parameters would otherwise raise ZeroDivisionError;
    # report 0.0% in that case instead of crashing.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )


## Set up CometKiwi base model

In [2]:
from comet import download_model, load_from_checkpoint

# Import CometKiwi Model
# download_model returns the location of the "Unbabel/wmt22-cometkiwi-da"
# checkpoint, which is handed straight to load_from_checkpoint. The resulting
# `model` is the object used for every predict() call further down.
model_path = download_model("Unbabel/wmt22-cometkiwi-da")
model = load_from_checkpoint(model_path)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.2 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../.cache/huggingface/hub/models--Unbabel--wmt22-cometkiwi-da/snapshots/b3a8aea5a5fc22db68a554b92b3d96eb6ea75cc9/checkpoints/model.ckpt`
Encoder model frozen.
/home/neko/miniconda3/envs/723project/lib/python3.8/site-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [3]:
# Parameter counts for the CometKiwi model exactly as loaded (before any LoRA changes).
print_trainable_parameters(model)

trainable params: 6296603 || all params: 565137435 || trainable%: 1.114171988978221


## Fine-tuning DistilBERT model

In [19]:
from datasets import load_dataset
from datasets import Dataset
import pandas as pd

# Training split: read the TSV and drop its first column in one chained
# expression, then wrap the frame as a Hugging Face Dataset.
df = pd.read_csv("task_1/en-gu/train.engu.df.short.tsv", sep='\t').iloc[:, 1:]
train_dataset = Dataset.from_pandas(df)

# Dev split, processed the same way. Note that it is bound to `test_dataset`,
# and that `df` now refers to the dev frame.
df = pd.read_csv("task_1/en-gu/dev.engu.df.short.tsv", sep='\t').iloc[:, 1:]
test_dataset = Dataset.from_pandas(df)

In [33]:
# Sanity check: show the 'original' text of the training example at index 1.
train_dataset['original'][1]

'Fortunately, no one sustained injuries in these incidents.'

In [25]:
from transformers import AutoTokenizer

# NOTE(review): the section heading mentions DistilBERT, but the checkpoint
# loaded here is "google-bert/bert-base-cased" — confirm which one is intended.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

def tokenize_function(examples):
    """Tokenize the "original" column of a batch of examples.

    Args:
        examples: the batch handed over by ``Dataset.map``; only
            ``examples["original"]`` is read.

    Returns:
        The result of calling ``tokenizer`` on that text with
        ``padding="max_length"`` and ``truncation=True``.
    """
    return tokenizer(examples["original"], padding="max_length", truncation=True)

# batched=True makes map() call tokenize_function on batches of rows.
# Only the training split is tokenized in this cell.
tokenized_datasets = train_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

In [34]:
# The 'original' text column is still present after map(); show the example at index 1.
tokenized_datasets['original'][1]

'Fortunately, no one sustained injuries in these incidents.'

## Set up data for predictions

In [6]:
import pandas as pd

mt_path = 'test_data_2023/task1_sentence_level/en-de/test.ende.final.mt'
src_path = 'test_data_2023/task1_sentence_level/en-de/test.ende.final.src'


def _read_lines(path):
    """Return the lines of the text file at `path`, without line terminators."""
    # The `with` block closes the file on exit, so no explicit close() is needed.
    with open(path, 'r', encoding='utf-8') as f:
        return f.read().splitlines()


# Machine translations and their source sentences, one per line and aligned by
# line number. `src` must come from `src_path`: reading `mt_path` for both
# would make every pair's source identical to its translation.
mt = _read_lines(mt_path)
src = _read_lines(src_path)

# Build one {'mt': ..., 'src': ...} record per sentence pair for model.predict().
# If the two files disagree in length the pairs cannot be aligned, so `data`
# is left empty and a message is printed.
data = []
if len(mt) == len(src):
    data = [{'mt': m, 'src': s} for m, s in zip(mt, src)]
else:
    print('length of mt and src do not match')

In [9]:
# Score every {'mt', 'src'} record in `data` with the model as loaded; the result
# is kept as the baseline that the LoRA run is compared against further down.
base_model_output = model.predict(data, batch_size=8, gpus=1)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|████████████████████████████████████████████████████████| 238/238 [00:11<00:00, 19.94it/s]


In [None]:
# Display the full baseline prediction object (its 'scores' entry is used in the comparison below).
base_model_output

## Apply LoRA to model

In [11]:
# NOTE(review): `model` is loaded above without any quantization config, so confirm
# that prepare_model_for_kbit_training is actually wanted here. It appears to freeze
# the existing parameters: the trainable count printed after LoRA is applied
# (2,359,296) is lower than the 6,296,603 reported for the untouched model, so the
# layers that were trainable before are presumably frozen now — verify.
lora_model = prepare_model_for_kbit_training(model)

In [14]:
# LoRA settings: r=16 is the adapter rank, and adapters are attached to the modules
# selected by the names 'query', 'key' and 'value'. Every other LoraConfig option
# keeps its default.
config = LoraConfig(
        r=16, 
        target_modules = ['query','key','value']
    )

# Wrap the prepared model with the adapters described by `config`.
# The name `lora_model` is rebound to the wrapped result, so re-running this cell
# on its own would wrap an already wrapped model.
lora_model = get_peft_model(lora_model, config)

In [15]:
# Parameter counts again, this time for the LoRA-wrapped model.
print_trainable_parameters(lora_model)

trainable params: 2359296 || all params: 567496731 || trainable%: 0.4157373727673508


In [16]:
# NOTE(review): this calls predict() on `model`, not on `lora_model` created above.
# If get_peft_model injected the adapters into `model` in place the two are
# presumably equivalent, but confirm this really scores the LoRA-wrapped model.
lora_model_output = model.predict(data, batch_size=8, gpus=1)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|████████████████████████████████████████████████████████| 238/238 [00:12<00:00, 18.80it/s]


In [22]:
# Total absolute difference between the baseline and LoRA sentence scores.
# Absolute values are summed so that positive and negative differences cannot
# cancel each other out: a result of 0 means every pair of scores is identical.
# NOTE(review): 0.0 is presumably the expected value before any training, since a
# freshly added LoRA adapter should not change the model's outputs — confirm.
diff_sum = sum(
    abs(b - l)
    for b, l in zip(base_model_output['scores'], lora_model_output['scores'])
)
diff_sum

0.0