### Research \& Development Project
___
This is a step-by-step guide to what I did during the project. Following these steps will reproduce the results
that I arrived at.<br>
#### Part 1: Setup
* __Step 1.1__: Set up the necessary parameters, create the virtual environment, and install the necessary packages.<br>
Once this is done, set the kernel to that environment.<br>
For this to work, follow the instructions in README.txt.

* __Step 1.2__: Load all libraries that will be used.

In [7]:
from transformers import (AutoModelForSequenceClassification as amfsc, 
                         AutoTokenizer as at, 
                         AutoModelForCausalLM)
from accelerate import Accelerator
import ctranslate2
import sentencepiece as spm
import sacrebleu
import torch
import sys
import os

* __Step 1.3__: Retrieve and convert OpenNMT's NLLB transformer.

In [None]:
# Retrieve the model from https://www.opennmt.net/Models-py/
# Download the OpenNMT-py NLLB-200 3.3B checkpoint and the matching SentencePiece model
!wget https://s3.amazonaws.com/opennmt-models/nllb-200/nllb-200-3.3B-onmt.pt
!wget https://s3.amazonaws.com/opennmt-models/nllb-200/flores200_sacrebleu_tokenizer_spm.model
# Collect both downloads in nmt_resources/
!mkdir nmt_resources
!mv nllb-200-3.3B-onmt.pt flores200_sacrebleu_tokenizer_spm.model nmt_resources/
# Convert the checkpoint to a CTranslate2 model with int8 quantization;
# the output directory is the one loaded as `ct_model_path` in Step 2.1
!ct2-opennmt-py-converter --model nmt_resources/nllb-200-3.3B-onmt.pt --quantization int8 --output_dir nmt_resources/nllb-200-3.3B-int8

In [35]:
# Submit execute.sh as a batch job (the script itself is not part of this notebook)
!sbatch execute.sh

Submitted batch job 594751 on cluster pelle


In [None]:
#TEMPORARY CELL! Remove before handing in project!
# List the queued/running batch jobs of user mame0175 (hard-coded user name)
!squeue -u mame0175

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            594751       gpu execute. mame0175 PD       0:00      1 (Priority)


#### Part 2: Translation
___
The following chunks of code will load the models, translate the source text, and store the translated target text.
* __Step 2.1__: Load NMT.

In [None]:
#--------------------Part 2.1: LOAD NMT--------------------#
# Pick the compute device, then load the SentencePiece model and the
# CTranslate2 translator that the later translation step uses.

# 2.1.0: Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# 2.1.1: Locations of the subword model and the converted translation model
sp_model_path = 'nmt_resources/flores200_sacrebleu_tokenizer_spm.model'
ct_model_path = 'nmt_resources/nllb-200-3.3B-int8'

# 2.1.2: Load the SentencePiece model used for subword (de)tokenization
sp = spm.SentencePieceProcessor()
sp.load(sp_model_path)

# 2.1.3: Instantiate the translator on the selected device
translator = ctranslate2.Translator(ct_model_path, device=device)

# Report the device in use and the installed CTranslate2 version
print(device)
print(ctranslate2.__version__)

* __Step 2.2__: Load LLM.

In [None]:
#--------------------Part 2.2: LOAD LLM--------------------#
# Pick the compute device, load the Llama-2 tokenizer and model, and report
# how much GPU memory is in use afterwards.

# 2.2.0: Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# 2.2.1: Instantiate tokenizer and model
# NOTE(review): the tokenizer is loaded from the "-chat-hf" checkpoint while the
# model is loaded from the base "-hf" checkpoint -- confirm this is intentional.
tokenizer = at.from_pretrained('meta-llama/Llama-2-13b-chat-hf')
# Fall back to the EOS token for padding when no pad token is defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    'meta-llama/Llama-2-13b-hf',
    dtype=torch.float16,
    device_map='auto',  # let the library decide where to place the weights
)

# 2.2.2: Report reserved / allocated / unused memory on CUDA device 0 only
bytes_per_gib = 1024**3
resrv_mem = torch.cuda.memory.memory_reserved(0)
alloc_mem = torch.cuda.memory.memory_allocated(0)
unused_mem = resrv_mem - alloc_mem
print(f"\n Reserved memory: {resrv_mem / bytes_per_gib:.2f} GB\n"
      f" Allocated memory: {alloc_mem / bytes_per_gib:.2f} GB\n"
      f" Unused memory: {unused_mem / bytes_per_gib:.2f} GB\n")

* __Step 2.3__: Translate source text using the NMT.

In [None]:
#--------------------PART 2.3: NMT--------------------#
# Encode the input, translate the source text, decode the output.
# Relies on `sp` (SentencePiece processor) and `translator` (CTranslate2
# translator) created in Step 2.1.

# 2.3.0: Define source and target languages (tags prepended to the subword sequences)
src_lang = 'eng_Latn'
tgt_lang = 'deu_Latn'

# 2.3.1: Define paths
inp_path = 'paraphrases/eng_Latn.txt'
out_path = 'translations/nmt_Latn.de'

beam_size = 4

# 2.3.2: Open and read input file; one stripped source sentence per line
with open(inp_path, 'r', encoding='utf-8') as f:
    source_sents = [line.strip() for line in f]
if not source_sents:
    # Fail with a clear message instead of an IndexError further down
    raise ValueError(f'No source sentences found in {inp_path}.')

# 2.3.3: Define source and target prefixes
print(src_lang, source_sents[0], sep=' --> ')
# One independent prefix list per sentence (avoids aliasing a single inner list)
target_prefix = [[tgt_lang] for _ in source_sents]

# 2.3.4: Subword source sentences and wrap them as <src_lang> ... </s>
source_sents_subword = sp.encode_as_pieces(source_sents)
source_sents_subword = [[src_lang] + sent + ['</s>'] for sent in source_sents_subword]

# 2.3.5: Translate source sentences
# The translator from Step 2.1 is reused; instantiating it again here would
# hold a second copy of the model.
translations = translator.translate_batch(source_sents_subword,
                                          batch_type='tokens',
                                          max_batch_size=2048,
                                          beam_size=beam_size,
                                          target_prefix=target_prefix)
# Keep only the first hypothesis of every result
translations = [translation.hypotheses[0] for translation in translations]

# 2.3.6: Desubword target sentences
translations_desubword = sp.decode(translations)
# Drop the leading target-language tag, but only when it is actually present
translations_desubword = [
    (sent[len(tgt_lang):] if sent.startswith(tgt_lang) else sent).strip()
    for sent in translations_desubword
]
print(tgt_lang, translations_desubword[0], sep=' --> ')

# 2.3.7: Create target directory
try:
    os.mkdir('translations')
except FileExistsError:
    print('Directory already exists. Proceeds as normal.')

# 2.3.8: Write translation to output file, one sentence per line
with open(out_path, 'w', encoding='utf-8') as f:
    f.writelines(line.strip() + '\n' for line in translations_desubword)
print(f'Translations successfully executed and saved to: {out_path}.')

* __Step 2.4__: Translate the source text using the LLM.

In [None]:
#--------------------PART 2.4: LLM--------------------#
# Formalize the prompt, set the temperature, translate the source text.
# Relies on `tokenizer` and `model` created in Step 2.2.

# 2.4.0: Define source and target languages (plain names; they are inserted into the prompt)
src_lang = 'English'
tgt_lang = 'German'

# 2.4.1: Define paths
# NOTE(review): the file name is labelled with sys.argv[2] while the sampling
# temperature is read from sys.argv[1] -- confirm both arguments are passed as intended.
inp_path = 'paraphrases/eng_Latn.txt'
out_path = f'translations/llm_t{sys.argv[2]}_Latn.de'

# Parse the temperature once, before the output file is opened (and truncated),
# so a malformed argument cannot leave an empty result file behind.
temperature = float(sys.argv[1])

# 2.4.2: Load source text
with open(inp_path, 'r', encoding='utf-8') as f:
    lines = [line.strip() for line in f]

# 2.4.3: Create target directory
try:
    os.mkdir('translations')
except FileExistsError:
    print('Directory already exists. Proceeds as normal.')

# 2.4.4: Set batch size and provide prompt to model
# NOTE(review): an earlier comment on this line said a batch size of 10 was
# chosen through testing; the value actually used is 1 -- confirm which is intended.
batch_size = 1
# At most the first 100 lines are translated; never iterate past the end of
# the input, which would send empty prompts to the model.
max_lines = 100
n_lines = min(max_lines, len(lines))

# 2.4.6: Inputs are moved to the device holding the model's first parameter
# (looked up once; it does not change between batches)
device = next(model.parameters()).device

with open(out_path, 'w', encoding='utf-8') as f:
    for i in range(0, n_lines, batch_size):
        # One source line per row, each terminated by a newline
        batch = ''.join(line + '\n' for line in lines[i:i+batch_size])
        prompt = f'''
                  Translate the following text from {src_lang} to {tgt_lang}.
                  Output exactly the same number of lines as the input.
                  Output only the translation, with no additional text.
                  INPUT:
                  {batch}
                  OUTPUT:
                  '''

        # 2.4.5: Set input parameters
        inputs = tokenizer(
            prompt,
            return_tensors='pt',
            padding=True
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # 2.4.7: Set output parameters; temperature changes for each output
        # NOTE(review): temperature only influences generation when sampling is
        # enabled; that is left to the model's generation config here -- confirm.
        outputs = model.generate(**inputs,
                                 max_new_tokens=1024,
                                 temperature=temperature
        )

        # 2.4.8: Decode only the newly generated tokens (everything after the prompt)
        translation = tokenizer.decode(
            outputs[0][inputs['input_ids'].shape[-1]:],
            skip_special_tokens=True
        ).strip()

        # 2.4.9: Write to file
        f.write(translation + '\n')
print(f'Translations successfully executed and saved to: {out_path}.')

#### Part 3: Evaluate
___
The following chunks of code produce a multi-reference BLEU score and run a Reward Model with the NMT & LLM translations as the input preference pair.
* __Step 3.1__: Set up the BLEU score script.

In [None]:
# Compute single- and multi-reference BLEU for one candidate translation file
# and store the scores in a text file.

# 3.1.0: Set variables
# The candidate file name is built as '<prfx>_<temp>Latn.de' from the two
# command-line arguments.
prfx = sys.argv[1]
temp = sys.argv[2]
split = f'{prfx}_{temp}'
cand_path = f'translations/{split}Latn.de'
refs_path = 'paraphrases/'
out_dir = 'scores/'
score_path = os.path.join(out_dir, f'{split.upper()}scores.txt')

# 3.1.1: Load candidate text (line terminators removed)
with open(cand_path, 'r', encoding='utf-8') as f:
    cand = [line.rstrip('\n') for line in f]

# 3.1.2: Load reference texts
# Sorted so that "BLEU score <i>" refers to the same reference file on every
# run; os.listdir returns entries in arbitrary order.
ref_names = sorted(name for name in os.listdir(refs_path)
                   if name.startswith('deu_Latn'))
if not ref_names:
    raise FileNotFoundError(f'No reference files starting with "deu_Latn" found in {refs_path}.')

refs = []
for name in ref_names:
    with open(os.path.join(refs_path, name), 'r', encoding='utf-8') as f:
        ref = [line.rstrip('\n') for line in f]
    # Every reference must be line-aligned with the candidate
    if len(ref) != len(cand):
        raise ValueError(
            f'{name} has {len(ref)} lines but {cand_path} has {len(cand)}.'
        )
    refs.append(ref)

try:
    os.mkdir(out_dir)
except FileExistsError:
    print('Directory already exists. Proceeds as normal.')

# 3.1.3: Store the multi-reference BLEU score
with open(score_path, 'w', encoding='utf-8') as f:
    # One score per individual reference ...
    for i, r in enumerate(refs):
        ind_score = sacrebleu.corpus_bleu(cand, [r])
        f.write(f'BLEU score {i+1}: {ind_score}\n')
    # ... followed by the score against all references together
    tot_score = sacrebleu.corpus_bleu(cand, refs)
    f.write(f'Total BLEU score: {tot_score}')

# Report the path that was actually written
print(f'\nBLEU score successfully computed!\nCheck "{score_path}" for results.')

* __Step 3.2__: Load the Reward Model.

In [None]:
# WIP!!!!
# Load the reward model and its tokenizer on the device selected by Accelerate.
model_name = 'Skywork/Skywork-Reward-V2-Llama-3.1-8B'

accelerator = Accelerator()
device = accelerator.device

# Note: this rebinds the name `tokenizer`
tokenizer = at.from_pretrained(model_name)

# Sequence-classification head with a single output label, loaded in bfloat16
rm = amfsc.from_pretrained(
    model_name,
    num_labels=1,
    attn_implementation='flash_attention_2',
    device_map=device,
    torch_dtype=torch.bfloat16,
)

* __Step 3.3__: Execute the Reward Model and save preferences.

In [None]:
# TODO: placeholder -- executing the reward model and saving preferences is not implemented yet
pass