In [3]:
import torch
import json
import os
import pandas as pd
from pathlib import Path
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, MT5ForConditionalGeneration, T5TokenizerFast, AutoModelForSeq2SeqLM

In [4]:
from llm_research.inference import run_inference
from llm_research.evaluation import calculate_token_level_match_accuracy, calculate_exact_match_accuracy



In [5]:
from constants import(
    MODEL_MAPPING,
    EVAL_LANGUAGES
)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [6]:
# Inference / text-generation settings shared by the cells below.
BATCH_SIZE = 1  # forwarded to run_inference(batch_size=...) in call_run_inference
MAX_SEQ_LEN = 512  # used as "max_length" in generation_config
TEMPERATURE = 0.9  # used as "temperature" in generation_config
TOP_K = 50  # used as "top_k" in generation_config
TOP_P = 0.95  # used as "top_p" in generation_config
NUM_RETURN_SEQ = 1  # used as "num_return_sequences" in generation_config

In [7]:
# Tasks that run_task_lang_combos skips entirely (it reads this module-level
# list directly). Uncomment an entry to exclude that task from the sweep.
EXCLUDE_TASKS = [
    'wikiann',
    # 'sib200',
    # 'xnli'
]

# Default value for run_task_lang_combos(exclude_langs=...): language codes
# listed here are skipped for every task. Empty means "evaluate all languages".
EXCLUDE_LANGS = [
    # 'ar',
    # 'de',
    # 'es',
    # 'el',
    # 'fr',
    # 'ru'
]

# NOTE(review): STORE_RESULTS is not referenced by any of the cells visible
# here; result persistence is actually controlled by passing results_dict_path
# to run_task_lang_combos. Confirm whether this flag is still needed.
STORE_RESULTS = True


In [8]:
def call_run_inference(
    task,
    language,
    model_path,
    token_level_eval,
    generation_config
):
    """Run inference for one task/language pair and score the predictions.

    The ground-truth file is ``data/{task}/{task}_{language}_validation.json``
    and the predictions file (passed to ``run_inference`` as ``out_path``) is
    ``{model_path.name}-{task}-{language}-predictions.json`` in the current
    working directory.

    Args:
        task: Task name used to build the dataset and prediction file names.
        language: Language code used to build the same file names.
        model_path: Path-like object with a ``.name`` attribute; passed to
            ``run_inference`` as ``model_id``. The model is treated as a PEFT
            model when the substring ``"lora"`` occurs in ``str(model_path)``.
        token_level_eval: If truthy, score with
            ``calculate_token_level_match_accuracy``; otherwise with
            ``calculate_exact_match_accuracy``.
        generation_config: Passed through unchanged to ``run_inference``.

    Returns:
        A ``(accuracy, mismatch)`` tuple. ``mismatch`` is the second value
        returned by the token-level metric, or ``None`` for exact-match scoring.
    """
    ground_truth_path = Path(f"data/{task}/{task}_{language}_validation.json")
    predictions_path = Path(f"{model_path.name}-{task}-{language}-predictions.json")

    # LoRA checkpoints are recognised purely by their path containing "lora".
    uses_peft = "lora" in str(model_path)

    run_inference(
        dataset_path=ground_truth_path,
        model_id=model_path,
        is_peft=uses_peft,
        is_qlora=False,
        batch_size=BATCH_SIZE,
        generation_config=generation_config,
        out_path=predictions_path,
    )

    if not token_level_eval:
        # Exact-match scoring has no mismatch statistic to report.
        accuracy = calculate_exact_match_accuracy(
            ground_truth_file=ground_truth_path,
            prediction_file=predictions_path,
        )
        return accuracy, None

    # Token-level scoring yields both the accuracy and a mismatch statistic.
    accuracy, mismatch = calculate_token_level_match_accuracy(
        ground_truth_file=ground_truth_path,
        prediction_file=predictions_path,
    )
    return accuracy, mismatch

In [9]:
def run_task_lang_combos(
    model_path,
    tokenizer_path,
    generation_config,
    tasks = EVAL_LANGUAGES.keys(),
    exclude_langs = EXCLUDE_LANGS,
    results_dict_path = None,
):
    """Evaluate one model on every requested task/language combination.

    For each task in ``tasks`` that is not listed in ``EXCLUDE_TASKS``, and for
    each language in ``EVAL_LANGUAGES[task]`` that is not in ``exclude_langs``,
    this calls ``call_run_inference`` and prints the resulting accuracy.

    When ``results_dict_path`` is given, the file is first overwritten with a
    header (``model``, ``tokenizer``, ``config``) and then re-read and
    re-written after every combination, so partial results survive an
    interrupted run. Results are stored as
    ``results[task][language] = {"token_level_eval": ..., "accuracy": ...}``.

    Args:
        model_path: Path-like object with a ``.name`` attribute identifying the
            model; forwarded to ``call_run_inference``.
        tokenizer_path: Only recorded (as a string) in the results header.
        generation_config: Generation settings forwarded to
            ``call_run_inference``; must be JSON-serialisable when
            ``results_dict_path`` is set, since it is dumped into the header.
        tasks: Iterable of task names; each must be a key of
            ``EVAL_LANGUAGES``. Defaults to all tasks.
        exclude_langs: Language codes to skip for every task.
        results_dict_path: Optional path of the JSON results file. If ``None``
            nothing is written to disk and results are only printed.

    Returns:
        None.
    """
    results_dict = {
        'model' : str(model_path),
        'tokenizer' : str(tokenizer_path),
        'config' : generation_config,
    }

    # Start from a fresh results file that contains only the run metadata.
    if results_dict_path is not None:
        with open(results_dict_path, 'w', encoding='utf-8') as f:
            json.dump(results_dict, f, indent=4)

    for task in tasks:
        if task in EXCLUDE_TASKS:
            print(f"Skipping: {task}")
            continue

        # wikiann is the only task scored at token level; all others use
        # exact-match accuracy.
        token_level_eval = task == 'wikiann'

        for language in EVAL_LANGUAGES[task]:
            if language in exclude_langs:
                print(f"Skipping language {language}")
                continue

            exact_match_accuracy, mismatch_count = call_run_inference(
                task=task,
                language=language,
                token_level_eval=token_level_eval,
                model_path=model_path,
                generation_config=generation_config,
            )
            print(f"Mismatch count for {model_path.name} - {task} - {language} : {mismatch_count}")

            if results_dict_path is not None:
                # Re-read the file so the on-disk state stays the source of truth.
                with open(results_dict_path, 'r', encoding='utf-8') as f:
                    results_dict = json.load(f)

                # Look the language up inside its task's sub-dict. The previous
                # code tested the top-level keys instead, which reset the entry
                # on every visit and could raise KeyError if a language code
                # coincided with a top-level key.
                language_results = results_dict.setdefault(task, {}).setdefault(language, {})
                language_results['token_level_eval'] = token_level_eval
                language_results['accuracy'] = exact_match_accuracy

                with open(results_dict_path, 'w', encoding='utf-8') as f:
                    json.dump(results_dict, f, indent=4)

            print(f"Accuracy for {model_path.name} - {task} - {language} : {exact_match_accuracy} (token_level = {token_level_eval})")

In [10]:
# Select the model to evaluate. MODEL_MAPPING comes from the local `constants`
# module; the commented-out line below is an alternative checkpoint path
# (its "lora" substring would make call_run_inference treat it as a PEFT model).
model_path = Path(MODEL_MAPPING['mt0-large'])
# model_path = Path("output/sib200/experiment_mt0-large_sib200_lora_ar")
# Only recorded as a string in the results header by run_task_lang_combos.
tokenizer_path = model_path / "tokenizer_config.json"

# The tokenizer is loaded here solely to obtain its EOS token id for the
# generation settings below.
tokenizer = T5TokenizerFast.from_pretrained(model_path)

# Generation settings forwarded to run_inference via call_run_inference.
# NOTE(review): in Hugging Face `generate`, temperature/top_k/top_p normally
# only take effect when sampling is enabled (do_sample=True). Whether
# run_inference enables sampling is not visible here — confirm.
generation_config = {
        "max_length": MAX_SEQ_LEN,
        "temperature": TEMPERATURE,
        "top_k": TOP_K,
        "top_p": TOP_P,
        "num_return_sequences": NUM_RETURN_SEQ,
        "eos_token_id": tokenizer.eos_token_id,
    }

In [11]:
# JSON file (relative to the current working directory) into which
# run_task_lang_combos writes the run metadata and per-task/language accuracies.
results_dict_path = Path(f"{model_path.name}_accuracies_all_langs.json")

In [12]:
# run_task_lang_combos(model_path=model_path, tokenizer_path=tokenizer_path, generation_config=generation_config, results_dict_path=results_dict_path)