In [22]:
# %cd /home/rana/Projects/zindi
%cd /root/zindi/
import yaml
with open('common/config.yaml', 'r') as f:
    config = yaml.load(f, Loader=yaml.SafeLoader)

/root/zindi


In [23]:
checkpoint = config.get('checkpoint')
new_model_path=config.get('new_model_path')+checkpoint
ct_model_path=config.get('ct_model_path')+checkpoint

# model_checkpoint="/root/zindi/models/marian/marian_output/"+checkpoint
# model_checkpoint_ct="/root/zindi/models/marian/ct/"+checkpoint

In [24]:
import tqdm as notebook_tqdm
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
from sacrebleu.metrics import BLEU
from tqdm import tqdm
import sentencepiece as spm
import ctranslate2

In [8]:
### Eval original model 

def translate(model, tokenizer, text, device):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=150).to(device)
    translated = model.generate(**inputs)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)

def calculate_bleu(references, hypotheses):
    bleu = BLEU()
    return bleu.corpus_score(hypotheses, [references]).score

def validate_model(model_name, dataset_name, split):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

    # Load dataset
    dataset = load_dataset(dataset_name, split=split)

    # Translate and calculate BLEU score
    hypotheses = []
    references = []

    for batch in tqdm(dataset, desc="Translating"):
        source_text = batch['translation']['dyu']  # Adjust this based on your dataset's column names
        reference = batch['translation']['fr']  # Adjust this based on your dataset's column names
        
        translation = translate(model, tokenizer, source_text, device)[0]
        # print(reference)
        # print(translation)
        
        hypotheses.append(translation)
        references.append(reference)
        # break

    bleu_score = calculate_bleu(references, hypotheses)
    print(f"BLEU Score: {bleu_score:.2f}")

# Run validation with GPU
# validate_model(model_checkpoint, "uvci/Koumankan_mt_dyu_fr", use_gpu=True)

# Run validation without GPU
validate_model(new_model_path, "uvci/Koumankan_mt_dyu_fr", "validation")

Using device: cuda


Translating:   0%|          | 0/1471 [00:00<?, ?it/s]


ValueError: `decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation.

In [26]:
### Eval ct model

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

sp_source_model = spm.SentencePieceProcessor(model_file=ct_model_path+'/source.spm')
sp_target_model = spm.SentencePieceProcessor(model_file=ct_model_path+'/target.spm')


def translate(translator, text):
    tokens = sp_source_model.encode(text, out_type=str)
    # print(tokens)
    tokens = ['dyu'] + tokens + ["</s>"] +['fr']
    try:
        results = translator.translate_batch([tokens])
        # The translated results are token strings, so we need to convert them to IDs before decoding
        translations = []
        for translation in results:
            decoded_text = sp_target_model.decode(translation.hypotheses[0])
            translations.append(decoded_text)
    except Exception as e:
        print(f"Translation error: {e}")
        translations = [""]  # Return empty string if translation fails
    return translations

def calculate_bleu(references, hypotheses):
    bleu = BLEU()
    return bleu.corpus_score(hypotheses, [references]).score

def validate_model(model_path, dataset_name, split):
    # Load CTranslate2 model
    translator = ctranslate2.Translator(model_path, device=device)
    # Load dataset
    dataset = load_dataset(dataset_name, split=split)
    # Translate and calculate BLEU score
    hypotheses = []
    references = []

    for batch in tqdm(dataset, desc="Translating"):
        source_text = batch['translation']['dyu']  # Adjust this based on your dataset's column names
        reference = batch['translation']['fr']  # Adjust this based on your dataset's column names
        
        # translation = translate(translator, tokenizer, source_text, device)[0]
        # tokenizer=None
        translation = translate(translator, source_text)[0]
        hypotheses.append(translation)
        references.append(reference)
        # print(translation, "----", reference)
        # break

    bleu_score = calculate_bleu(references, hypotheses)
    print(f"BLEU Score: {bleu_score:.2f}")

validate_model(ct_model_path, "uvci/Koumankan_mt_dyu_fr", "train")

Using device: cuda


Translating: 100%|██████████| 8065/8065 [02:44<00:00, 49.08it/s]


BLEU Score: 82.40
