#Step 1: Install Required Libraries

In [2]:
!pip install transformers

from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from transformers import MarianMTModel, MarianTokenizer
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Load the first 100 sentences of the script (replace with your own dataset)
script = """
Swinging off branches, playing in valleys

I should be coddled in mother's lap everyday

"The nightingale that sings
from behind the tree"

"Should be with me as it reciprocates
with a 'koo' whenever I call her

"Each day as it dawns

mother, I should walk in your footsteps

"Swinging off branches, playing in valleys

"I should be coddled in mother's lap everyday"

A good day of hunting, my dear?

Yes, it was rather.

Do you believe it?

It's amazing. Who's the artist?

What? This chit of a girl?

I want to have this little package
on our mantelpiece.

Edward.
-Yes, sir.

The Lady is giving you a token of
appreciation for your childs song.

Take it, Loki.

At your service, Lady.

At your service, Lord.

Malli.

Loki, the money they have given you
was not for the song.

Theyve bought your daughter.

MALLI!

Malli.

Malli... Malli...

My child.

Please give me back my child.

I fall at your feet.

I beg you!

I beg you.
I fall at your feet.

Sergeant!

Do you comprehend the value
of the b*llet in your barrel?

It was manufactured in an English factory
using English metals.

It crossed the 7 seas in an English vessel.

By the time it reached
the barrel of your g*n

it cost One Pound.

One Pound Sterling!

And you would want to squander it on

brown rubbish?

Clear the road.

Please dont take my child away
Im begging you

Lady, please.

THE FIRE
"""

# Preprocess script by splitting into sentences
def preprocess_script(script):
    return script.strip().split('.')[:100]  # Taking first 100 sentences

sentences = preprocess_script(script)

# 1. Using mBART for translation
def translate_with_mbart(sentences, target_lang='es'):
    model_name = 'facebook/mbart-large-50-many-to-many-mmt'
    tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
    model = MBartForConditionalGeneration.from_pretrained(model_name)

    tokenizer.src_lang = 'en_XX'
    translated_sentences = []

    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
        generated_tokens = model.generate(**inputs, forced_bos_token_id=tokenizer.lang_code_to_id[target_lang])
        translated_sentences.append(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])

    return translated_sentences

# 2. Using MarianMT for translation
def translate_with_marian(sentences, target_lang='es'):
    model_name = f'Helsinki-NLP/opus-mt-en-{target_lang}'
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)

    translated_sentences = []

    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
        translated = model.generate(**inputs)
        translated_sentences.append(tokenizer.decode(translated[0], skip_special_tokens=True))

    return translated_sentences

# 3. Using M2M100 for translation
def translate_with_m2m(sentences, target_lang='es'):
    model_name = "facebook/m2m100_418M"
    tokenizer = M2M100Tokenizer.from_pretrained(model_name)
    model = M2M100ForConditionalGeneration.from_pretrained(model_name)

    tokenizer.src_lang = "en"
    translated_sentences = []

    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
        generated_tokens = model.generate(**inputs, forced_bos_token_id=tokenizer.get_lang_id(target_lang))
        translated_sentences.append(tokenizer.decode(generated_tokens[0], skip_special_tokens=True))

    return translated_sentences

# Translate to Spanish and German using each model
spanish_mbart = translate_with_mbart(sentences, 'es_XX')
german_mbart = translate_with_mbart(sentences, 'de_DE')

spanish_marian = translate_with_marian(sentences, 'es')
german_marian = translate_with_marian(sentences, 'de')

spanish_m2m = translate_with_m2m(sentences, 'es')
german_m2m = translate_with_m2m(sentences, 'de')

# Output translations
print(f"MBART Spanish: {spanish_mbart[:5]}")
print(f"MarianMT Spanish: {spanish_marian[:5]}")
print(f"M2M100 Spanish: {spanish_m2m[:5]}")




Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


tokenizer_config.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

MBART Spanish: ['Sin embargo, en el caso de los niños, la mayoría de los niños que viven en zonas rurales no tienen acceso a los servicios de salud.', '¿Creen? Es increíble.', '¿Quién es el artista? ¿Qué? Este chit de una chica?', 'El Presidente (habla en inglés): De conformidad con el entendimiento alcanzado en las consultas previas del Consejo, el Consejo de Seguridad comenzará ahora el examen del tema que figura en el orden del día.', '- Sí, señor.']
MarianMT Spanish: ['Deslizando las ramas, jugando en los valles debería ser mimado en el regazo de la madre todos los días "El ruiseñor que canta desde detrás del árbol" "Debe estar conmigo como corresponde con un \'koo\' cada vez que la llamo "Cada día que amanezca madre, debo caminar en tus pasos "Deslizando las ramas, jugando en valles" "Debería ser mimado en el regazo de la madre todos los días" ¿Un buen día de caza, querida? Sí, fue más bien', '¿Lo crees?', '¿Quién es el artista? ¿Qué? ¿Esta canción de una chica? Quiero tener este 

In [6]:
# Ground truth translation (use output from GPT-4 for ground truth)
# Example reference for first 5 sentences (wrapped in an additional list)
references = [
    ["Ground truth translation 1", "Ground truth translation 2"]  # Replace with actual ground truth
]

# Note: sacrebleu expects a list of lists, even if there is one reference per hypothesis
references = [[ref] for ref in references]

# 1. Compute BLEU Score
def compute_bleu(hypotheses, references):
    return corpus_bleu(hypotheses, references).score

# 2. Compute ROUGE Score
def compute_rouge(hypotheses, references):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    scores = [scorer.score(hyp, ref[0]) for hyp, ref in zip(hypotheses, references)]
    return scores

# Example: Evaluate Marian for Spanish
bleu_score = compute_bleu(spanish_marian, references)
rouge_scores = compute_rouge(spanish_marian, references)

print(f"BLEU Score: {bleu_score}")
print(f"ROUGE Scores: {rouge_scores}")


TypeError: BLEU: `refs` should be a sequence of sequence of strings.