In [1]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import warnings
warnings.filterwarnings('ignore')

# Load dataset
print("### Loading Dataset ###")
data = {
    'english': [
        'Hello, how are you?',
        'What is your name?',
        'The weather is nice today',
        'I love machine learning',
        'This is a simple test'
    ],
    'french': [
        'Bonjour, comment allez-vous?',
        'Comment vous appelez-vous?',
        'Le temps est beau aujourd\'hui',
        'J\'aime l\'apprentissage automatique',
        'Ceci est un test simple'
    ]
}
df = pd.DataFrame(data)
display(df)

# Statistical Machine Translation (SMT) - Dictionary Based
print("### SMT Translation ###")
smt_dict = {
    'Hello': 'Bonjour', 'how': 'comment', 'are': 'allez', 'you': 'vous',
    'What': 'Que', 'is': 'est', 'your': 'votre', 'name': 'nom',
    'the': 'le', 'weather': 'temps', 'nice': 'beau', 'today': "aujourd'hui",
    'I': 'Je', 'love': 'aime', 'machine': 'machine', 'learning': 'apprentissage',
    'this': 'ceci', 'a': 'un', 'simple': 'simple', 'test': 'test'
}

def smt_translate(sentence):
    words = sentence.split()
    translated = [smt_dict.get(word.rstrip('?,.!').lower().capitalize(), word) for word in words]
    return ' '.join(translated)

df['smt_translation'] = df['english'].apply(smt_translate)
display(df[['english', 'french', 'smt_translation']])

# BLEU Score Calculation for SMT
smoother = SmoothingFunction().method1
df['smt_bleu'] = df.apply(lambda row: sentence_bleu([row['french'].split()], row['smt_translation'].split(), smoothing_function=smoother), axis=1)
print(f"Average BLEU Score for SMT: {df['smt_bleu'].mean():.2f}")

# Installing required libraries
!pip install -q transformers torch

# Seq2Seq Translation
print("### Seq2Seq Translation ###")
from transformers import pipeline
seq2seq_translator = pipeline('translation_en_to_fr', model='Helsinki-NLP/opus-mt-en-fr')

def seq2seq_translate(text):
    return seq2seq_translator(text, max_length=50)[0]['translation_text']

df.loc[:2, 'seq2seq_translation'] = df.loc[:2, 'english'].apply(seq2seq_translate)
display(df[['english', 'french', 'seq2seq_translation']].head(3))

df.loc[:2, 'seq2seq_bleu'] = df.loc[:2].apply(lambda row: sentence_bleu([row['french'].split()], row['seq2seq_translation'].split(), smoothing_function=smoother), axis=1)
print(f"Average BLEU Score for Seq2Seq: {df.loc[:2, 'seq2seq_bleu'].mean():.2f}")

# Transformer Model (T5)
print("### T5 Translation ###")
t5_translator = pipeline('translation_en_to_fr', model='t5-small')

def t5_translate(text):
    return t5_translator(text, max_length=50)[0]['translation_text']

df.loc[:2, 't5_translation'] = df.loc[:2, 'english'].apply(t5_translate)
display(df[['english', 'french', 't5_translation']].head(3))

df.loc[:2, 't5_bleu'] = df.loc[:2].apply(lambda row: sentence_bleu([row['french'].split()], row['t5_translation'].split(), smoothing_function=smoother), axis=1)
print(f"Average BLEU Score for T5: {df.loc[:2, 't5_bleu'].mean():.2f}")

# M2M-100 Translation
print("### M2M-100 Translation ###")
m2m_translator = pipeline('translation', model='facebook/m2m100_418M')

def m2m_translate(text):
    return m2m_translator(text, src_lang='en', tgt_lang='fr', max_length=50)[0]['translation_text']

df.loc[:2, 'm2m_translation'] = df.loc[:2, 'english'].apply(m2m_translate)
display(df[['english', 'french', 'm2m_translation']].head(3))

df.loc[:2, 'm2m_bleu'] = df.loc[:2].apply(lambda row: sentence_bleu([row['french'].split()], row['m2m_translation'].split(), smoothing_function=smoother), axis=1)
print(f"Average BLEU Score for M2M-100: {df.loc[:2, 'm2m_bleu'].mean():.2f}")

# Results Comparison
print("### BLEU Score Comparison ###")
results = pd.DataFrame({
    'Model': ['SMT', 'Seq2Seq', 'T5', 'M2M-100'],
    'BLEU Score': [
        df['smt_bleu'].mean(),
        df.loc[:2, 'seq2seq_bleu'].mean(),
        df.loc[:2, 't5_bleu'].mean(),
        df.loc[:2, 'm2m_bleu'].mean()
    ]
})
display(results.sort_values('BLEU Score', ascending=False))

ModuleNotFoundError: No module named 'nltk'