# English Grammar Correction

In [18]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the English grammar correction model
model_name = "vennify/t5-base-grammar-correction"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Function to correct grammar for English
def correct_grammar(text):
    input_text = "grammar: " + text
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(inputs['input_ids'], max_length=512, num_beams=5, early_stopping=True)
    corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return corrected_text

sentence = "In earlier, you would undoubtedly phone travel agent to book."
corrected_sentence = correct_grammar(sentence)
print(f"Original: {sentence}")
print(f"Corrected: {corrected_sentence}")

Original: In earlier, you would undoubtedly phone travel agent to book.
Corrected: In earlier days, you would undoubtedly phone a travel agent to book.


# MBART model for Grammar Correction

In [1]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

#### fails for English to English

In [2]:
# translate English to English
article_en = "Hey Ram how are you?."

tokenizer.src_lang = "en_XX"
encoded_ar = tokenizer(article_en, return_tensors="pt")
generated_tokens = model.generate(
    **encoded_ar,
    forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"]
)
result = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
print(result)

tokenizer.src_lang = "en_XX"
encoded_ar = tokenizer(article_en, return_tensors="pt")
generated_tokens = model.generate(
    **encoded_ar,
    forced_bos_token_id=tokenizer.lang_code_to_id["hi_IN"]
)
result = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
print(result)

tokenizer.src_lang = "hi_IN"
encoded_ar = tokenizer(result, return_tensors="pt")
generated_tokens = model.generate(
    **encoded_ar,
    forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"]
)
result = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
result

['හේයි රම් කොහොමද ඔයාට?']
['अरे राम तुम कैसे हो?.']


['Oh, Ram, how are you?']

### English to Hindi Translation for Different Addressing Terms

In [11]:
# translate English to Hindi
article_en = "Hello brother, how are you?"

tokenizer.src_lang = "en_XX"
encoded_ar = tokenizer(article_en, return_tensors="pt")
generated_tokens = model.generate(
    **encoded_ar,
    forced_bos_token_id=tokenizer.lang_code_to_id["hi_IN"]
)
result = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
print(result)

tokenizer.src_lang = "hi_IN"
encoded_ar = tokenizer(result, return_tensors="pt")
generated_tokens = model.generate(
    **encoded_ar,
    forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"]
)
result = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
result

['भाइयों, आप कैसे हैं?']


['Brothers, how are you?']

In [10]:
# translate English to Hindi
article_en = "Hello sir, how are you?"

tokenizer.src_lang = "en_XX"
encoded_ar = tokenizer(article_en, return_tensors="pt")
generated_tokens = model.generate(
    **encoded_ar,
    forced_bos_token_id=tokenizer.lang_code_to_id["hi_IN"]
)
result = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
print(result)

tokenizer.src_lang = "hi_IN"
encoded_ar = tokenizer(result, return_tensors="pt")
generated_tokens = model.generate(
    **encoded_ar,
    forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"]
)
result = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
result

['नमस्ते सर, आप कैसे हैं?']


['Hello sir, how are you?']

In [75]:
article_hi = "संयुक्त राष्ट् के प्रमुख का कहन है कि सीरिया मे कोई सैन्य समाधान नहीं ह"

tokenizer.src_lang = "hi_IN"
encoded_hi = tokenizer(article_hi, return_tensors="pt")
generated_tokens = model.generate(
    **encoded_hi,
    forced_bos_token_id=tokenizer.lang_code_to_id["hi_IN"]
)
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

['संयुक्त राष्ट्र के प्रमुख ने कहा कि सीरिया में कोई सैन्य समाधान नहीं है']

In [61]:
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

def correct_grammar_mbart(text, src_lang):
    tokenizer.src_lang = src_lang
    encoded_input = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    
    generated_tokens = model.generate(
        **encoded_input,
        max_length=512,
        num_beams=5,
        early_stopping=True,
        forced_bos_token_id=tokenizer.lang_code_to_id[src_lang]
    )
    
    corrected_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
    return corrected_text

# Test sentences in different languages
sentences = {
    "Hindi": ("वह हर दिन स्कूल जात है।", "hi_IN"),
    "French": ("Je la mode.", "fr_XX"),
    "Spanish": ("Madrid eres una ciudad hermosa.", "es_XX")
}

# Loop through the sentences and correct grammar using MBART
for language, (sentence, lang_code) in sentences.items():
    corrected_sentence = correct_grammar_mbart(sentence, lang_code)
    print(f"Original ({language}): {sentence}")
    print(f"Corrected ({language}): {corrected_sentence}\n")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Original (Hindi): वह हर दिन स्कूल जात है।
Corrected (Hindi): वह हर दिन स्कूल जाता है।

Original (French): Je la mode.
Corrected (French): Je suis en mode.

Original (Spanish): Madrid eres una ciudad hermosa.
Corrected (Spanish): Madrid es una ciudad bella.



In [5]:
import ipywidgets as widgets
from IPython.display import display, HTML
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

# Detect errors in the original text and highlight them in the corrected text
def highlight_errors(original, corrected):
    original_tokens = original.split()
    corrected_tokens = corrected.split()
    highlighted = []
    for o_token, c_token in zip(original_tokens, corrected_tokens):
        if o_token != c_token:
            highlighted.append(f"<del style='color:red;'>{o_token}</del> <span style='color:green;'>{c_token}</span>")
        else:
            highlighted.append(o_token)
    return " ".join(highlighted)

def correct_grammar_mbart(text, src_lang):
    tokenizer.src_lang = src_lang
    encoded_input = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    
    generated_tokens = model.generate(
        **encoded_input,
        max_length=512,
        num_beams=5,
        early_stopping=True,
        forced_bos_token_id=tokenizer.lang_code_to_id[src_lang]
    )
    
    corrected_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
    return corrected_text

# Test sentences in different languages
sentences = {
    "Hindi": ("वह हर दिन स्कूल जात है।", "hi_IN"),
    "French": ("Je la mode.", "fr_XX"),
    "Spanish": ("Madrid eres una ciudad hermosa.", "es_XX")
}

# Loop through the sentences and correct grammar using MBART
for language, (sentence, lang_code) in sentences.items():
    corrected_sentence = correct_grammar_mbart(sentence, lang_code)
    highlighted_text = highlight_errors(sentence, corrected_sentence)
    
    display(HTML(f"""
    <div style="margin-bottom: 10px;">
        <b>{language} (Original with corrections):</b><br>
        <div>{highlighted_text}</div>
    </div>
    """))


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
