# Neural Machine Translation

# Professor: Khaled Sayed
                       Presenter: 1. Thriyogya Kokirala            ID:00785999
                                  2. Poojitha Mandapati           ID:00797556


# Step 1: Load Dataset

In [3]:
import pandas as pd
data_file = 'language processing dataset/Language Detection.csv'
df = pd.read_csv(data_file)


In [4]:
input_texts = df['Text'].tolist()
target_languages = df['Language'].tolist()


# Step 2: Data Preprocessing


In [5]:
def preprocess_text(text):
    text = text.lower()
    text = ''.join(char for char in text if char.isalpha() or char.isspace())
    return text

preprocessed_texts = [preprocess_text(text) for text in input_texts]


# Step 3: Language Detection

In [6]:
!pip install langdetect




In [7]:
from langdetect import detect_langs

# Function to detect language with confidence threshold
def detect_language_with_threshold(input_text, confidence_threshold=0.8):
    try:
        lang_results = detect_langs(input_text)
        most_probable_lang = lang_results[0]
        
        if most_probable_lang.lang == 'unknown' or most_probable_lang.prob < confidence_threshold:
            return "Unknown"
        else:
            return most_probable_lang.lang
    except Exception as e:
        print(f"Error detecting language for text: {input_text}")
        return "Unknown"
    
for text in preprocessed_texts[:5]:
    detected_language = detect_language_with_threshold(text)
    print(f"Preprocessed Text: '{text}' | Detected Language: {detected_language}")


Preprocessed Text: ' nature in the broadest sense is the natural physical material world or universe' | Detected Language: en
Preprocessed Text: 'nature can refer to the phenomena of the physical world and also to life in general' | Detected Language: en
Preprocessed Text: 'the study of nature is a large if not the only part of science' | Detected Language: en
Preprocessed Text: 'although humans are part of nature human activity is often understood as a separate category from other natural phenomena' | Detected Language: en
Preprocessed Text: ' the word nature is borrowed from the old french nature and is derived from the latin word natura or essential qualities innate disposition and in ancient times literally meant birth' | Detected Language: en


# Step 4: Load Translation Model

In [8]:
!pip install sentencepiece




In [9]:
from transformers import MarianMTModel, MarianTokenizer

model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'  # English to Romance languages
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)


Downloading tokenizer_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

# Step 5: Text Translation Function


In [19]:

# Function to translate text
def translate_text(text, target_language='fr'):  # Specify the target language code
    # Tokenize input text
    input_ids = tokenizer.encode(text, return_tensors='pt')

    # Generate translation
    translation_ids = model.generate(input_ids, max_length=150, num_beams=5, early_stopping=True)
    
    # Decode and return the translation
    translated_text = tokenizer.decode(translation_ids[0], skip_special_tokens=True)
    return translated_text

# Example usage for translation
target_language_code = 'fr'  # Example: French
for i, text in enumerate(input_texts[:5]):
    translation = translate_text(text, target_language=target_language_code)
    print(f"Input Text ({target_language_code}): '{text}' | Translated Text: {translation}")


Input Text (fr): ' Nature, in the broadest sense, is the natural, physical, material world or universe.' | Translated Text: La naturaleza, au sens large, est el mundo natural, físico, material, o universo.
Input Text (fr): '"Nature" can refer to the phenomena of the physical world, and also to life in general.' | Translated Text: "Naturaleza" puede referirse a los fenómenos del mundo físico, y también a la vida en general.
Input Text (fr): 'The study of nature is a large, if not the only, part of science.' | Translated Text: El estudio de la naturaleza es una parte importante, se no la única, de la ciencia.
Input Text (fr): 'Although humans are part of nature, human activity is often understood as a separate category from other natural phenomena.' | Translated Text: Bien que les seres humanos formen parte de la naturaleza, la actividad humana se entend souvent como una categoría separada de otros fenómenos naturales.
Input Text (fr): '[1] The word nature is borrowed from the Old French

# Step 6: Calculate BLEU Score


In [None]:
pip install sacrebleu

In [8]:
import sacrebleu

reference_translations = [['reference_translation_1', 'reference_translation_2'], ['reference_translation_3', 'reference_translation_4']]

# Placeholder for `input_texts`
input_texts = ['input_text_1', 'input_text_2', 'input_text_3', 'input_text_4']

# the target language code
target_language_code = 'fr'

def translate_text(text, target_language):
    translated_text = f"Translated: {text}" 
    return translated_text

hypotheses = [translate_text(text, target_language=target_language_code) for text in input_texts]

references_combined = [" ".join(refs) for refs in reference_translations]

# Calculate BLEU score using sacrebleu
bleu_score = sacrebleu.corpus_bleu(hypotheses, [references_combined])
print(f"BLEU Score: {bleu_score.score}")


BLEU Score: 5.379525625492818
