# Entity-Aware Machine Translation (EA-MT) with NER

This notebook demonstrates a complete pipeline for entity-aware English-to-French machine translation using multi-task learning with NER. It integrates all code from the `src` folder, including data preparation, baseline translation, entity-aware translation, fine-tuning, and evaluation.

---

## 1. Setup & Environment Check
Check for GPU availability and install required packages.


In [1]:
import torch
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU name:", torch.cuda.get_device_name(0))
    print("Device count:", torch.cuda.device_count())
    print("Current device:", torch.cuda.current_device())
else:
    print("No GPU detected. Training will use CPU.")


CUDA available: True
GPU name: NVIDIA GeForce RTX 4070 Laptop GPU
Device count: 1
Current device: 0


## 2. Data Preparation
Prepare the dataset for NER and translation.


In [5]:
import pandas as pd
import requests
from nltk.tokenize import word_tokenize
import nltk

# Ensure NLTK data is available
try:
    nltk.data.find('tokenizers/punkt')
except nltk.downloader.DownloadError:
    nltk.download('punkt')

def get_label_from_wikidata(qid):
    """Fetches the English label for a given Wikidata QID."""
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes
        data = response.json()
        return data['entities'][qid]['labels']['en']['value']
    except (requests.exceptions.RequestException, KeyError, ValueError) as e:
        print(f"Could not fetch label for {qid}: {e}")
        return None

def create_qid_to_label_mapping(df):
    """Creates a mapping from QID to its English label."""
    all_qids = set(qid for entity_list in df['entities'] for qid in entity_list)
    qid_to_label = {qid: get_label_from_wikidata(qid) for qid in all_qids}
    return qid_to_label

def tokenize_and_iob(row, qid_to_label):
    """Tokenizes source text and creates IOB tags for entities."""
    text = row['source']
    tokens = word_tokenize(text)
    labels = ['O'] * len(tokens)

    for qid in row['entities']:
        entity_text = qid_to_label.get(qid)
        if not entity_text:
            continue
        
        entity_tokens = word_tokenize(entity_text)
        if not entity_tokens:
            continue

        # Find entity in tokens and apply IOB tags
        for i in range(len(tokens) - len(entity_tokens) + 1):
            if tokens[i:i+len(entity_tokens)] == entity_tokens:
                labels[i] = 'B-ENT'
                for j in range(1, len(entity_tokens)):
                    labels[i+j] = 'I-ENT'
                break  # Move to the next qid once tagged

    return list(zip(tokens, labels))

def prepare_data(file_path):
    """
    Loads data from a JSONL file and prepares it for NER and translation.
    
    Args:
        file_path (str): The path to the .jsonl file.
        
    Returns:
        pandas.DataFrame: A DataFrame with an added 'token_iob' column.
    """
    print("Loading data...")
    df = pd.read_json(file_path, lines=True)
    
    print("Fetching entity labels from Wikidata...")
    qid_to_label = create_qid_to_label_mapping(df)
    
    print("Tokenizing and creating IOB tags...")
    df['token_iob'] = df.apply(lambda row: tokenize_and_iob(row, qid_to_label), axis=1)
    
    print("Data preparation complete.")
    return df




In [7]:
train_file = r'E:\AISD\Term2\NLP\Project\NER_SemEval_2025\Data\references\train\fr\train.jsonl'

prepared_df = prepare_data(train_file)

# Display info and head of the processed DataFrame
print("\nDataFrame Info:")
prepared_df.info()
print("\nFirst 5 rows of prepared data:")
print(prepared_df.head())
print("\nExample of token_iob column:")
print(prepared_df['token_iob'].iloc[0])

# Save the prepared DataFrame as a CSV file in the data folder
output_path = r'E:\AISD\Term2\NLP\Project\NER_SemEval_2025\entity_aware_mt\data\prepared_data.csv'
prepared_df.to_csv(output_path, index=False)
print(f"\nPrepared data saved to {output_path}")

Loading data...
Fetching entity labels from Wikidata...
Could not fetch label for Q23: 'en'
Tokenizing and creating IOB tags...
Data preparation complete.

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5531 entries, 0 to 5530
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             5531 non-null   object
 1   source_locale  5531 non-null   object
 2   target_locale  5531 non-null   object
 3   source         5531 non-null   object
 4   target         5531 non-null   object
 5   entities       5531 non-null   object
 6   from           5531 non-null   object
 7   token_iob      5531 non-null   object
dtypes: object(8)
memory usage: 345.8+ KB

First 5 rows of prepared data:
         id source_locale target_locale  \
0  a9011ddf            en            fr   
1  982450cf            en            fr   
2  b218d184            en            fr   
3  f477742c            en            fr   
4  

## 3. Baseline Translation
Translate English sentences to French using a pre-trained MarianMT model.


In [14]:
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer
import os

def load_data(csv_path):
    """Loads the prepared CSV data."""
    return pd.read_csv(csv_path)

def translate_sentences(sentences, model_name="Helsinki-NLP/opus-mt-en-fr", batch_size=8):
    """
    Translates a list of English sentences to French using a pre-trained model.
    Args:
        sentences (list): List of English sentences.
        model_name (str): Hugging Face model name.
        batch_size (int): Number of sentences per batch.
    Returns:
        list: Translated French sentences.
    """
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    translations = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        translated = model.generate(**inputs)
        outputs = tokenizer.batch_decode(translated, skip_special_tokens=True)
        translations.extend(outputs)
    return translations

# Set the path to your prepared data CSV (update this as needed)
data_path = r"E:\AISD\Term2\NLP\Project\NER_SemEval_2025\entity_aware_mt\data\prepared_data.csv"
output_path = r"E:\AISD\Term2\NLP\Project\NER_SemEval_2025\entity_aware_mt\data\baseline_translations.csv"

# Load data
df = load_data(data_path)
print(f"Loaded {len(df)} rows from {data_path}")

# Translate source sentences
print("Translating source sentences using Helsinki-NLP/opus-mt-en-fr...")
df["mt_baseline"] = translate_sentences(df["source"].tolist())

# Save the results
df.to_csv(output_path, index=False)
print(f"Baseline translations saved to {output_path}")

Loaded 5531 rows from E:\AISD\Term2\NLP\Project\NER_SemEval_2025\entity_aware_mt\data\prepared_data.csv
Translating source sentences using Helsinki-NLP/opus-mt-en-fr...




Baseline translations saved to E:\AISD\Term2\NLP\Project\NER_SemEval_2025\entity_aware_mt\data\baseline_translations.csv


## 4. Entity-Aware Pipeline
Inject placeholders for entities, translate, and post-process.


In [15]:
import pandas as pd
import ast

def extract_entity_spans(token_iob):
    """Extracts spans for entities using IOB tags."""
    spans = []
    current = None
    for idx, (token, tag) in enumerate(token_iob):
        if tag == 'B-ENT':
            if current:
                spans.append(current)
            current = [idx, idx]
        elif tag == 'I-ENT' and current:
            current[1] = idx
        else:
            if current:
                spans.append(current)
                current = None
    if current:
        spans.append(current)
    return spans

def inject_placeholders(row):
    """Replaces entity spans with placeholders in the sentence."""
    token_iob = ast.literal_eval(row['token_iob'])
    tokens = [tok for tok, tag in token_iob]
    spans = extract_entity_spans(token_iob)
    entities = ast.literal_eval(row['entities'])
    placeholder_map = {}
    new_tokens = tokens[:]
    for idx, span in enumerate(spans):
        placeholder = f"@ENTITY{idx+1}@"
        # Replace entity tokens with placeholder
        start, end = span
        new_tokens[start:end+1] = [placeholder]
        placeholder_map[placeholder] = entities[idx] if idx < len(entities) else None
    row['placeholder_sentence'] = ' '.join(new_tokens)
    row['placeholder_map'] = placeholder_map
    return row

# Set the path to your prepared data CSV (update this as needed)
data_path = r"E:\AISD\Term2\NLP\Project\NER_SemEval_2025\entity_aware_mt\data\prepared_data.csv"
output_path = r"E:\AISD\Term2\NLP\Project\NER_SemEval_2025\entity_aware_mt\data\entity_placeholders.csv"

# Load data
df = pd.read_csv(data_path)
print(f"Loaded {len(df)} rows from {data_path}")

# Inject placeholders
print("Injecting placeholders for entities...")
df = df.apply(inject_placeholders, axis=1)

# Save the results
df.to_csv(output_path, index=False)
print(f"Placeholder-injected data saved to {output_path}")

Loaded 5531 rows from E:\AISD\Term2\NLP\Project\NER_SemEval_2025\entity_aware_mt\data\prepared_data.csv
Injecting placeholders for entities...
Placeholder-injected data saved to E:\AISD\Term2\NLP\Project\NER_SemEval_2025\entity_aware_mt\data\entity_placeholders.csv


In [16]:
# Translate placeholder-injected sentences
# df = pd.read_csv('entity_placeholders.csv')
# df['mt_placeholder'] = translate_sentences(df['placeholder_sentence'].tolist())
# df.to_csv('entity_placeholders_translated.csv', index=False)

import pandas as pd
from transformers import MarianMTModel, MarianTokenizer

def load_data(csv_path):
    """Loads the placeholder-injected CSV data."""
    return pd.read_csv(csv_path)

def translate_sentences(sentences, model_name="Helsinki-NLP/opus-mt-en-fr", batch_size=8):
    """
    Translates a list of English sentences to French using a pre-trained model.
    Args:
        sentences (list): List of English sentences.
        model_name (str): Hugging Face model name.
        batch_size (int): Number of sentences per batch.
    Returns:
        list: Translated French sentences.
    """
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    translations = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        translated = model.generate(**inputs)
        outputs = tokenizer.batch_decode(translated, skip_special_tokens=True)
        translations.extend(outputs)
    return translations

# Set the path to your placeholder-injected data CSV
data_path = r"E:\AISD\Term2\NLP\Project\NER_SemEval_2025\entity_aware_mt\data\entity_placeholders.csv"
output_path = r"E:\AISD\Term2\NLP\Project\NER_SemEval_2025\entity_aware_mt\data\entity_placeholders_translated.csv"

# Load data
df = load_data(data_path)
print(f"Loaded {len(df)} rows from {data_path}")

# Translate placeholder-injected sentences
print("Translating placeholder-injected sentences...")
df["mt_placeholder"] = translate_sentences(df["placeholder_sentence"].tolist())

# Save the results
df.to_csv(output_path, index=False)
print(f"Entity-aware translations saved to {output_path}")


Loaded 5531 rows from E:\AISD\Term2\NLP\Project\NER_SemEval_2025\entity_aware_mt\data\entity_placeholders.csv
Translating placeholder-injected sentences...




Entity-aware translations saved to E:\AISD\Term2\NLP\Project\NER_SemEval_2025\entity_aware_mt\data\entity_placeholders_translated.csv


In [17]:
import pandas as pd
import ast
import requests

def get_label_from_wikidata(qid, lang='fr'):
    """Fetches the label for a given Wikidata QID in the specified language."""
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
    try:
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()
        return data['entities'][qid]['labels'][lang]['value']
    except Exception as e:
        print(f"Could not fetch label for {qid}: {e}")
        return None

def replace_placeholders(row):
    sentence = row['mt_placeholder']
    placeholder_map = ast.literal_eval(row['placeholder_map']) if isinstance(row['placeholder_map'], str) else row['placeholder_map']
    for placeholder, qid in placeholder_map.items():
        if qid:
            fr_label = get_label_from_wikidata(qid, lang='fr')
            if not fr_label:
                fr_label = qid  # fallback to QID if label not found
            sentence = sentence.replace(placeholder, fr_label)
    row['mt_entity_aware'] = sentence
    return row

# Set the path to your translated placeholder-injected CSV
data_path = r"E:\AISD\Term2\NLP\Project\NER_SemEval_2025\entity_aware_mt\data\entity_placeholders_translated.csv"
output_path = r"E:\AISD\Term2\NLP\Project\NER_SemEval_2025\entity_aware_mt\data\entity_aware_translations.csv"

# Load data
df = pd.read_csv(data_path)
print(f"Loaded {len(df)} rows from {data_path}")

# Replace placeholders with French entity labels
print("Replacing placeholders with French entity labels...")
df = df.apply(replace_placeholders, axis=1)

# Save the final entity-aware translations
df.to_csv(output_path, index=False)
print(f"Entity-aware translations saved to {output_path}")

Loaded 5531 rows from E:\AISD\Term2\NLP\Project\NER_SemEval_2025\entity_aware_mt\data\entity_placeholders_translated.csv
Replacing placeholders with French entity labels...
Could not fetch label for Q23: 'fr'
Entity-aware translations saved to E:\AISD\Term2\NLP\Project\NER_SemEval_2025\entity_aware_mt\data\entity_aware_translations.csv


## 5. Fine-tuning MarianMT (Optional)
Fine-tune the translation model on placeholder-injected data.


In [18]:
import os
import pandas as pd
from datasets import Dataset
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import torch

def load_data(csv_path):
    df = pd.read_csv(csv_path)
    # Use placeholder-injected English as source, and placeholder-injected French as target
    # For training, you need to have both. If you don't have placeholder-injected French, use the original French with placeholders inserted at the same positions as in English.
    # Here, we assume you have a column 'placeholder_sentence' (English) and 'target_placeholder' (French)
    # If not, you may need to generate 'target_placeholder' first.
    if 'target_placeholder' not in df.columns:
        # Fallback: use 'target' (reference French) for now
        df['target_placeholder'] = df['target']
    return df[['placeholder_sentence', 'target_placeholder']]

def preprocess_function(examples, tokenizer, max_length=128):
    model_inputs = tokenizer(examples['placeholder_sentence'], max_length=max_length, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['target_placeholder'], max_length=max_length, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Set your paths and parameters
model_name = "Helsinki-NLP/opus-mt-en-fr"
data_path = r"E:\AISD\Term2\NLP\Project\NER_SemEval_2025\entity_aware_mt\data\entity_placeholders.csv"
output_dir = r"E:\AISD\Term2\NLP\Project\NER_SemEval_2025\entity_aware_mt\finetuned_placeholder_mt"
batch_size = 8
num_train_epochs = 3
max_length = 128

# Load data
df = load_data(data_path)
dataset = Dataset.from_pandas(df)
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Preprocess
tokenized_dataset = dataset.map(lambda x: preprocess_function(x, tokenizer, max_length), batched=True)

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=500,
    save_total_limit=2,
    eval_strategy="no",
    logging_steps=100,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
    report_to=["none"],
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

# Save model and tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Fine-tuned model saved to {output_dir}")




Map:   0%|          | 0/5531 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Step,Training Loss
100,0.754
200,0.1748
300,0.1609
400,0.1586
500,0.1455
600,0.1507
700,0.14
800,0.1071
900,0.1069
1000,0.1005




Fine-tuned model saved to E:\AISD\Term2\NLP\Project\NER_SemEval_2025\entity_aware_mt\finetuned_placeholder_mt


## 6. Predict with Fine-tuned Model (Optional)


In [20]:
import os
import json
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer
from tqdm import tqdm
import torch

def load_jsonl(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f]

def save_jsonl(data, file_path):
    with open(file_path, 'w', encoding='utf-8') as f:
        for item in data:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')

def translate_sentences(sentences, model, tokenizer, batch_size=8):
    translations = []
    for i in tqdm(range(0, len(sentences), batch_size)):
        batch = sentences[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(model.device)
        translated = model.generate(**inputs)
        outputs = tokenizer.batch_decode(translated, skip_special_tokens=True)
        translations.extend(outputs)
    return translations

# Set paths
val_path = r"E:\AISD\Term2\NLP\Project\NER_SemEval_2025\Data\references\validation\fr_FR.jsonl"
model_dir = r"E:\AISD\Term2\NLP\Project\NER_SemEval_2025\entity_aware_mt\finetuned_placeholder_mt"
output_dir = r"E:\AISD\Term2\NLP\Project\NER_SemEval_2025\Data\predictions\finetuned_placeholder_mt"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "fr_FR.jsonl")

# Load validation data
val_data = load_jsonl(val_path)
# Determine which key holds the source text
if 'placeholder_sentence' in val_data[0]:
    src_texts = [ex['placeholder_sentence'] for ex in val_data]
elif 'text' in val_data[0]:
    src_texts = [ex['text'] for ex in val_data]
elif 'source' in val_data[0]:
    src_texts = [ex['source'] for ex in val_data]
else:
    raise ValueError("Could not find source text key in validation data.")

# Load model and tokenizer
tokenizer = MarianTokenizer.from_pretrained(model_dir)
model = MarianMTModel.from_pretrained(model_dir)
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Translate
translations = translate_sentences(src_texts, model, tokenizer)

# Save predictions in same JSONL structure, add 'prediction' key
for ex, pred in zip(val_data, translations):
    ex['prediction'] = pred
save_jsonl(val_data, output_path)
print(f"Predictions saved to {output_path}")

100%|██████████| 91/91 [00:16<00:00,  5.48it/s]

Predictions saved to E:\AISD\Term2\NLP\Project\NER_SemEval_2025\Data\predictions\finetuned_placeholder_mt\fr_FR.jsonl





## 7. Evaluation
Evaluate translation quality using COMET.

For Evaluation, Check eval.ipynb file
