---
# main.py 

In [1]:
#!/usr/bin/env python3
import os
import re
import unicodedata
import sentencepiece as spm

In [2]:
def normalize_unicode(text):
    """
    Normalize text using NFC Unicode normalization.
    """
    return unicodedata.normalize('NFC', text)

def clean_text(text, lang='en'):
    """
    Clean and normalize text:
      - Strips extra spaces.
      - Applies Unicode normalization.
      - Removes extra punctuation characters (customize as needed).
      - Converts English text to lowercase.
    """
    text = text.strip()
    text = normalize_unicode(text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    # Remove some unwanted punctuation (customize if needed)
    text = re.sub(r'[“”‘’]', '', text)
    if lang == 'en':
        text = text.lower()
    return text

def add_language_tag(text, lang_tag):
    """
    Prepend a language tag to a sentence.
    """
    return f"{lang_tag} {text}"

def preprocess_file(input_file, output_file, lang_tag, lang='en'):
    """
    Reads an input file line by line, cleans each line,
    prepends a language tag, and writes the result to output_file.
    """
    lines_cleaned = []
    with open(input_file, 'r', encoding='utf-8') as f_in:
        lines = f_in.readlines()
        for line in lines:
            cleaned = clean_text(line, lang=lang)
            tagged = add_language_tag(cleaned, lang_tag)
            lines_cleaned.append(tagged)
    
    with open(output_file, 'w', encoding='utf-8') as f_out:
        for line in lines_cleaned:
            f_out.write(line + "\n")
    
    print(f"Preprocessed {input_file} -> {output_file}")
    return output_file

def combine_files(file_list, combined_file):
    """
    Combine multiple text files into one.
    """
    with open(combined_file, 'w', encoding='utf-8') as f_out:
        for file in file_list:
            with open(file, 'r', encoding='utf-8') as f_in:
                f_out.write(f_in.read())
    print(f"Combined files into {combined_file}")
    return combined_file

def train_sentencepiece_model(input_file, model_prefix, vocab_size=32000):
    """
    Train a SentencePiece model on the provided input file.
    This generates two files: {model_prefix}.model and {model_prefix}.vocab.
    """
    spm.SentencePieceTrainer.train(
        f"--input={input_file} --model_prefix={model_prefix} --vocab_size={vocab_size} "
        f"--model_type=bpe --character_coverage=1.0"
    )
    print(f"Trained SentencePiece model: {model_prefix}.model and {model_prefix}.vocab")

def tokenize_file(input_file, output_file, model_file):
    """
    Tokenize an input file using the trained SentencePiece model,
    and write the tokenized sentences to the output file.
    """
    sp = spm.SentencePieceProcessor(model_file=model_file)
    with open(input_file, 'r', encoding='utf-8') as f_in, \
         open(output_file, 'w', encoding='utf-8') as f_out:
        for line in f_in:
            pieces = sp.encode_as_pieces(line.strip())
            f_out.write(" ".join(pieces) + "\n")
    print(f"Tokenized {input_file} -> {output_file}")

def main():
    # Define file names for your development data
    eng_dev = 'eng.dev'
    ben_dev = 'ben.dev'
    
    # Output file names for the cleaned data
    eng_clean = 'eng_clean.txt'
    ben_clean = 'ben_clean.txt'
    
    # Combine cleaned file for SentencePiece training
    combined_file = 'combined.txt'
    
    # SentencePiece model settings
    model_prefix = 'spm_model'
    model_file = f"{model_prefix}.model"
    
    # Output file names for tokenized data
    eng_tokenized = 'eng_tokenized.txt'
    ben_tokenized = 'ben_tokenized.txt'
    
    # 1. Preprocess each file (clean text and add language tags)
    preprocess_file(eng_dev, eng_clean, '<en>', lang='en')
    preprocess_file(ben_dev, ben_clean, '<bn>', lang='bn')
    
    # 2. Combine the cleaned files into one for training the tokenizer
    combine_files([eng_clean, ben_clean], combined_file)
    
    # 3. Train the SentencePiece model on the combined data
    train_sentencepiece_model(combined_file, model_prefix, vocab_size=32000)
    
    # 4. Tokenize the cleaned files using the trained SentencePiece model
    tokenize_file(eng_clean, eng_tokenized, model_file)
    tokenize_file(ben_clean, ben_tokenized, model_file)
    
    print("Preprocessing and tokenization complete.")

# if __name__ == '__main__':
#     main()

In [3]:
main()

Preprocessed eng.dev -> eng_clean.txt
Preprocessed ben.dev -> ben_clean.txt
Combined files into combined.txt
Trained SentencePiece model: spm_model.model and spm_model.vocab
Tokenized eng_clean.txt -> eng_tokenized.txt
Tokenized ben_clean.txt -> ben_tokenized.txt
Preprocessing and tokenization complete.


---

# lexicon_generator.py

- It reads parallel sentences from the two files.
- It tokenizes the sentences by splitting on whitespace (you may later improve tokenization using a more sophisticated tokenizer).
- It counts how often each English word appears and how often each English word co-occurs with each Bengali word in the same sentence pair.
- For each English word that appears frequently enough (above a threshold), it selects the Bengali word with the highest conditional probability (i.e. highest relative co-occurrence frequency) as its candidate translation.
- Finally, it writes the resulting lexicon (as English–Bengali pairs) to an output file.

In [21]:
#!/usr/bin/env python3
import collections
import math

def tokenize(text):
    """
    Simple whitespace tokenization.
    You can later replace this with a more sophisticated tokenizer if needed.
    """
    return text.strip().split()

def build_cooccurrence(eng_file, ben_file):
    """
    Reads parallel sentences from eng_file and ben_file,
    and builds frequency counts and co-occurrence counts.
    Returns:
      eng_counts: Counter for English word frequencies.
      cooc_counts: Dictionary mapping an English word to a Counter of Bengali words.
    """
    eng_counts = collections.Counter()
    cooc_counts = {}  # eng_word -> Counter({ben_word: count})
    
    with open(eng_file, 'r', encoding='utf-8') as ef, open(ben_file, 'r', encoding='utf-8') as bf:
        eng_lines = ef.readlines()
        ben_lines = bf.readlines()
    
    if len(eng_lines) != len(ben_lines):
        raise ValueError("The number of lines in the English and Bengali files must be equal.")
    
    for eng_line, ben_line in zip(eng_lines, ben_lines):
        eng_tokens = tokenize(eng_line)
        ben_tokens = tokenize(ben_line)
        
        # Update counts for each English token in this sentence
        for e in eng_tokens:
            eng_counts[e] += 1
            if e not in cooc_counts:
                cooc_counts[e] = collections.Counter()
            # Count all Bengali tokens as co-occurring with this English token
            for b in ben_tokens:
                cooc_counts[e][b] += 1
                
    return eng_counts, cooc_counts

def choose_translation(eng_counts, cooc_counts, min_count=5):
    """
    For each English word with frequency >= min_count, choose the Bengali word
    that maximizes the conditional probability P(b|e) = count(e,b)/count(e).
    Returns a dictionary mapping English word to candidate Bengali translation.
    """
    lexicon = {}
    for e, freq in eng_counts.items():
        if freq < min_count:
            continue  # skip rare words
        best_b = None
        best_prob = 0.0
        for b, cooc in cooc_counts[e].items():
            prob = cooc / freq  # conditional probability P(b|e)
            if prob > best_prob:
                best_prob = prob
                best_b = b
        if best_b is not None:
            lexicon[e] = best_b
    return lexicon

def save_lexicon(lexicon, output_file):
    """
    Writes the lexicon to output_file, one pair per line, tab-separated.
    """
    with open(output_file, 'w', encoding='utf-8') as f:
        for e, b in lexicon.items():
            f.write(f"{e}\t{b}\n")
    print(f"Lexicon saved to {output_file}")

def main():
    eng_file = "eng.dev"
    ben_file = "ben.dev"
    output_file = "auto_lexicon.txt"
    min_count = 5  # You can adjust this threshold based on your data
    
    print("Building co-occurrence counts from FLORES 101 data...")
    eng_counts, cooc_counts = build_cooccurrence(eng_file, ben_file)
    
    print("Choosing candidate translations based on conditional probabilities...")
    lexicon = choose_translation(eng_counts, cooc_counts, min_count=min_count)
    
    print(f"Generated lexicon with {len(lexicon)} entries.")
    save_lexicon(lexicon, output_file)

if __name__ == '__main__':
    main()

Building co-occurrence counts from FLORES 101 data...
Choosing candidate translations based on conditional probabilities...
Generated lexicon with 520 entries.
Lexicon saved to auto_lexicon.txt


The generated lexicon sucks, we would need to go manual, or api method.

---
# align_embeddings.py
Alignment

In [24]:
#!/usr/bin/env python3
import fasttext
import numpy as np
import matplotlib.pyplot as plt
import os
from numpy.linalg import norm, svd

def load_dictionary(dict_path):
    """
    Load a bilingual dictionary from a file.
    Each line should contain an English word and its Bengali translation separated by a tab.
    Multi-word translations are preserved.
    Returns a list of (english, bengali) pairs.
    """
    dictionary = []
    with open(dict_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t', 1)
            if len(parts) != 2:
                continue  # Skip malformed lines
            en_word = parts[0].strip()
            bn_word = parts[1].strip()
            dictionary.append((en_word, bn_word))
    return dictionary

def get_embedding_matrix(model, words):
    """
    For a list of words, return a matrix (numpy array) where each row is the embedding of that word.
    """
    embeddings = []
    for word in words:
        vector = model.get_word_vector(word)
        embeddings.append(vector)
    return np.array(embeddings)

def align_embeddings(src_model, tgt_model, dictionary):
    """
    Align embeddings from the source (English) to target (Bengali) space using Procrustes analysis.
    Returns the transformation matrix and SVD components for visualization.
    """
    src_words = [pair[0] for pair in dictionary]
    tgt_words = [pair[1] for pair in dictionary]
    
    X = get_embedding_matrix(src_model, src_words)  # English embeddings
    Y = get_embedding_matrix(tgt_model, tgt_words)  # Bengali embeddings
    
    M = np.dot(Y.T, X)  # Cross-covariance matrix
    U, S, Vt = svd(M)
    W = np.dot(U, Vt)
    return W, U, S, Vt

def cosine_similarity(a, b):
    return np.dot(a, b) / (norm(a) * norm(b) + 1e-8)

def visualize_alignment(src_model, tgt_model, dictionary, W, output_dir="visualizations"):
    """
    Visualize the alignment by plotting a histogram of cosine similarities between 
    transformed English embeddings and Bengali embeddings.
    """
    os.makedirs(output_dir, exist_ok=True)
    
    src_words = [pair[0] for pair in dictionary]
    tgt_words = [pair[1] for pair in dictionary]
    X = get_embedding_matrix(src_model, src_words)
    Y = get_embedding_matrix(tgt_model, tgt_words)
    
    X_aligned = np.dot(X, W.T)
    similarities = [cosine_similarity(X_aligned[i], Y[i]) for i in range(len(dictionary))]
    
    plt.figure(figsize=(8, 5))
    plt.hist(similarities, bins=30, color='skyblue', edgecolor='black')
    plt.xlabel("Cosine Similarity")
    plt.ylabel("Frequency")
    plt.title("Histogram of Cosine Similarities between Aligned English and Bengali Embeddings")
    hist_path = os.path.join(output_dir, "cosine_similarity_histogram.png")
    plt.savefig(hist_path)
    plt.close()
    print(f"Cosine similarity histogram saved to {hist_path}")

def visualize_singular_values(S, output_dir="visualizations"):
    """
    Plot the singular values from the SVD of the cross-covariance matrix.
    """
    os.makedirs(output_dir, exist_ok=True)
    plt.figure(figsize=(8, 5))
    plt.plot(S, marker='o', linestyle='-', color='orange')
    plt.xlabel("Index")
    plt.ylabel("Singular Value")
    plt.title("Singular Values from SVD of Cross-Covariance Matrix")
    s_path = os.path.join(output_dir, "singular_values.png")
    plt.savefig(s_path)
    plt.close()
    print(f"Singular values plot saved to {s_path}")

def load_model_safe(model_path):
    """
    Attempt to load a fastText model. If a MemoryError occurs, print a message and exit.
    """
    try:
        model = fasttext.load_model(model_path)
        return model
    except MemoryError:
        print(f"MemoryError: Unable to load {model_path}.")
        print("Consider using a smaller fastText model or running on a machine with more RAM.")
        exit(1)

def align_main():
    # Set paths for the fastText models (ensure these files exist in your working directory)
    src_model_path = 'cc.en.300.bin'
    tgt_model_path = 'cc.bn.300.bin'
    
    print("Loading fastText models...")
    src_model = load_model_safe(src_model_path)
    tgt_model = load_model_safe(tgt_model_path)
    
    # Path to your bilingual dictionary file (tab-separated)
    dict_path = 'sutra_generated_ben_eng.txt'
    dictionary = load_dictionary(dict_path)
    
    if not dictionary:
        print("Error: The bilingual dictionary is empty or not properly formatted.")
        return
    
    print(f"Loaded dictionary with {len(dictionary)} word pairs.")
    
    print("Computing alignment matrix...")
    W, U, S, Vt = align_embeddings(src_model, tgt_model, dictionary)
    
    np.save("alignment_matrix_en_to_bn.npy", W)
    print("Alignment matrix saved to alignment_matrix_en_to_bn.npy")
    
    visualize_alignment(src_model, tgt_model, dictionary, W)
    visualize_singular_values(S)

In [None]:
main()

Loading fastText model from cc.en.300.bin ...
Model loaded successfully!
Model saved to english_fasttext_model.bin
cc.bn.300.bin.gz already exists, skipping download.
cc.bn.300.bin already exists, skipping extraction.
Loading fastText model from cc.bn.300.bin ...
Model loaded successfully!
Model saved to bengali_fasttext_model.bin


In [25]:
if __name__ == '__main__':
    align_main()

Loading fastText models...
Loaded dictionary with 520 word pairs.
Computing alignment matrix...
Alignment matrix saved to alignment_matrix_en_to_bn.npy
Cosine similarity histogram saved to visualizations\cosine_similarity_histogram.png
Singular values plot saved to visualizations\singular_values.png


In [8]:
import os
src_model_path = os.path.abspath('cc.en.300.bin')
tgt_model_path = os.path.abspath('cc.bn.300.bin')
print("English model path:", src_model_path)
print("Bengali model path:", tgt_model_path)


English model path: c:\Users\TAMANG\Documents\GitHub\research-cross-lingual-translation-english-bengali\cc.en.300.bin
Bengali model path: c:\Users\TAMANG\Documents\GitHub\research-cross-lingual-translation-english-bengali\cc.bn.300.bin


In [7]:
#!/usr/bin/env python3
import os
import urllib.request
import gzip
import shutil
import fasttext

def download_and_extract(url, output_path):
    """
    Download a .gz file from a URL and extract it to the specified output_path.
    If the .gz or the extracted file already exists, skip that step.
    """
    gz_path = output_path + ".gz"
    
    # Download the gz file if it doesn't exist
    if not os.path.exists(gz_path):
        print(f"Downloading {url} to {gz_path} ...")
        urllib.request.urlretrieve(url, gz_path)
    else:
        print(f"{gz_path} already exists, skipping download.")
    
    # Extract the file if the output_path doesn't exist
    if not os.path.exists(output_path):
        print(f"Extracting {gz_path} to {output_path} ...")
        with gzip.open(gz_path, 'rb') as f_in, open(output_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    else:
        print(f"{output_path} already exists, skipping extraction.")

def save_fasttext_model(model_path, new_model_path):
    """
    Load a fastText model from model_path and save it to new_model_path.
    """
    print(f"Loading fastText model from {model_path} ...")
    model = fasttext.load_model(model_path)
    print("Model loaded successfully!")
    
    # Save the model to a new file
    model.save_model(new_model_path)
    print(f"Model saved to {new_model_path}")

def main():
    # # URL for the English fastText model (300-dimensional)
    # en_url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz"
    en_model_path = "cc.en.300.bin"  # The file after extraction
    new_en_model_path = "english_fasttext_model.bin"  # Your desired saved file name

    # # Download and extract the English model if needed
    # download_and_extract(en_url, en_model_path)
    
    # # Load and save the English model
    save_fasttext_model(en_model_path, new_en_model_path)
    
    # If you want to do the same for Bengali, uncomment and adjust the following:
    bn_url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bn.300.bin.gz"
    bn_model_path = "cc.bn.300.bin"
    new_bn_model_path = "bengali_fasttext_model.bin"
    
    download_and_extract(bn_url, bn_model_path)
    save_fasttext_model(bn_model_path, new_bn_model_path)

if __name__ == '__main__':
    main()


Loading fastText model from cc.en.300.bin ...
Model loaded successfully!
Model saved to english_fasttext_model.bin
cc.bn.300.bin.gz already exists, skipping download.
Extracting cc.bn.300.bin.gz to cc.bn.300.bin ...
Loading fastText model from cc.bn.300.bin ...
Model loaded successfully!
Model saved to bengali_fasttext_model.bin


---
# nmt_model.py
Model Definition

In [2]:
#!/usr/bin/env python3
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        # Create a matrix of shape (max_len, d_model)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # Compute the positional encodings once in log space
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        # Change shape to (max_len, 1, d_model) so it can broadcast correctly over the batch dimension.
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (seq_len, batch_size, d_model)
        Returns:
            x: Tensor with positional encoding added, shape (seq_len, batch_size, d_model)
        """
        x = x + self.pe[:x.size(0)]
        return x

class MultilingualTransformer(nn.Module):
    def __init__(self, vocab_size, d_model=512, nhead=8, num_encoder_layers=6, 
                 num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, max_seq_length=5000):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_seq_length)
        # Using PyTorch's Transformer module (it expects inputs in shape (seq_len, batch, d_model))
        self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout)
        self.fc_out = nn.Linear(d_model, vocab_size)
        
        # Parameter for language tag bias (can be used to steer attention toward target language tokens)
        self.lang_tag_bias = nn.Parameter(torch.zeros(d_model))

    def forward(self, src, tgt, src_mask=None, tgt_mask=None, src_key_padding_mask=None, tgt_key_padding_mask=None):
        # src and tgt: shape (seq_len, batch_size)
        src_emb = self.embedding(src) * math.sqrt(self.d_model)  # (src_len, batch, d_model)
        tgt_emb = self.embedding(tgt) * math.sqrt(self.d_model)  # (tgt_len, batch, d_model)
        
        src_emb = self.pos_encoder(src_emb)
        tgt_emb = self.pos_encoder(tgt_emb)
        
        # Inject language tag bias into the embeddings (a simple way to illustrate the idea)
        src_emb = src_emb + self.lang_tag_bias
        tgt_emb = tgt_emb + self.lang_tag_bias

        # Pass through the transformer
        output = self.transformer(src_emb, tgt_emb, src_mask=src_mask, tgt_mask=tgt_mask, 
                                  src_key_padding_mask=src_key_padding_mask, 
                                  tgt_key_padding_mask=tgt_key_padding_mask)
        output = self.fc_out(output)  # (tgt_len, batch, vocab_size)
        return output

# train.py
Training Script with Logging and Visualization + Aligning Embeddings

### What This Code Does
1. Model Initialization & Device Setup:
    - It creates the MultilingualTransformer model, moves it to the proper device, and logs the status.

2. Aligned Embeddings Integration:
    - After the model is instantiated, the code:
    - Loads the pretrained fastText model.
    - Uses the load_aligned_embeddings function to compute an embedding matrix for the SentencePiece vocabulary.
    - Copies these embeddings into the model’s embedding layer and (optionally) freezes them.

3. Training Setup:
    - The code then sets up the dataset, dataloader, loss function, optimizer, and training loop. It logs loss values and saves the training loss plot and model state.

4. Running the Script:
    - Run this updated train.py script. It will integrate the aligned embeddings before starting the training process.

In [47]:
#!/usr/bin/env python3
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
# from nmt_model import MultilingualTransformer  # Ensure nmt_model.py is in the same directory
import matplotlib.pyplot as plt
import logging
import sentencepiece as spm
import numpy as np
from fasttext import load_model  # for loading the fastText model
import os

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# --------------------- Aligned Embeddings Functions --------------------- #
def load_aligned_embeddings(sp_model_file, fasttext_model, alignment_matrix_path, vocab_size, d_model):
    """
    Map tokens from your SentencePiece model to fastText embeddings,
    apply the alignment transformation, and return a weight matrix of shape (vocab_size, d_model).
    """
    sp = spm.SentencePieceProcessor(model_file=sp_model_file)
    W = np.load(alignment_matrix_path)  # Expected shape: (d_model, d_model)
    embedding_matrix = np.zeros((vocab_size, d_model))
    for i in range(vocab_size):
        token = sp.id_to_piece(i)
        vector = fasttext_model.get_word_vector(token)  # Expected shape: (300,)
        aligned_vector = np.dot(W, vector)             # Resulting shape: (300,)
        embedding_matrix[i] = aligned_vector
    return torch.tensor(embedding_matrix, dtype=torch.float)

# --------------------- Dictionary Loader --------------------- #
def load_dictionary(dict_path):
    """
    Load a bilingual dictionary from a file.
    Each line should have an English word and its Bengali translation separated by a tab.
    Multi-word translations are preserved.
    Returns a list of (english, bengali) pairs.
    """
    dictionary = []
    with open(dict_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t', 1)
            if len(parts) != 2:
                continue
            en_word = parts[0].strip()
            bn_word = parts[1].strip()
            dictionary.append((en_word, bn_word))
    return dictionary

# --------------------- Dataset and Collation --------------------- #
class TokenizedDataset(Dataset):
    def __init__(self, src_file, tgt_file, sp_model):
        with open(src_file, 'r', encoding='utf-8') as f:
            self.src_lines = f.readlines()
        with open(tgt_file, 'r', encoding='utf-8') as f:
            self.tgt_lines = f.readlines()
        assert len(self.src_lines) == len(self.tgt_lines), "Source and target files must have the same number of lines."
        self.sp_model = sp_model
    
    def __len__(self):
        return len(self.src_lines)
    
    def __getitem__(self, idx):
        src_ids = self.sp_model.encode_as_ids(self.src_lines[idx].strip())
        tgt_ids = self.sp_model.encode_as_ids(self.tgt_lines[idx].strip())
        return torch.tensor(src_ids, dtype=torch.long), torch.tensor(tgt_ids, dtype=torch.long)

def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    max_src = max(len(seq) for seq in src_batch)
    max_tgt = max(len(seq) for seq in tgt_batch)
    padded_src = [torch.cat([seq, torch.zeros(max_src - len(seq), dtype=torch.long)]) for seq in src_batch]
    padded_tgt = [torch.cat([seq, torch.zeros(max_tgt - len(seq), dtype=torch.long)]) for seq in tgt_batch]
    padded_src = torch.stack(padded_src).transpose(0, 1)
    padded_tgt = torch.stack(padded_tgt).transpose(0, 1)
    return padded_src, padded_tgt

# --------------------- Inference Function --------------------- #
def greedy_decode(model, src, sp, max_len=50, start_token_id=1, end_token_id=0):
    """
    Greedy decoding for a single source sentence.
    Assumes src is a tensor of shape (seq_len, 1).
    Returns a list of token ids.
    """
    device = next(model.parameters()).device
    src = src.to(device)
    # Encode source sentence using the model's encoder
    memory = model.transformer.encoder(model.pos_encoder(model.embedding(src)))
    # Initialize ys with shape (1, batch)
    ys = torch.tensor([[start_token_id]], dtype=torch.long, device=device)
    for _ in range(max_len - 1):
        tgt_mask = model.transformer.generate_square_subsequent_mask(ys.size(0)).to(device)
        out = model.transformer.decoder(model.pos_encoder(model.embedding(ys)), memory, tgt_mask=tgt_mask)
        out = model.fc_out(out[-1, :])
        # next_word is computed with shape (batch,), then unsqueeze to (1, batch)
        next_word = torch.argmax(out, dim=-1).unsqueeze(0)
        if next_word.item() == end_token_id:
            break
        # Concatenate without unsqueezing further (so next_word remains shape (1, batch))
        ys = torch.cat([ys, next_word], dim=0)
    return ys.squeeze().tolist()

def save_sample_translations(model, sp, sample_src_file, output_file, num_samples=10):
    """
    Decode num_samples sentences from sample_src_file using greedy decoding
    and save the source and generated translations for qualitative analysis.
    """
    device = next(model.parameters()).device
    with open(sample_src_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    samples = lines[:num_samples]
    results = []
    for line in samples:
        src_ids = sp.encode_as_ids(line.strip())
        src_tensor = torch.tensor(src_ids, dtype=torch.long).unsqueeze(1)  # (seq_len, 1)
        translation_ids = greedy_decode(model, src_tensor, sp)
        translation = sp.decode_ids(translation_ids if isinstance(translation_ids, list) else [translation_ids])
        results.append((line.strip(), translation))
    
    with open(output_file, 'w', encoding='utf-8') as f:
        for src, trans in results:
            f.write("Source: " + src + "\n")
            f.write("Translation: " + trans + "\n\n")
    print(f"Sample translations saved to {output_file}")

# --------------------- Training Function --------------------- #
def train_model():
    # Hyperparameters
    vocab_size = 32000  # Must match your SentencePiece model
    d_model = 300       # Set to 300 to match fastText dimensions
    nhead = 6           # 300 is divisible by 6 (300/6 = 50)
    num_encoder_layers = 3
    num_decoder_layers = 3
    num_epochs = 10
    batch_size = 32
    learning_rate = 1e-4
    
    # Initialize model (ensure MultilingualTransformer is defined in nmt_model.py)
    model = MultilingualTransformer(vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    logging.info("Model initialized and moved to device: %s", device)
    
    # ===== Integrate Aligned Embeddings =====
    en_ft_model = load_model("english_fasttext_model.bin")
    aligned_embeds = load_aligned_embeddings("spm_model.model", en_ft_model, "alignment_matrix_en_to_bn.npy", vocab_size, d_model)
    model.embedding.weight.data.copy_(aligned_embeds)
    model.embedding.weight.requires_grad = False  # Freeze embeddings
    logging.info("Aligned embeddings integrated into the model.")
    # ===========================================
    
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    sp = spm.SentencePieceProcessor(model_file='spm_model.model')
    train_dataset = TokenizedDataset('eng_tokenized.txt', 'ben_tokenized.txt', sp)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    logging.info("Training data loaded. Total batches: %d", len(train_loader))
    
    training_losses = []
    
    for epoch in range(1, num_epochs + 1):
        model.train()
        epoch_loss = 0.0
        for batch_idx, (src, tgt) in enumerate(train_loader):
            src, tgt = src.to(device), tgt.to(device)
            tgt_input = tgt[:-1, :]   # Decoder input
            tgt_output = tgt[1:, :]    # Expected output
            
            optimizer.zero_grad()
            tgt_mask = model.transformer.generate_square_subsequent_mask(tgt_input.size(0)).to(device)
            output = model(src, tgt_input, tgt_mask=tgt_mask)
            output = output.reshape(-1, vocab_size)
            tgt_output = tgt_output.reshape(-1)
            
            loss = criterion(output, tgt_output)
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
            if batch_idx % 10 == 0:
                logging.info("Epoch %d, Batch %d, Loss: %.4f", epoch, batch_idx, loss.item())
        
        avg_loss = epoch_loss / len(train_loader)
        training_losses.append(avg_loss)
        logging.info("Epoch %d complete, Average Loss: %.4f", epoch, avg_loss)
    
    torch.save(model.state_dict(), "multilingual_nmt_model.pt")
    logging.info("Model saved to multilingual_nmt_model.pt")
    
    plt.figure(figsize=(8, 5))
    plt.plot(range(1, num_epochs + 1), training_losses, marker='o')
    plt.xlabel("Epoch")
    plt.ylabel("Average Loss")
    plt.title("Training Loss Over Epochs")
    plt.savefig("training_loss.png")
    plt.close()
    logging.info("Training loss plot saved to training_loss.png")
    
    # Save sample translations for qualitative evaluation
    save_sample_translations(model, sp, 'eng_tokenized.txt', "sample_translations.txt", num_samples=10)

In [48]:
if __name__ == '__main__':
    train_model()

2025-02-16 23:40:05,366 - INFO - Model initialized and moved to device: cpu
2025-02-16 23:40:31,490 - INFO - Aligned embeddings integrated into the model.
2025-02-16 23:40:31,602 - INFO - Training data loaded. Total batches: 32
2025-02-16 23:40:48,473 - INFO - Epoch 1, Batch 0, Loss: 10.4155
2025-02-16 23:41:06,517 - INFO - Epoch 1, Batch 10, Loss: 9.2117
2025-02-16 23:41:18,064 - INFO - Epoch 1, Batch 20, Loss: 8.7462
2025-02-16 23:41:29,027 - INFO - Epoch 1, Batch 30, Loss: 8.3767
2025-02-16 23:41:29,465 - INFO - Epoch 1 complete, Average Loss: 8.9698
2025-02-16 23:41:30,536 - INFO - Epoch 2, Batch 0, Loss: 8.0627
2025-02-16 23:41:40,316 - INFO - Epoch 2, Batch 10, Loss: 7.7759
2025-02-16 23:41:50,543 - INFO - Epoch 2, Batch 20, Loss: 7.4557
2025-02-16 23:42:00,870 - INFO - Epoch 2, Batch 30, Loss: 7.4359
2025-02-16 23:42:01,044 - INFO - Epoch 2 complete, Average Loss: 7.6949
2025-02-16 23:42:02,001 - INFO - Epoch 3, Batch 0, Loss: 7.3280
2025-02-16 23:42:11,351 - INFO - Epoch 3, Bat

Sample translations saved to sample_translations.txt


In [None]:
if __name__ == '__main__':
    align_main()

Loading fastText models...
Loaded dictionary with 520 word pairs.
Computing alignment matrix...
Alignment matrix saved to alignment_matrix_en_to_bn.npy
Cosine similarity histogram saved to visualizations\cosine_similarity_histogram.png
Singular values plot saved to visualizations\singular_values.png


# test_making.py

Creating a sample test file (in order to evaluate) from flores dataset for English Bengali.

In [9]:
#!/usr/bin/env python3
def create_test_set(src_file, tgt_file, test_src_file, test_tgt_file, num_lines=1000):
    """
    Extracts the first num_lines from the source and target files and saves them as test sets.
    
    Parameters:
      src_file (str): Path to the English FLORES file.
      tgt_file (str): Path to the Bengali FLORES file.
      test_src_file (str): Output file for the test English sentences.
      test_tgt_file (str): Output file for the test Bengali sentences.
      num_lines (int): Number of sentence pairs to extract.
    """
    with open(src_file, 'r', encoding='utf-8') as f_src, open(tgt_file, 'r', encoding='utf-8') as f_tgt:
        src_lines = f_src.readlines()
        tgt_lines = f_tgt.readlines()
    
    # Check that the files have the same number of lines.
    if len(src_lines) != len(tgt_lines):
        raise ValueError("Source and target files do not have the same number of lines.")
    
    # Select the first num_lines as test set.
    test_src = src_lines[:num_lines]
    test_tgt = tgt_lines[:num_lines]
    
    with open(test_src_file, 'w', encoding='utf-8') as f_test_src:
        f_test_src.writelines(test_src)
    with open(test_tgt_file, 'w', encoding='utf-8') as f_test_tgt:
        f_test_tgt.writelines(test_tgt)
    
    print(f"Created test set with {num_lines} lines in '{test_src_file}' and '{test_tgt_file}'.")

if __name__ == '__main__':
    # Update these filenames as needed based on your FLORES dataset files.
    create_test_set('eng.dev', 'ben.dev', 'eng_test.txt', 'ben_test.txt', num_lines=1000)

Created test set with 1000 lines in 'eng_test.txt' and 'ben_test.txt'.


# eval_nmt.py

In [None]:
#!/usr/bin/env python3
import torch
import sentencepiece as spm
import sacrebleu
import logging
from fasttext import load_model
# from nmt_model import MultilingualTransformer  # Ensure this file is in the same directory
import numpy as np

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Greedy decoding function (as defined earlier)
def greedy_decode(model, src, sp, max_len=50, start_token_id=1, end_token_id=0):
    """
    Greedy decoding for a single source sentence.
    Assumes src is a tensor of shape (seq_len, 1).
    Returns a list of token ids.
    """
    device = next(model.parameters()).device
    src = src.to(device)
    memory = model.transformer.encoder(model.pos_encoder(model.embedding(src)))
    ys = torch.tensor([[start_token_id]], dtype=torch.long, device=device)
    for _ in range(max_len - 1):
        tgt_mask = model.transformer.generate_square_subsequent_mask(ys.size(0)).to(device)
        out = model.transformer.decoder(model.pos_encoder(model.embedding(ys)), memory, tgt_mask=tgt_mask)
        out = model.fc_out(out[-1, :])
        next_word = torch.argmax(out, dim=-1).unsqueeze(0)
        if next_word.item() == end_token_id:
            break
        ys = torch.cat([ys, next_word], dim=0)
    return ys.squeeze().tolist()

def load_test_data(src_path, ref_path):
    """
    Load test data from the provided paths.
    Returns a list of source sentences and a list of reference translations.
    """
    with open(src_path, 'r', encoding='utf-8') as f:
        src_sentences = [line.strip() for line in f if line.strip()]
    with open(ref_path, 'r', encoding='utf-8') as f:
        # Assumes one reference per line; sacreBLEU accepts list of references (one or more)
        ref_sentences = [line.strip() for line in f if line.strip()]
    return src_sentences, ref_sentences

def generate_translations(model, sp, src_sentences):
    """
    Generate translations for a list of source sentences using greedy decoding.
    Returns a list of generated translations (decoded as strings).
    """
    model.eval()
    translations = []
    for sentence in src_sentences:
        src_ids = sp.encode_as_ids(sentence)
        src_tensor = torch.tensor(src_ids, dtype=torch.long).unsqueeze(1)  # (seq_len, 1)
        translation_ids = greedy_decode(model, src_tensor, sp)
        translation = sp.decode_ids(translation_ids if isinstance(translation_ids, list) else [translation_ids])
        translations.append(translation)
    return translations

def evaluate_model():
    # Paths to files and models
    test_src_path = "eng_test.txt"  # Test set English sentences
    test_ref_path = "ben_test.txt"  # Corresponding reference Bengali translations
    sp_model_path = "spm_model.model"
    model_path = "multilingual_nmt_model.pt"
    
    # Hyperparameters should match those used during training
    vocab_size = 32000
    d_model = 300
    nhead = 6
    num_encoder_layers = 3
    num_decoder_layers = 3
    
    # Load the SentencePiece model
    sp = spm.SentencePieceProcessor(model_file=sp_model_path)
    
    # Initialize and load the trained model
    model = MultilingualTransformer(vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    logging.info("Trained model loaded from %s", model_path)
    
    # Load test data
    src_sentences, ref_sentences = load_test_data(test_src_path, test_ref_path)
    logging.info("Loaded %d test sentences.", len(src_sentences))
    
    # Generate translations
    generated_translations = generate_translations(model, sp, src_sentences)
    
    # Save generated translations for qualitative analysis
    with open("generated_translations.txt", "w", encoding="utf-8") as f:
        for src, trans in zip(src_sentences, generated_translations):
            f.write("Source: " + src + "\n")
            f.write("Translation: " + trans + "\n\n")
    logging.info("Generated translations saved to generated_translations.txt")
    
    # Compute BLEU score using sacreBLEU
    # sacreBLEU expects references as a list of lists
    bleu = sacrebleu.corpus_bleu(generated_translations, [ref_sentences])
    print("BLEU score:", bleu.score)
    logging.info("BLEU score: %.2f", bleu.score)

In [11]:
if __name__ == '__main__':
    evaluate_model()

2025-02-17 11:03:22,118 - INFO - Trained model loaded from multilingual_nmt_model.pt
2025-02-17 11:03:22,134 - INFO - Loaded 997 test sentences.
2025-02-17 11:11:29,028 - INFO - Generated translations saved to generated_translations.txt
2025-02-17 11:11:29,149 - INFO - BLEU score: 0.00


BLEU score: 0.0


# eval_nmt_2.py

Evaluation attempt 2, & this time:
- Updated version of your inference code that uses beam search instead of greedy decoding.

In [None]:
#!/usr/bin/env python3
import torch
import sentencepiece as spm
import sacrebleu
import logging
from fasttext import load_model
# from nmt_model import MultilingualTransformer  # Ensure this file is in the same directory
import os

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def beam_search_decode(model, src, sp, beam_width=3, max_len=50, start_token_id=1, end_token_id=0):
    """
    Beam search decoding for a single source sentence.
    Assumes src is a tensor of shape (seq_len, 1).
    Returns the best candidate sequence as a list of token ids.
    """
    device = next(model.parameters()).device
    src = src.to(device)
    memory = model.transformer.encoder(model.pos_encoder(model.embedding(src)))
    # Initialize the beam with the start token. Ensure it is a 1D tensor.
    beam = [(torch.tensor([start_token_id], dtype=torch.long, device=device), 0.0)]
    
    for _ in range(max_len - 1):
        new_beam = []
        for seq, score in beam:
            seq = seq.view(-1)  # Ensure seq is 1D
            if seq[-1].item() == end_token_id:
                new_beam.append((seq, score))
                continue
            seq_input = seq.unsqueeze(1)  # shape: (current_seq_len, 1)
            tgt_mask = model.transformer.generate_square_subsequent_mask(seq_input.size(0)).to(device)
            out = model.transformer.decoder(model.pos_encoder(model.embedding(seq_input)), memory, tgt_mask=tgt_mask)
            logits = model.fc_out(out[-1, :])  # shape: (batch, vocab_size), batch==1 so shape is (1, vocab_size)
            log_probs = torch.log_softmax(logits, dim=-1)  # shape: (1, vocab_size)
            # Get top beam_width candidates. Now topk_* will have shape (1, beam_width)
            topk_log_probs, topk_indices = torch.topk(log_probs, beam_width, dim=-1)
            # Iterate over beam_width candidates in the first row.
            for j in range(beam_width):
                candidate = topk_indices[0, j].unsqueeze(0)  # shape: (1,)
                candidate_log_prob = topk_log_probs[0, j].item()
                new_seq = torch.cat([seq, candidate], dim=0)
                new_score = score + candidate_log_prob
                new_beam.append((new_seq, new_score))
        # Keep only the top beam_width candidates
        beam = sorted(new_beam, key=lambda x: x[1], reverse=True)[:beam_width]
        if all(seq[-1].item() == end_token_id for seq, _ in beam):
            break
    best_seq, best_score = beam[0]
    return best_seq.squeeze().tolist()

def generate_translations_beam(model, sp, src_sentences, beam_width=3):
    model.eval()
    translations = []
    for sentence in src_sentences:
        src_ids = sp.encode_as_ids(sentence.strip())
        src_tensor = torch.tensor(src_ids, dtype=torch.long).unsqueeze(1)
        translation_ids = beam_search_decode(model, src_tensor, sp, beam_width=beam_width)
        translation = sp.decode_ids(translation_ids if isinstance(translation_ids, list) else [translation_ids])
        translations.append(translation)
    return translations

def load_test_data(src_path, ref_path):
    with open(src_path, 'r', encoding='utf-8') as f:
        src_sentences = [line.strip() for line in f if line.strip()]
    with open(ref_path, 'r', encoding='utf-8') as f:
        ref_sentences = [line.strip() for line in f if line.strip()]
    return src_sentences, ref_sentences

def evaluate_model():
    test_src_path = "eng_test.txt"    # Test set English sentences
    test_ref_path = "ben_test.txt"     # Corresponding Bengali references
    sp_model_path = "spm_model.model"
    model_path = "multilingual_nmt_model.pt"
    
    vocab_size = 32000
    d_model = 300
    nhead = 6
    num_encoder_layers = 3
    num_decoder_layers = 3
    
    sp = spm.SentencePieceProcessor(model_file=sp_model_path)
    model = MultilingualTransformer(vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    logging.info("Trained model loaded from %s", model_path)
    
    src_sentences, ref_sentences = load_test_data("eng_test.txt", "ben_test.txt")
    logging.info("Loaded %d test sentences.", len(src_sentences))
    
    generated_translations = generate_translations_beam(model, sp, src_sentences, beam_width=3)
    
    with open("generated_translations_beam.txt", "w", encoding="utf-8") as f:
        for src, trans in zip(src_sentences, generated_translations):
            f.write("Source: " + src + "\n")
            f.write("Translation: " + trans + "\n\n")
    logging.info("Generated translations saved to generated_translations_beam.txt")
    
    bleu = sacrebleu.corpus_bleu(generated_translations, [ref_sentences])
    logging.info("BLEU score: %.2f", bleu.score)
    print("BLEU score:", bleu.score)

if __name__ == '__main__':
    evaluate_model()

2025-02-17 11:33:02,161 - INFO - Trained model loaded from multilingual_nmt_model.pt
2025-02-17 11:33:02,165 - INFO - Loaded 997 test sentences.
