In [1]:
import re
from collections import Counter

import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence


In [2]:
def clean_sentence(sent):
    sent = sent.lower().strip()
    sent = re.sub(r"[^a-zA-Zँ-९\s]", "", sent)  # keep Nepali Devanagari + Latin
    return sent

In [3]:
def load_dataset(path):
    eng_sentences = []
    nep_sentences = []

    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split('\t')
            if len(parts) >= 2:
                eng= clean_sentence(parts[0])
                nep = clean_sentence(parts[1])
                eng_sentences.append(eng)
                nep_sentences.append(nep)

    return eng_sentences, nep_sentences

In [5]:
eng_sentences, nep_sentences = load_dataset("data/npi.txt")

In [6]:
for i in range(5):
    print(f"Eng: {eng_sentences[i]}")
    print(f"Nep: {nep_sentences[i]}\n")

Eng: who
Nep: को

Eng: hide
Nep: लुकाउनुहोस्।

Eng: hide
Nep: लुक।

Eng: stay
Nep: बस्नुहोस्।

Eng: hello
Nep: नमस्ते



In [7]:
#Tokenize
def tokenize(sentence):
    return sentence.split()

eng_tokens = [tokenize(sent) for sent in eng_sentences]
nep_tokens = [tokenize(sent) for sent in nep_sentences]

In [8]:
#check the tokens
for i in range(5):
    print(f"Eng tokens: {eng_tokens[i]}")
    print(f"Nep tokens: {nep_tokens[i]}\n")

Eng tokens: ['who']
Nep tokens: ['को']

Eng tokens: ['hide']
Nep tokens: ['लुकाउनुहोस्।']

Eng tokens: ['hide']
Nep tokens: ['लुक।']

Eng tokens: ['stay']
Nep tokens: ['बस्नुहोस्।']

Eng tokens: ['hello']
Nep tokens: ['नमस्ते']



In [9]:
#Build the vocabulary

PAD_TOKEN = "<PAD>"
SOS_TOKEN = "<SOS>"
EOS_TOKEN = "<EOS>"
UNK_TOKEN = "<UNK>"

def build_vocab(token_lists, min_freq=1):
    counter = Counter(token for tokens in token_lists for token in tokens)
    vocab = {PAD_TOKEN: 0, SOS_TOKEN: 1, EOS_TOKEN: 2, UNK_TOKEN: 3}
    for token, freq in counter.items():
        if freq >= min_freq:
            vocab[token] = len(vocab)
    return vocab

eng_vocab = build_vocab(eng_tokens)
nep_vocab = build_vocab(nep_tokens)

# Reverse vocab for decoding later
inv_eng_vocab = {i: w for w, i in eng_vocab.items()}
inv_nep_vocab = {i: w for w, i in nep_vocab.items()}

In [10]:
print("Sample English vocab:", list(eng_vocab.items())[:10])
print("Sample Nepali vocab:", list(nep_vocab.items())[:10])

Sample English vocab: [('<PAD>', 0), ('<SOS>', 1), ('<EOS>', 2), ('<UNK>', 3), ('who', 4), ('hide', 5), ('stay', 6), ('hello', 7), ('smile', 8), ('attack', 9)]
Sample Nepali vocab: [('<PAD>', 0), ('<SOS>', 1), ('<EOS>', 2), ('<UNK>', 3), ('को', 4), ('लुकाउनुहोस्।', 5), ('लुक।', 6), ('बस्नुहोस्।', 7), ('नमस्ते', 8), ('मुस्कान।', 9)]


In [11]:
def sentence_to_ids(tokens, vocab):
    return [vocab.get(token, vocab[UNK_TOKEN]) for token in tokens]

def wrap_sos_eos(ids, vocab):
    return [vocab[SOS_TOKEN]] + ids + [vocab[EOS_TOKEN]]

In [12]:
sample_eng_ids = wrap_sos_eos(sentence_to_ids(eng_tokens[0], eng_vocab), eng_vocab)
sample_nep_ids = wrap_sos_eos(sentence_to_ids(nep_tokens[0], nep_vocab), nep_vocab)

print("Original English:", eng_tokens[0])
print("Token IDs:", sample_eng_ids)
print("Back to words:", [inv_eng_vocab[i] for i in sample_eng_ids])


Original English: ['who']
Token IDs: [1, 4, 2]
Back to words: ['<SOS>', 'who', '<EOS>']
