In [10]:
import pandas as pd
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from typing import Iterable, List
from transformers import AutoTokenizer
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
from torch import Tensor

In [11]:
df = pd.read_csv("data/english-newari.csv")

In [12]:
df.head()

Unnamed: 0,SN,en,new
0,,welcome to Ideax,Ideax ए लसकुस​
1,,this is a test,थो test ख​
2,,we are tongue techies,जिपि tongue techies ख​
3,,this is just a demo,थो demo जक ख​
4,,I am from Urlabari,जि उर्लाबारी च्वंम्ह


In [13]:
data = df[['en', 'new']]

In [14]:
data.head()

Unnamed: 0,en,new
0,welcome to Ideax,Ideax ए लसकुस​
1,this is a test,थो test ख​
2,we are tongue techies,जिपि tongue techies ख​
3,this is just a demo,थो demo जक ख​
4,I am from Urlabari,जि उर्लाबारी च्वंम्ह


In [15]:
data.astype(str)

Unnamed: 0,en,new
0,welcome to Ideax,Ideax ए लसकुस​
1,this is a test,थो test ख​
2,we are tongue techies,जिपि tongue techies ख​
3,this is just a demo,थो demo जक ख​
4,I am from Urlabari,जि उर्लाबारी च्वंम्ह
...,...,...
1006,lets go to see Indra Jatra,नु ईन्द्रजात्रा स्वो वोने ।
1007,Lakhe,लशिँ – पुलुकिशि
1008,You are like a lakhe,छ लाखे थें चोँ ।
1009,Is your beloved well?,छिमी यज्जु म्ह फु ला ?


In [16]:
import re
import string

def preprocessing(df):
    def process_text(text):
        text = text.lower()
        text = text.translate(str.maketrans('', '', string.punctuation))
        text = text.strip()
        text = re.sub("\s+", " ", text)
        return text

    df["en"] = df["en"].apply(process_text)
    
    def clean_text(text):
        text = re.sub(r'[०-९]', '', text)
        text = re.sub(r'[()#/@;:<>‘+=।?!|,’‘’]', '', text)
        text = text.strip()
        return text

    df["new"] = df["new"].apply(clean_text)    
    return df
data = preprocessing(data)
data.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["en"] = df["en"].apply(process_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["new"] = df["new"].apply(clean_text)


Unnamed: 0,en,new
0,welcome to ideax,Ideax ए लसकुस​
1,this is a test,थो test ख​
2,we are tongue techies,जिपि tongue techies ख​
3,this is just a demo,थो demo जक ख​
4,i am from urlabari,जि उर्लाबारी च्वंम्ह
5,i am from damak,जि दमक च्वंम्ह
6,i am from birtamod,जि बिर्तामोड च्वंम्ह
7,my name is nishant,जिगू नां निशान्त खः
8,my name is drishya,जिगू नां दृश्य खः
9,hello my name is sushan,ज्वजलपा जिगू नां सुशन खः


In [17]:
tokenizer_nepali = AutoTokenizer.from_pretrained('sakonii/deberta-base-nepali')

def newari_tokenizer(sentence):
    tokens = tokenizer_nepali.tokenize(sentence)
    return tokens

SRC_LANGUAGE = 'en'
TGT_LANGUAGE = 'new'

token_transform = {}
vocab_transform = {}

token_transform[SRC_LANGUAGE] = get_tokenizer('basic_english')
token_transform[TGT_LANGUAGE] = get_tokenizer(newari_tokenizer)

def yield_tokens(data_iter: Iterable, language: str) -> List[str]:    
    for index,data_sample in data_iter:
        yield token_transform[language](data_sample[language])

UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    train_iter = df.iterrows()
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)

for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    vocab_transform[ln].set_default_index(UNK_IDX)

source_vocab = vocab_transform[SRC_LANGUAGE]
target_vocab = vocab_transform[TGT_LANGUAGE]
torch.save(source_vocab, 'vocabs/english_vocab.pth')
torch.save(target_vocab, 'vocabs/newari_vocab.pth')

In [19]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

In [20]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [21]:
torch.manual_seed(0)

SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

