## Motivation for Attention

In [1]:
import numpy as np 
import pandas as pd 
import string
from string import digits 
import re 
import torch
from torch import nn 
import torch.nn.functional as nnfunc 
import math 
import contractions 
import gc

In [2]:
# What version of Python do you have?
import sys 
import platform
import sklearn as sk 
from torch.utils.data import Dataset, DataLoader
print(f"python Platform: {platform.platform()}")
print(f"PyTorch Version: {torch.__version__}")
print(f"Python: {sys.version}")
print(f"Pandas: {pd.__version__}")
print(f"Scikit-Learn: {sk.__version__}")

python Platform: macOS-15.4-arm64-arm-64bit
PyTorch Version: 2.6.0
Python: 3.11.7 (main, Dec 15 2023, 12:09:56) [Clang 14.0.6 ]
Pandas: 2.2.3
Scikit-Learn: 1.6.1


In [3]:
def get_device():
    has_gpu = torch.cuda.is_available()
    has_mps = torch.backends.mps.is_built()
    print ("NVIDIA/CUDA GPU is", "available" if has_gpu else "NOT AVAILABLE")
    print("MPS (Apple Metal) is", "AVAILABLE" if has_mps else "NOT AVAILABLE")
    return torch.device('mps') if has_mps else torch.device('cuda') if has_gpu else torch.device('cpu')
device = get_device()
print(f"Target device is {device}")

NVIDIA/CUDA GPU is NOT AVAILABLE
MPS (Apple Metal) is AVAILABLE
Target device is mps


In [4]:
def scaled_dot_product(q, k, v, mask=None):
    d_k = q.size()[-1]
    scaled = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k) 
    if mask is not None:
        scaled = scaled.permute(1, 0, 2, 3) + mask
        scaled = scaled.permute(1, 0, 2, 3)
    attention = nnfunc.softmax(scaled, dim=-1)
    values = torch.matmul(attention, v)
    return values, attention

In [5]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model
        
    def forward(self):
        even_i = torch.arange(0, self.d_model, 2).float()
        denominator = torch.pow(10000, even_i/self.d_model)
        position = (torch.arange(self.max_sequence_length).reshape(self.max_sequence_length, 1))
        even_PE = torch.sin(position / denominator)
        odd_PE = torch.cos(position / denominator)
        stacked = torch.stack([even_PE, odd_PE], dim=2)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)
        return PE

In [6]:
class SentenceEmbedding(nn.Module):
    "For a given sentence, create an embedding" 
    def __init__(self, max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN):
        super().__init__()
        self.vocab_size = len(language_to_index)
        self.max_sequence_length = max_sequence_length
        self.embedding = nn.Embedding (self.vocab_size, d_model)
        self.language_to_index = language_to_index
        self.position_encoder = PositionalEncoding(d_model, max_sequence_length)
        self.dropout = nn.Dropout(p=0.1)
        self.START_TOKEN = START_TOKEN
        self.END_TOKEN = END_TOKEN
        self.PADDING_TOKEN = PADDING_TOKEN
        print(self.embedding.weight.shape)

    def batch_tokenize(self, batch, start_token, end_token):
        def tokenize(sentence, start_token, end_token):
            sentence_word_indices = []
            for token in sentence.split():
                if token in self.language_to_index:
                    sentence_word_indices.append(self.language_to_index[token])
            if start_token:
                sentence_word_indices.insert(0, self.language_to_index[self.START_TOKEN])
            if end_token:
                sentence_word_indices.append(self.language_to_index[self.END_TOKEN]) 
            for _ in range(len(sentence_word_indices), self.max_sequence_length):
                sentence_word_indices.append(self.language_to_index[self.PADDING_TOKEN])
            return torch.tensor(sentence_word_indices)
            
        tokenized = []
        for sentence_num in range(len(batch)):
            tokenized.append(tokenize(batch[sentence_num], start_token, end_token))
        tokenized = torch.stack(tokenized)
        return tokenized.to(device)
            
    def forward(self, x, start_token, end_token): # sentence
        x = self.batch_tokenize(x, start_token, end_token)
        x = self.embedding(x)
        pos = self.position_encoder().to(device)
        x = self.dropout(x + pos) 
        return x

In [7]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.qkv_layer = nn.Linear(d_model, 3 * d_model)
        self.linear_layer = nn.Linear(d_model, d_model)
        
    def forward(self, x, mask):
        batch_size, sequence_length, d_model = x.size()
        qkv = self.qkv_layer(x)
        qkv = qkv.reshape(batch_size, sequence_length, self.num_heads, 3 * self.head_dim)
        qkv = qkv.permute(0, 2, 1, 3)
        q, k, v = qkv.chunk(3, dim=-1)
        values, attention = scaled_dot_product(q, k, v, mask)
        values = values.permute(0, 2, 1, 3).reshape(batch_size, sequence_length, self.num_heads * self.head_dim)
        out = self.linear_layer(values)
        return out

In [8]:
class LayerNormalization(nn.Module):
    def __init__(self, parameters_shape, eps=1e-5):
        super().__init__()
        self.parameters_shape=parameters_shape 
        self.eps=eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta = nn.Parameter(torch.zeros(parameters_shape))
    
    def forward(self, inputs):
        dims = [-(i + 1) for i in range(len(self.parameters_shape))]
        mean = inputs.mean(dim=dims, keepdim=True)
        var = ((inputs - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        y = (inputs - mean) / std
        out = self.gamma * y + self.beta
        return out

In [9]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, hidden, drop_prob=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear (hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)
    
    def forward(self, x) :
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x

In [10]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm1 = LayerNormalization(parameters_shape=[d_model])
        self.dropout1 = nn.Dropout(p=drop_prob)
        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm2 = LayerNormalization(parameters_shape= [d_model])
        self.dropout2 = nn.Dropout(p=drop_prob)
        
    def forward(self, x, self_attention_mask):
        residual_x = x.clone()
        x = self.attention(x, mask=self_attention_mask)
        x = self.dropout1(x)
        x = self.norm1(x + residual_x)
        residual_x = x.clone()
        x = self.ffn (x)
        x = self.dropout2(x)
        x = self.norm2(x + residual_x)
        return x

In [11]:
class SequentialEncoder(nn.Sequential):
    def forward(self, *inputs):
        x, self_attention_mask = inputs
        for module in self._modules.values():
            x = module(x, self_attention_mask)
        return x

In [12]:
class Encoder(nn.Module) :
    def __init__(self, 
                 d_model, 
                 ffn_hidden, 
                 num_heads, 
                 drop_prob, 
                 num_layers, 
                 max_sequence_length, 
                 language_to_index,
                 START_TOKEN, 
                 END_TOKEN,
                 PADDING_TOKEN) :
        super().__init__()
        self.sentence_embedding = SentenceEmbedding(max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
        self.layers = SequentialEncoder(*[EncoderLayer(d_model, ffn_hidden, num_heads, drop_prob)
                                          for _ in range (num_layers)])
    
    def forward(self, x, self_attention_mask, start_token, end_token):
        x = self.sentence_embedding(x, start_token, end_token)
        x = self.layers(x, self_attention_mask)
        return x

In [13]:
class MultiHeadCrossAttention(nn.Module):
    def __init__(self, d_model, num_heads) :
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.kv_layer = nn.Linear(d_model, 2 * d_model)
        self.q_layer = nn.Linear (d_model, d_model)
        self.linear_layer = nn.Linear(d_model, d_model)
    
    def forward(self, x, y, mask):
        batch_size, sequence_length, d_model = x.size() # in practice, this is the same for both languages...
        kv = self.kv_layer(x)
        q = self.q_layer(y)
        kv = kv.reshape(batch_size, sequence_length, self.num_heads, 2 * self.head_dim)
        q = q.reshape(batch_size, sequence_length, self.num_heads, self.head_dim)
        kv = kv.permute(0, 2, 1, 3)
        q = q.permute(0, 2, 1, 3)
        k, v = kv.chunk(2, dim=-1)
        values, attention = scaled_dot_product(q, k, v, mask) # We don't need the mask for cross attention, removing in outer function!
        values = values.permute(0, 2, 1, 3).reshape(batch_size, sequence_length, d_model)
        out = self.linear_layer(values)
        return out

In [14]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super(DecoderLayer, self).__init__()
        self.self_attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.layer_norm1 = LayerNormalization(parameters_shape=[d_model])
        self.dropout1 = nn.Dropout(p=drop_prob)
        
        self.encoder_decoder_attention = MultiHeadCrossAttention(d_model=d_model, num_heads=num_heads)
        self.layer_norm2 = LayerNormalization(parameters_shape=[d_model])
        self.dropout2 = nn.Dropout(p=drop_prob)
        
        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.layer_norm3 = LayerNormalization(parameters_shape=[d_model])
        self.dropout3 = nn.Dropout(p=drop_prob)
    
    def forward(self, x, y, self_attention_mask, cross_attention_mask):
        _y = y.clone()
        y = self.self_attention(y, mask=self_attention_mask)
        y = self.dropout1(y)
        y = self.layer_norm1(y + _y)
        
        _y = y.clone()
        y = self.encoder_decoder_attention(x, y, mask=cross_attention_mask)
        y = self.dropout2(y)
        y = self.layer_norm2(y + _y)
        
        _y = y.clone()
        y = self.ffn(y)
        y = self.dropout3(y)
        y = self.layer_norm3(y + _y)
        return y

In [15]:
class SequentialDecoder(nn.Sequential):
    def forward (self, *inputs):
        x, y, self_attention_mask, cross_attention_mask = inputs
        for module in self._modules.values():
            y = module(x, y, self_attention_mask, cross_attention_mask)
        return y

In [16]:
class Decoder(nn.Module): 
    def __init__(self, 
                 d_model, 
                 ffn_hidden,
                 num_heads,
                 drop_prob,
                 num_layers,
                 max_sequence_length, 
                 language_to_index,
                 START_TOKEN, 
                 END_TOKEN, 
                 PADDING_TOKEN):
        super().__init__()
        self.sentence_embedding = SentenceEmbedding(max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
        self.layers = SequentialDecoder(*[DecoderLayer(d_model, ffn_hidden, num_heads, drop_prob) 
                                          for _ in range(num_layers)])
    
    def forward(self, x, y, self_attention_mask, cross_attention_mask, start_token, end_token) :
        y = self.sentence_embedding (y, start_token, end_token)
        y = self.layers(x, y, self_attention_mask, cross_attention_mask)
        return y

In [17]:
class Transformer(nn.Module): 
    def __init__(self, 
                 d_model, 
                 ffn_hidden,
                 num_heads,
                 drop_prob,
                 num_layers,
                 max_sequence_length, 
                 hn_vocab_size,
                 english_to_index, 
                 hindi_to_index,
                 START_TOKEN, 
                 END_TOKEN, 
                 PADDING_TOKEN):

        super().__init__()
        self.encoder = Encoder(d_model, ffn_hidden, num_heads, drop_prob, num_layers, max_sequence_length, english_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
        self.decoder = Decoder(d_model, ffn_hidden, num_heads, drop_prob, num_layers, max_sequence_length, hindi_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
        self.linear = nn.Linear(d_model, hn_vocab_size)
        self.device = device
        
    def forward(self,
                x,
                y, 
                encoder_self_attention_mask=None, 
                decoder_self_attention_mask=None, 
                decoder_cross_attention_mask=None, 
                enc_start_token=False, 
                enc_end_token=False,
                dec_start_token=False, 
                dec_end_token=False):
    
        # Pass encoder outputs and decoder inputs to the decoder
        x = self.encoder(x, encoder_self_attention_mask, start_token=enc_start_token, end_token=enc_end_token)
        out = self.decoder(x, y, decoder_self_attention_mask, decoder_cross_attention_mask, start_token=dec_start_token, end_token=dec_end_token)
        out = self.linear(out) # Generate final predictions
        return out


## Defining Model

In [18]:
# Define parameters
d_model = 256
batch_size = 50
ffn_hidden = 2048
num_heads = 8
drop_prob = 0.1
num_layers = 5
max_sequence_length = 150 #I've checked it in exploratary analysis-> 90 is ideal

## Preparing Data

### Read hindi file clean a bit and write it down again.

In [19]:
base_path = '/Users/siddharthchaudhary/Documents/git_repos/AI-Models/Transformer_from_scratch/Dataset/'

In [20]:
hindi_english_data_path =  base_path + 'Dataset_English_Hindi.csv'
train_en_file = base_path + 'vocab_and_training_files/train_en.txt'
train_hn_file = base_path + 'vocab_and_training_files/train_hn.txt'
english_vocab_file = base_path + 'vocab_and_training_files/english_vocab.txt'
hindi_vocab_file = base_path + 'vocab_and_training_files/hindi_vocab.txt'

In [21]:
lines = (pd. read_csv(hindi_english_data_path).rename (columns = {'English': 'english', 'Hindi': 'hindi'}))
# lines = lines [:1000000]
lines = lines[:10000]
lines.head()

Unnamed: 0,english,hindi
0,Help!,बचाओ!
1,Jump.,उछलो.
2,Jump.,कूदो.
3,Jump.,छलांग.
4,Hello!,नमस्ते।


In [22]:
lines.tail()

Unnamed: 0,english,hindi
9995,That involves vast and revolutionary changes i...,इसके लिए हमारे राजनैतिक और सामाजिक ढांचे में व...
9996,When women is on top.,जब महिला उपर हो
9997,A cricket match can be won by a team by making...,क्रिकेट में खेल को ज्यादा रन बना कर भी जीता जा...
9998,Earth outer surface is divided into several ri...,पृथ्वी की बाहरी सतह (outer surface) कई कठोर खं...
9999,Gumbhan: Aukar Joshi's Indian language word-pr...,गमभन : ओंकार जोशी का भारतीय भाषा शब्द-संसाधक (...


In [23]:
missing_data = lines.isnull().sum().to_frame().rename(columns={0:"Total No. of Missing Values"})
missing_data["% of Missing Values"] = round((missing_data["Total No. of Missing Values"]/len(lines))*100,2)

In [24]:
round(lines.describe().T,2)

Unnamed: 0,count,unique,top,freq
english,10000,9702,(Laughter),39
hindi,9982,9626,(हँसी),15


In [25]:
lines.drop_duplicates(inplace = True)

In [26]:
lines = lines.dropna(subset=["hindi", "english"])
lines.head()

Unnamed: 0,english,hindi
0,Help!,बचाओ!
1,Jump.,उछलो.
2,Jump.,कूदो.
3,Jump.,छलांग.
4,Hello!,नमस्ते।


In [27]:
lines.shape

(9916, 2)

In [28]:
# Lowercase all characters
lines['english'] = lines['english'].str.lower()

In [29]:
# Remove HTML
def remove_html_tags(text):
    # Corrected pattern: Only allows alphanumeric characters and spaces
    pattern = r'[^a-zA-Z0-9\s]'
    text = re.sub(pattern, '', text)
    return text


lines['english'] = lines['english'].apply(remove_html_tags)

In [30]:
# remove URLS
def remove_url(text):
    # Correct regex to match URLs
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub('', text)
    
lines['english'] = lines['english'].apply(remove_url)
lines['hindi'] = lines['hindi'].apply(remove_url)
lines.head()

Unnamed: 0,english,hindi
0,help,बचाओ!
1,jump,उछलो.
2,jump,कूदो.
3,jump,छलांग.
4,hello,नमस्ते।


In [31]:
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "А3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAO": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you (also a chat program)",
    "ILU": "I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laugh My A.. Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "NRN": "No Reply Necessary",
    "OIC":"Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN" : "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WUF": "Where Are You From?",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "LOL": "Laughing out loud",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don't care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "IDC": "I don't care",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "LMAO": "Laughing my a** off",
    "BFF": "Best friends forever",
    "CSL": "Can't stop laughing"
}

In [32]:
def chat_conversion(text):
    new_text=[]
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

lines['english'] = lines['english'].apply(chat_conversion)

In [33]:
lines.head()

Unnamed: 0,english,hindi
0,help,बचाओ!
1,jump,उछलो.
2,jump,कूदो.
3,jump,छलांग.
4,hello,नमस्ते।


In [34]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F" 
                               u"\U0001F300-\U0001F5FF"
                               u"\U0001F680-\U0001F6FF" 
                               u"\U0001F1E0-\U0001F1FF"
                               u"\U00002702-\U000027B0" 
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    
    return emoji_pattern.sub(r'', text)

In [35]:
lines.head()

Unnamed: 0,english,hindi
0,help,बचाओ!
1,jump,उछलो.
2,jump,कूदो.
3,jump,छलांग.
4,hello,नमस्ते।


In [36]:
lines.shape

(9916, 2)

In [37]:
lines['english'] = lines['english'].apply(remove_emoji)
lines['hindi'] = lines['hindi'].apply(remove_emoji)

In [38]:
# Contraction:- y're → you are 
def expand_contractions (text):
    expanded_text = contractions.fix(text)
    return expanded_text
    
lines['english'] = lines['english'].apply(expand_contractions)
lines.head()

Unnamed: 0,english,hindi
0,help,बचाओ!
1,jump,उछलो.
2,jump,कूदो.
3,jump,छलांग.
4,hello,नमस्ते।


In [39]:
# Remove Non-Hindi and Alphanumeric characters
def preprocess_text(text, language='english'):
    if not isinstance(text, str):
        return text  
    if language == 'english':
        pattern = re. compile(r'[^a-zA-Z0-9\s]')
        return pattern.sub(r'', text)
    elif language == 'hindi':
        pattern = re. compile(r'[^\u0900-\u097F\s]')
        return pattern. sub(r'', text)
    else:
        raise ValueError ("Unsupported Language, Supported languages are 'english' and 'hindi'")

lines['english'] = lines['english'].apply(lambda x: preprocess_text(x, language='english'))
lines['hindi'] = lines['hindi'].apply(lambda x: preprocess_text(x, language='hindi'))

In [40]:
# Get the number of words count in each sentence for hindi and english
lines['length_eng']=lines['english'].apply(lambda x: len(x.split(" ")) )
lines['length_hin']=lines['hindi'].apply(lambda x: len(x.split(" ")))

In [41]:
# Function to check if sentence containg Hindi alphabets in the English column
def remove_hindi_alphabets (text) :
    return re.sub("[\u0900-\u097F]", "", text)
    
# Apply the function to the English column
lines['english'] = lines['english'].apply(remove_hindi_alphabets)
lines = lines.dropna(subset= ["hindi", "english"])
lines

Unnamed: 0,english,hindi,length_eng,length_hin
0,help,बचाओ,1,1
1,jump,उछलो,1,1
2,jump,कूदो,1,1
3,jump,छलांग,1,1
4,hello,नमस्ते।,1,1
...,...,...,...,...
9995,that involves vast and revolutionary changes i...,इसके लिए हमारे राजनैतिक और सामाजिक ढांचे में व...,31,54
9996,when women is on top,जब महिला उपर हो,5,4
9997,a cricket match can be won by a team by making...,क्रिकेट में खेल को ज्यादा रन बना कर भी जीता जा...,31,25
9998,earth outer surface is divided into several ri...,पृथ्वी की बाहरी सतह कई कठोर खंडों या विवर्तन...,102,50


In [42]:
# Remove extra spaces
lines['english']=lines['english'].apply(lambda x: x.strip())
lines['hindi']=lines['hindi'].apply(lambda x: x.strip ())

In [43]:
lines[lines['length_eng'] > 150].shape

(3, 4)

In [44]:
lines[lines['length_hin'] > 150].shape

(8, 4)

In [45]:
lines = lines[lines['length_eng']<=150]
lines = lines[lines['length_hin']<=150]

In [46]:
lines.shape

(9907, 4)

In [47]:
print("maximum length of Hindi Sentence ", max(lines ['length_hin'])) 
print("maximum length of English Sentence ", max (lines ['length_eng']))

maximum length of Hindi Sentence  149
maximum length of English Sentence  150


In [48]:
lines.head()

Unnamed: 0,english,hindi,length_eng,length_hin
0,help,बचाओ,1,1
1,jump,उछलो,1,1
2,jump,कूदो,1,1
3,jump,छलांग,1,1
4,hello,नमस्ते।,1,1


In [49]:
hindi_sentences = lines['hindi'].values.tolist()
english_sentences = lines['english'].values.tolist()

In [50]:
### Get English and Hindi Vocabulary
all_eng_words = set()
for eng in lines['english']:
    all_eng_words.update(eng.split())

all_hindi_words = set ()
for hin in lines['hindi']:
    all_hindi_words.update(hin.split())

In [51]:
hindi_vocab = list(all_hindi_words)
print(len(hindi_vocab))
english_vocab = list(all_eng_words)
print(len(english_vocab))

18298
15841


### Now Writing the hindi, english text and vocab created in above 3 cells

In [52]:
file = open(train_en_file,'w')
for sentence in english_sentences:
    file.write(sentence+"\n")
file.close()

In [53]:
file = open(train_hn_file,'w')
for sentence in hindi_sentences:
    file.write(sentence+"\n")
file.close()

In [54]:
file = open(english_vocab_file, 'w')
for word in english_vocab: 
    file.write(word+"\n")
file.close()

In [55]:
file = open(hindi_vocab_file, 'w')
for word in hindi_vocab:
    file.write (word+"\n")
file.close()

In [56]:
# Delete dataframes to release memory
del lines 
del all_eng_words 
del all_hindi_words 
del hindi_sentences 
del english_sentences 
del hindi_vocab 
del english_vocab
# Run garbage collector
gc.collect()

0

### Reading the hindi, english text and vocab

In [57]:
with open(train_en_file, 'r') as file:
    english_sentences = file.readlines()
english_sentences = [sentence.rstrip('\n') for sentence in english_sentences]
english_sentences[:10]

['help',
 'jump',
 'jump',
 'jump',
 'hello',
 'hello',
 'cheers',
 'cheers',
 'got it',
 'i am ok']

In [58]:
with open(train_hn_file, 'r') as file:
    hindi_sentences = file.readlines()
hindi_sentences = [sentence.rstrip('\n') for sentence in hindi_sentences]
hindi_sentences[: 10]

['बचाओ',
 'उछलो',
 'कूदो',
 'छलांग',
 'नमस्ते।',
 'नमस्कार।',
 'वाहवाह',
 'चियर्स',
 'समझे कि नहीं',
 'मैं ठीक हूँ।']

In [59]:
with open(hindi_vocab_file, 'r') as file:
    hindi_vocab = file.readlines ()
hindi_vocab = [word.rstrip('\n') for word in hindi_vocab]
print(len(hindi_vocab) )
print(hindi_vocab)

18298
['हवाईजहाज़', 'पुस्तकें', 'क्रिसेंट', 'शोधसमूह', 'अभिन्न', 'कर्णकूट', 'गुजरी', 'हैंसेल', 'प्रत्याशी', 'बल्लेबाज', 'वेबवार्ता', 'इच्छामती', 'रूपाकार', 'साम्यवादी', 'हाथः', '१६००', 'चोला', 'नागपुर', 'युद्धग्रस्त', 'साधारणत', 'खिताब', 'आर्गनिक', 'मेजबान', 'ज़ारी', 'गिरनी', 'रनो', 'शोले', 'समाविष्ट', 'भागवती', 'पेशकश', 'अवलोकन', 'एपीजे', 'आज़ादी', 'इज़्जत', 'स्थगन', 'गुणों', 'रेसियल', 'अंधेर', 'चमत्कार', 'कटाई', 'टोरी', 'ग़लतफ़ैमी', 'सलहमशविरा', 'सुलझाने', 'जाद', 'कुरआन', 'अहमियत', 'डेनिश', 'पंत', 'टिहरी', 'प्रजातियां', 'कुडई', 'हुमायुं', 'एकदूसरे', 'कॉण्ट्रैक्ट', 'बताया।', 'पहाड़ी', 'अमोनिया', 'ञुनेरल्', 'क्रमांक', 'प्रांतो', 'चर्चाओं', 'संवेदनशील', 'भार', 'सट्', 'बनेगा', 'पत्तनों', 'रसायन', 'भुलने', 'रह', 'अखबारों', 'एमी', 'बुनें', 'धुए', 'बधाईयाँ।', '६३२', 'अधिरथ', 'अन्योन्य', 'आन्दोलन', 'दूतावास', 'रक्षात्मक', 'आयोजित', 'छोड़ूँ', 'बहुदेववाद', 'गुच्छे', 'नाभा', 'नौकरियां', 'विकृत', 'चोगा', 'सॉफ्टवेर', 'ललीतपुर', 'वैश्विक', 'नववर्ष', 'गफलत', 'उसकाउसकी', 'चाहिएफर्श', 'नवीकरणीय', 'नी

In [60]:
with open (english_vocab_file, 'r') as file:
    english_vocab= file.readlines()
english_vocab = [word.rstrip('\n') for word in english_vocab]
print(len(english_vocab))
print(english_vocab)

15841


In [61]:
START_TOKEN = '<START>'
PADDING_TOKEN = '<PADDING>'
END_TOKEN = '<END>'

In [62]:
hindi_vocab.insert(0, START_TOKEN)
hindi_vocab.append(PADDING_TOKEN)
hindi_vocab.append(END_TOKEN)
print(hindi_vocab)

['<START>', 'हवाईजहाज़', 'पुस्तकें', 'क्रिसेंट', 'शोधसमूह', 'अभिन्न', 'कर्णकूट', 'गुजरी', 'हैंसेल', 'प्रत्याशी', 'बल्लेबाज', 'वेबवार्ता', 'इच्छामती', 'रूपाकार', 'साम्यवादी', 'हाथः', '१६००', 'चोला', 'नागपुर', 'युद्धग्रस्त', 'साधारणत', 'खिताब', 'आर्गनिक', 'मेजबान', 'ज़ारी', 'गिरनी', 'रनो', 'शोले', 'समाविष्ट', 'भागवती', 'पेशकश', 'अवलोकन', 'एपीजे', 'आज़ादी', 'इज़्जत', 'स्थगन', 'गुणों', 'रेसियल', 'अंधेर', 'चमत्कार', 'कटाई', 'टोरी', 'ग़लतफ़ैमी', 'सलहमशविरा', 'सुलझाने', 'जाद', 'कुरआन', 'अहमियत', 'डेनिश', 'पंत', 'टिहरी', 'प्रजातियां', 'कुडई', 'हुमायुं', 'एकदूसरे', 'कॉण्ट्रैक्ट', 'बताया।', 'पहाड़ी', 'अमोनिया', 'ञुनेरल्', 'क्रमांक', 'प्रांतो', 'चर्चाओं', 'संवेदनशील', 'भार', 'सट्', 'बनेगा', 'पत्तनों', 'रसायन', 'भुलने', 'रह', 'अखबारों', 'एमी', 'बुनें', 'धुए', 'बधाईयाँ।', '६३२', 'अधिरथ', 'अन्योन्य', 'आन्दोलन', 'दूतावास', 'रक्षात्मक', 'आयोजित', 'छोड़ूँ', 'बहुदेववाद', 'गुच्छे', 'नाभा', 'नौकरियां', 'विकृत', 'चोगा', 'सॉफ्टवेर', 'ललीतपुर', 'वैश्विक', 'नववर्ष', 'गफलत', 'उसकाउसकी', 'चाहिएफर्श', 'नवीकरणीय'

In [63]:
english_vocab.insert(0, START_TOKEN) 
english_vocab.append(PADDING_TOKEN)
english_vocab.append(END_TOKEN)
print(english_vocab)



In [64]:
index_to_hindi = {k:v for k,v in enumerate(hindi_vocab)}
hindi_to_index = {v:k for k,v in enumerate (hindi_vocab)}
index_to_english = {k:v for k,v in enumerate (english_vocab)}
english_to_index = {v:k for k,v in enumerate(english_vocab)}

In [65]:
index_to_hindi

{0: '<START>',
 1: 'हवाईजहाज़',
 2: 'पुस्तकें',
 3: 'क्रिसेंट',
 4: 'शोधसमूह',
 5: 'अभिन्न',
 6: 'कर्णकूट',
 7: 'गुजरी',
 8: 'हैंसेल',
 9: 'प्रत्याशी',
 10: 'बल्लेबाज',
 11: 'वेबवार्ता',
 12: 'इच्छामती',
 13: 'रूपाकार',
 14: 'साम्यवादी',
 15: 'हाथः',
 16: '१६००',
 17: 'चोला',
 18: 'नागपुर',
 19: 'युद्धग्रस्त',
 20: 'साधारणत',
 21: 'खिताब',
 22: 'आर्गनिक',
 23: 'मेजबान',
 24: 'ज़ारी',
 25: 'गिरनी',
 26: 'रनो',
 27: 'शोले',
 28: 'समाविष्ट',
 29: 'भागवती',
 30: 'पेशकश',
 31: 'अवलोकन',
 32: 'एपीजे',
 33: 'आज़ादी',
 34: 'इज़्जत',
 35: 'स्थगन',
 36: 'गुणों',
 37: 'रेसियल',
 38: 'अंधेर',
 39: 'चमत्कार',
 40: 'कटाई',
 41: 'टोरी',
 42: 'ग़लतफ़ैमी',
 43: 'सलहमशविरा',
 44: 'सुलझाने',
 45: 'जाद',
 46: 'कुरआन',
 47: 'अहमियत',
 48: 'डेनिश',
 49: 'पंत',
 50: 'टिहरी',
 51: 'प्रजातियां',
 52: 'कुडई',
 53: 'हुमायुं',
 54: 'एकदूसरे',
 55: 'कॉण्ट्रैक्ट',
 56: 'बताया।',
 57: 'पहाड़ी',
 58: 'अमोनिया',
 59: 'ञुनेरल्',
 60: 'क्रमांक',
 61: 'प्रांतो',
 62: 'चर्चाओं',
 63: 'संवेदनशील',
 64: 'भार',
 65: 'सट्',
 

In [66]:
index_to_english

{0: '<START>',
 1: 'spirituality',
 2: 'tell',
 3: '1350',
 4: 'exyernal',
 5: 'office',
 6: 'field',
 7: 'disasters',
 8: 'hindienglish',
 9: 'koneri',
 10: 'subtropical',
 11: 'estranged',
 12: 'loksabha',
 13: 'cloister',
 14: 'krupacharya',
 15: 'san',
 16: 'wasted',
 17: 'ellora',
 18: 'wang',
 19: 'antenatal',
 20: 'receptions',
 21: 'housing',
 22: 'marking',
 23: 'financial',
 24: 'buffaloes',
 25: 'storm',
 26: 'processes',
 27: 'far',
 28: 'moves',
 29: 'crack',
 30: 'companion',
 31: 'vivekanand12th',
 32: 'chap',
 33: 'nandikeshwar',
 34: 'alert',
 35: 'tours',
 36: 'returned',
 37: 'collectively',
 38: 'abusers',
 39: 'mulayam',
 40: 'thrissur',
 41: 'interchangeably',
 42: 'obstructing',
 43: 'chickens',
 44: 'site',
 45: 'intake',
 46: 'pandavas',
 47: 'parvati',
 48: 'india',
 49: 'bhayander',
 50: 'snowball',
 51: 'factual',
 52: 'marvel',
 53: 'writerdevelopment',
 54: 'torn',
 55: 'lightful',
 56: '3member',
 57: 'supposing',
 58: 'prevent',
 59: 'polished',
 60: 'ca

In [67]:
print(f"Number of sentences: {len(hindi_sentences)}")
print(f"Number of sentences: {len(english_sentences)}")

Number of sentences: 9907
Number of sentences: 9907


In [68]:
def is_valid_tokens (sentence, vocab) :
    return all(token in vocab for token in sentence.split())
    
def is_valid_length (sentence, max_sequence_length) :
    return len(sentence.split()) < (max_sequence_length - 1) # need to re-add the end token so leaving 1 space
    
valid_sentence_indices = [
    index for index, (hindi_sentence, english_sentence) in enumerate(zip(hindi_sentences, english_sentences))
    if is_valid_length(hindi_sentence, max_sequence_length)
    and is_valid_length(english_sentence, max_sequence_length) 
    and is_valid_tokens(hindi_sentence, hindi_vocab)
]

print(f"Number of sentences: {len(hindi_sentences)}")
print(f"Number of valid sentences: {len(valid_sentence_indices)}")

Number of sentences: 9907
Number of valid sentences: 9905


In [69]:
hindi_sentences = [hindi_sentences[i] for i in valid_sentence_indices]
english_sentences = [english_sentences[i] for i in valid_sentence_indices]
print(len(hindi_sentences))
print(len(english_sentences))

9905
9905


In [70]:
class TextDataset (Dataset):
    def __init__(self, english_sentences, hindi_sentences):
        self.english_sentences = english_sentences
        self.hindi_sentences = hindi_sentences
    
    def __len__(self) :
        return len(self.english_sentences)
    
    def __getitem__(self, idx):
        return self.english_sentences[idx], self.hindi_sentences[idx]

In [71]:
dataset = TextDataset(english_sentences, hindi_sentences)
len(dataset)

9905

## Training the model Model

In [72]:
hn_vocab_size = len(hindi_vocab)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [73]:
transformer = Transformer(
    d_model,
    ffn_hidden,
    num_heads, 
    drop_prob, 
    num_layers, 
    max_sequence_length, 
    hn_vocab_size,
    english_to_index, 
    hindi_to_index,
    START_TOKEN, 
    END_TOKEN, 
    PADDING_TOKEN)

torch.Size([15844, 256])
torch.Size([18301, 256])


In [74]:
criterian = nn.CrossEntropyLoss(ignore_index=hindi_to_index[PADDING_TOKEN],
                                reduction= 'none')
# When computing the loss, we are ignoring cases when the label is the padding token
for params in transformer.parameters():
    if params.dim() > 1:
        nn.init.xavier_uniform_(params)
        
optim = torch.optim.Adam(transformer.parameters(), lr=1e-4)

In [75]:
NEG_INFTY = -1e9

def create_masks(eng_batch, hn_batch):
    num_sentences = len(eng_batch)
    look_ahead_mask = torch.full([max_sequence_length, max_sequence_length] , True)
    look_ahead_mask = torch.triu(look_ahead_mask, diagonal=1)
    encoder_padding_mask = torch.full([num_sentences, max_sequence_length, max_sequence_length] , False)
    decoder_padding_mask_self_attention = torch.full([num_sentences, max_sequence_length, max_sequence_length] , False)
    decoder_padding_mask_cross_attention = torch.full([num_sentences, max_sequence_length, max_sequence_length] , False)
    
    for idx in range(num_sentences):
        eng_sentence_length, hn_sentence_length = len(eng_batch[idx]), len(hn_batch[idx])
        eng_words_to_padding_mask = np.arange(eng_sentence_length + 1, max_sequence_length)
        hn_words_to_padding_mask = np.arange(hn_sentence_length + 1, max_sequence_length)
        encoder_padding_mask[idx, :, eng_words_to_padding_mask] = True
        encoder_padding_mask[idx, eng_words_to_padding_mask, :] = True
        decoder_padding_mask_self_attention[idx, :, hn_words_to_padding_mask] = True
        decoder_padding_mask_self_attention[idx, hn_words_to_padding_mask, :] = True
        decoder_padding_mask_cross_attention[idx,:, eng_words_to_padding_mask] = True
        decoder_padding_mask_cross_attention[idx, hn_words_to_padding_mask, :] = True
    
    encoder_self_attention_mask = torch.where(encoder_padding_mask, NEG_INFTY, 0)
    decoder_self_attention_mask = torch.where(look_ahead_mask + decoder_padding_mask_self_attention, NEG_INFTY, 0)
    decoder_cross_attention_mask = torch.where(decoder_padding_mask_cross_attention, NEG_INFTY, 0)
    # print f" encoder self _attention mask (encoder_self _attention mask size(l}: (encoder self_attention_maskl, :10, :101")
    # print (f"decoder_self_attention mask {decoder_self_attention_mask.size()}: {decoder_self_attention_mask[0, :10, :10]}")
    # print(f" decoder_cross_attention_mask {decoder_cross_attention_mask.size()}: {decoder_cross_attention_mask[0, :10, : 10]}")
    return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask

In [76]:
# transformer.train()
transformer.to(device)
total_loss = 0
num_epochs = 10

for epoch in range(num_epochs):
    print (f"Epoch {epoch}")
    iterator = iter(train_loader)
    for batch_num, batch in enumerate(iterator):
        transformer.train()
        eng_batch, hn_batch = batch
        encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_batch, hn_batch)
        optim.zero_grad()
        hn_predictions = transformer(eng_batch,
                                     hn_batch,
                                     encoder_self_attention_mask.to(device), 
                                     decoder_self_attention_mask.to(device), 
                                     decoder_cross_attention_mask.to(device), 
                                     enc_start_token=False, 
                                     enc_end_token=False, 
                                     dec_start_token=True,
                                     dec_end_token=True)
        labels = transformer.decoder.sentence_embedding.batch_tokenize(hn_batch, start_token=False, end_token=True)
        loss = criterian(
            hn_predictions.view(-1, hn_vocab_size).to(device), 
            labels.view(-1).to(device)
        ).to(device)

        valid_indicies = torch.where(labels.view(-1) == hindi_to_index[PADDING_TOKEN], False, True)
        loss = loss.sum() / valid_indicies.sum()
        loss.backward()
        optim.step()
        #train_losses.append(loss.item())
        if batch_num % 100 == 0:
            print(f"Iteration {batch_num} : {loss.item()}")
            print(f"English: {eng_batch[0]}")
            print(f"Hindi Translation: {hn_batch[0]}")
            hn_sentence_predicted = torch.argmax(hn_predictions[0], axis=1)
            predicted_sentence = ""
            for idx in hn_sentence_predicted:
                if idx == hindi_to_index [END_TOKEN]:
                    break
                predicted_sentence += index_to_hindi[idx.item()] + " " # Add a space after each word
            predicted_sentence = predicted_sentence.strip() # Remove the trailing space

            
            transformer.eval()
            hn_sentence = ("",)
            eng_sentence = ("put folder in the bin",)
            for word_counter in range(max_sequence_length):
                encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_sentence, hn_sentence)
                predictions = transformer(eng_sentence,
                                          hn_sentence,
                                          encoder_self_attention_mask.to(device), 
                                          decoder_self_attention_mask.to(device), 
                                          decoder_cross_attention_mask.to(device), 
                                          enc_start_token=False, 
                                          enc_end_token=False, 
                                          dec_start_token=True,
                                          dec_end_token=True)
                
                next_token_prob_distribution = predictions[0][word_counter]# not actual probs
                next_token_index = torch.argmax(next_token_prob_distribution).item()
                next_token = index_to_hindi[next_token_index]
                hn_sentence = (hn_sentence[0] + next_token + " ", ) # Add a space after each word
                if next_token == END_TOKEN:
                    break
            hn_sentence = (hn_sentence[0].strip(),) # Remove the trailing space
            print(f"Evaluation translation (put folder in the bin) : {hn_sentence}")
            print("--------------------------------------------------------------------")


Epoch 0
Iteration 0 : 9.814488410949707
English: it is easy to add 5 to 10
Hindi Translation: पाँच से दस को जोड़ना बहुत आसान है।
Evaluation translation (put folder in the bin) : ('<END>',)
--------------------------------------------------------------------
Iteration 100 : 7.724740505218506
English: second major dam in ganga river is tihari dam is under tihari development project and is situated in tihari district of uttarakhand
Hindi Translation: गंगा पर निर्मित दूसरा प्रमुख बाँध टिहरी बाँध टिहरी विकास परियोजना का एक प्राथमिक बाँध है जो उत्तराखंड प्रान्त के टिहरी जिले में स्थित है।
Evaluation translation (put folder in the bin) : ('<END>',)
--------------------------------------------------------------------
Epoch 1
Iteration 0 : 7.123497486114502
English: akra kay or about army
Hindi Translation: अग्रकय या प्रतिक्रियाशील सैन्य कार्रवाई
Evaluation translation (put folder in the bin) : ('<END>',)
--------------------------------------------------------------------
Iteration 100 : 7.007

In [77]:
def translate(eng_sentence):
    eng_sentence = (eng_sentence, )
    hn_sentence = ("",)
    for word_counter in range(max_sequence_length):
        encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_sentence, hn_sentence)
        predictions = transformer(eng_sentence,
                                  hn_sentence, 
                                  encoder_self_attention_mask.to(device), 
                                  decoder_self_attention_mask.to(device),
                                  decoder_cross_attention_mask.to(device), 
                                  enc_end_token=False, 
                                  dec_start_token=True, 
                                  dec_end_token=False)
        
        next_token_prob_distribution = predictions[0][word_counter]
        next_token_index = torch.argmax(next_token_prob_distribution).item()
        next_token = index_to_hindi[next_token_index]
        hn_sentence = (hn_sentence[0] + next_token + " ", )
        if next_token == END_TOKEN:
            break
    return hn_sentence[0]
    

In [78]:
translation = translate("He exist there")
print(translation)

मैं <END> 
