# Sentence Tokenization

Sentence Tokenization is the first steps we take while working with a transformer neural network. May it be a pre-trained tokenizer like the once's from Hugging Face for a custom defined tokenizer for a new language , tokenization is one of the core concepts in NLP

In [1]:
#----------------Imports----------------#
import torch
import numpy as np

### Data

The data which we will be using for this tutorial is the Enlgish - Hinglish Dataset.

In [2]:
#----------------Create Language files--------------# 
import csv

def extract_columns_to_files(csv_file_path):
    with open(csv_file_path, newline='',encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        headers = next(reader) # extract the headers
        
        # create a new text file for each column
        for i in range(len(headers)):
            filename = headers[i].replace(' ', '_') + '.txt'
            with open(filename, 'w',encoding="utf-8") as outfile:
                for row in reader:
                    outfile.write(row[i])
                # reset reader to the start of the file for the next column
                csvfile.seek(0)
                next(reader)

csv_file_path = '../assets/data/NeuroHack_train.csv'
extract_columns_to_files(csv_file_path)

For Language translation tasks the vocabularies are usually different , but in our case the translation is for English to Hinglish which is in the same latin script.

In [3]:
#--------------Create Vocabularies----------------#
English_file = '../assets/data/English.txt'
Hinglish_file = '../assets/data/Hinglish.txt'

START_TOKEN = '<START>'
PADDING_TOKEN = '<PADDING>'
END_TOKEN = '<END>'

hinglish_vocabulary = [START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
                       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?', '@',
                       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S',
                       'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '‘', ']', '^', '_', '`', '-', 'a', 'b', 'c', 'd', 'e',
                       'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
                       'y', 'z', '{', '|', '}', '~', PADDING_TOKEN, END_TOKEN, '–']

                
english_vocabulary = [START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
                       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?', '@',
                       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S',
                       'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '‘', ']', '^', '_', '`', '-', 'a', 'b', 'c', 'd', 'e',
                       'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
                       'y', 'z', '{', '|', '}', '~', PADDING_TOKEN, END_TOKEN, '–']


In [4]:
text = 'Mera'
list(text)

['M', 'e', 'r', 'a']

Now lets create a dictionary out of these vocabularies so that we can map all the letters

In [5]:
index_to_hinglish = {k:v for k,v in enumerate(hinglish_vocabulary)}
hinglish_to_index = {v:k for k,v in enumerate(hinglish_vocabulary)}
index_to_english = {k:v for k,v in enumerate(english_vocabulary)}
english_to_index = {v:k for k,v in enumerate(english_vocabulary)}

In [6]:
with open(English_file, 'r',encoding='utf-8') as file:
    english_sentences = file.readlines()
with open(Hinglish_file, 'r',encoding='utf-8') as file:
    hinglish_sentences = file.readlines()

# Limit Number of sentences
TOTAL_SENTENCES = 100000
english_sentences = english_sentences[:TOTAL_SENTENCES]
hinglish_sentences = hinglish_sentences[:TOTAL_SENTENCES]
english_sentences = [sentence.rstrip('\n') for sentence in english_sentences]
hinglish_sentences = [sentence.rstrip('\n') for sentence in hinglish_sentences]

In [7]:
english_sentences[:10]

['Program module is a file that contains instructions which are either in the form of source code or machine language.',
 "And to Thamud We sent their brother Sali 'h. He said:' My people! Serve Allah; you have no god other than Him. He brought you into being out of the earth, and has made you dwell in it. So ask Him to forgive you, and do turn towards Him in repentance. Indeed My Lord is near, responsive to prayers.",
 'and, when reminded, do not remember',
 'you won the TED Prize 2011.',
 'He gone to Kerodemal College of Delhi University, where he taken Science Graduate Degree.',
 'Then We awakened them to know which of the two groups could better calculate the length of their stay.',
 'Really move all photos and videos to the trash?',
 'Excavations made in Malabar have proved the existence of chambers in which massive earthenware urns had been kept.',
 'And leave them desolate waste.',
 "Please let me see when you 're online. Thanks!"]

We will limit the input to 350 words as 97%tile sentences have less than 350 words

In [8]:
max(len(x) for x in hinglish_sentences), max(len(x) for x in english_sentences)

(852, 889)

In [9]:
PERCENTILE = 97
print( f"{PERCENTILE}th percentile length Hinglish: {np.percentile([len(x) for x in hinglish_sentences], PERCENTILE)}" )
print( f"{PERCENTILE}th percentile length English: {np.percentile([len(x) for x in english_sentences], PERCENTILE)}" )

97th percentile length Hinglish: 314.1999999999989
97th percentile length English: 284.0499999999997


In [10]:
max_sequence_length = 350

def is_valid_tokens(sentence, vocab):
    for token in list(set(sentence)):
        if token not in vocab:
            return False
    return True

def is_valid_length(sentence, max_sequence_length):
    return len(list(sentence)) < (max_sequence_length - 1) # need to re-add the end token so leaving 1 space

valid_sentence_indicies = []
for index in range(len(hinglish_sentences)):
    hinglish_sentence, english_sentence = hinglish_sentences[index], english_sentences[index]
    if is_valid_length(hinglish_sentence, max_sequence_length) \
      and is_valid_length(english_sentence, max_sequence_length) \
      and is_valid_tokens(hinglish_sentence, hinglish_vocabulary):
        valid_sentence_indicies.append(index)

print(f"Number of sentences: {len(hinglish_sentences)}")
print(f"Number of valid sentences: {len(valid_sentence_indicies)}")

Number of sentences: 2566
Number of valid sentences: 2471


In [11]:
hinglish_sentences = [hinglish_sentences[i] for i in valid_sentence_indicies]
english_sentences = [english_sentences[i] for i in valid_sentence_indicies]

In [12]:
hinglish_sentences[:3]

['module , ek program hoti hai , jismen ya to source code ya machine language ke form men instructions nihit hote hain.',
 'aur jab unhen yad dilaya jata hai , to ve yad nahin karte ,',
 'unhonne bad science karne ke lie ye delhi university ke kirorimal college chale ge jahan inhonne vijnjan graduate ki degree prapt ki.']

### Dataset

Now we will build a dataset (torch or HF). This is needed to carryout for smooth batching process & consistent input format to the model.

In [13]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):

    def __init__(self, english_sentences, hinglish_sentences):
        self.english_sentences = english_sentences
        self.hinglish_sentences = hinglish_sentences

    def __len__(self):
        return len(self.english_sentences)

    def __getitem__(self, idx):
        return self.english_sentences[idx], self.hinglish_sentences[idx]

In [14]:
dataset = TextDataset(english_sentences, hinglish_sentences)

In [15]:
len(dataset)

2471

In [16]:
dataset[1]

('and, when reminded, do not remember',
 'aur jab unhen yad dilaya jata hai , to ve yad nahin karte ,')

### Batch

If we use the dataset now as it is, we will perform an Stochastic Gradient Descent which in our case will take a very very long time to compute. Hence we will use a Batched Gradient Descent.

It is the number of input sentences we will be sending to the model before its loss is final & it can backpropogates and update the gradients. Generally we use anything from 16-64.

In [17]:
batch_size = 16 
train_loader = DataLoader(dataset, batch_size)
iterator = iter(train_loader)

In [18]:
for batch_num, batch in enumerate(iterator):
    print(batch)
    if batch_num > 16:
        break    

[('Program module is a file that contains instructions which are either in the form of source code or machine language.', 'and, when reminded, do not remember', 'He gone to Kerodemal College of Delhi University, where he taken Science Graduate Degree.', 'Then We awakened them to know which of the two groups could better calculate the length of their stay.', 'Really move all photos and videos to the trash?', 'Excavations made in Malabar have proved the existence of chambers in which massive earthenware urns had been kept.', 'And leave them desolate waste.', 'A guidance and a mercy for the well - doers:', 'In France, the news of one deed spreads like a flash and brings some pride to a disillusioned people.', 'refuse him / her the permission to see your presence', 'Disable the new tab page menu for accessing tabs on other devices.', 'Far from securing the ends of justice their pendency or disposal on merits will indubitably defeat the ends of justice.', 'It is He who prevails over His cre

### Tokenization

Now we will map all these batches to their respective values from the dict created above.

In [19]:
def tokenize(sentence, language_to_index, start_token=True, end_token=True):
    sentence_word_indicies = [language_to_index[token] for token in list(sentence)]
    if start_token:
        sentence_word_indicies.insert(0, language_to_index[START_TOKEN])
    if end_token:
        sentence_word_indicies.append(language_to_index[END_TOKEN])
    for _ in range(len(sentence_word_indicies), max_sequence_length):
        sentence_word_indicies.append(language_to_index[PADDING_TOKEN])
    return torch.tensor(sentence_word_indicies)

In [20]:
batch

[('It is indeed a matter of introspection as to how, despite availability of agricultural labour and sizeable arable land, notwithstanding some decline in Asia, many Asian and African countries have witnessed spiralling food prices.',
  'These are the ones who found their fathers steeped in error,',
  'When he came to his Lord with a pure heart [attached to Allah Alone and none else, worshipping none but Allah Alone true Islamic Monotheism, pure from the filth of polytheism].',
  'The rate of interest charged on cash credit and overdraft is relatively much higher than the rate of interest on bank deposits',
  'Loan amount up to Rs. 7. 50 lacs: 10 years.',
  'Ans : Twenty-eight (28),11.71%',
  'Sucharita said, Does respectful regard always show us the truth?',
  'International usages are adopted in international communication by most of the countries of the world.',
  'No such items having value of Rs. 10 lakh or more has been procured by PMO during the financial year 2015-16.',
  'It a

In [21]:
eng_tokenized, hing_tokenized = [], []
for sentence_num in range(batch_size):
    eng_sentence, hing_sentence = batch[0][sentence_num], batch[1][sentence_num]
    eng_tokenized.append( tokenize(eng_sentence, english_to_index, start_token=False, end_token=False) )
    hing_tokenized.append( tokenize(hing_sentence, hinglish_to_index, start_token=True, end_token=True) )
eng_tokenized = torch.stack(eng_tokenized)
hing_tokenized = torch.stack(hing_tokenized)

In [22]:
eng_tokenized

tensor([[41, 85,  1,  ..., 96, 96, 96],
        [52, 73, 70,  ..., 96, 96, 96],
        [55, 73, 70,  ..., 96, 96, 96],
        ...,
        [57, 70, 84,  ..., 96, 96, 96],
        [54, 74, 69,  ..., 96, 96, 96],
        [48, 86, 67,  ..., 96, 96, 96]])

### Masking

For the Encoder inputs we dont need masked tokens , but while using the decoder in inference we won’t have the next words to look at as they are not yet predicted yet.

Therefore we generated masked tokens to input to the decoder while training.

We fill the mask with 0 & -infinity values in an upper trangular matrix of the shape (len_of_sentence ,batch_size). This when passed through the softmax funnction of the transformer will be converted into $e^0$ → 1   & $e^{-\inf}$ → 0. Which means the model can not understand meaning of the values which are zero.

In [23]:
NEG_INFTY = -1e9

def create_masks(eng_batch, kn_batch):
    num_sentences = len(eng_batch)
    look_ahead_mask = torch.full([max_sequence_length, max_sequence_length] , True)
    look_ahead_mask = torch.triu(look_ahead_mask, diagonal=1)
    encoder_padding_mask = torch.full([num_sentences, max_sequence_length, max_sequence_length] , False)
    decoder_padding_mask_self_attention = torch.full([num_sentences, max_sequence_length, max_sequence_length] , False)
    decoder_padding_mask_cross_attention = torch.full([num_sentences, max_sequence_length, max_sequence_length] , False)

    for idx in range(num_sentences):
      eng_sentence_length, kn_sentence_length = len(eng_batch[idx]), len(kn_batch[idx])
      eng_chars_to_padding_mask = np.arange(eng_sentence_length + 1, max_sequence_length)
      kn_chars_to_padding_mask = np.arange(kn_sentence_length + 1, max_sequence_length)
      encoder_padding_mask[idx, :, eng_chars_to_padding_mask] = True
      encoder_padding_mask[idx, eng_chars_to_padding_mask, :] = True
      decoder_padding_mask_self_attention[idx, :, kn_chars_to_padding_mask] = True
      decoder_padding_mask_self_attention[idx, kn_chars_to_padding_mask, :] = True
      decoder_padding_mask_cross_attention[idx, :, eng_chars_to_padding_mask] = True
      decoder_padding_mask_cross_attention[idx, kn_chars_to_padding_mask, :] = True

    encoder_self_attention_mask = torch.where(encoder_padding_mask, NEG_INFTY, 0)
    decoder_self_attention_mask =  torch.where(look_ahead_mask + decoder_padding_mask_self_attention, NEG_INFTY, 0)
    decoder_cross_attention_mask = torch.where(decoder_padding_mask_cross_attention, NEG_INFTY, 0)
    print(f"encoder_self_attention_mask {encoder_self_attention_mask.size()}: {encoder_self_attention_mask[0, :10, :10]}")
    print(f"decoder_self_attention_mask {decoder_self_attention_mask.size()}: {decoder_self_attention_mask[0, :10, :10]}")
    print(f"decoder_cross_attention_mask {decoder_cross_attention_mask.size()}: {decoder_cross_attention_mask[0, :10, :10]}")
    return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask

In [24]:
create_masks(batch[0], batch[1])

encoder_self_attention_mask torch.Size([16, 350, 350]): tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
decoder_self_attention_mask torch.Size([16, 350, 350]): tensor([[ 0.0000e+00, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09,
         -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00, -1.0000e+09, -1.0000e+09, -1.0000e+09,
         -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00, -1.0000e+09, -1.0000e+09,
         -1.000

(tensor([[[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          ...,
          [-1.0000e+09, -1.0000e+09, -1.0000e+09,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          [-1.0000e+09, -1.0000e+09, -1.0000e+09,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          [-1.0000e+09, -1.0000e+09, -1.0000e+09,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09]],
 
         [[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          ...,
    

# Class

In [25]:
from torch import nn

In [26]:
class SentenceEmbedding(nn.Module):
    "For a given sentence, create an embedding"
    def __init__(self, max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN):
        super().__init__()
        self.vocab_size = len(language_to_index)
        self.max_sequence_length = max_sequence_length
        self.embedding = nn.Embedding(self.vocab_size, d_model)
        self.language_to_index = language_to_index
        self.position_encoder = PositionalEncoding(d_model, max_sequence_length)
        self.dropout = nn.Dropout(p=0.1)
        self.START_TOKEN = START_TOKEN
        self.END_TOKEN = END_TOKEN
        self.PADDING_TOKEN = PADDING_TOKEN
    
    def batch_tokenize(self, batch, start_token=True, end_token=True):

        def tokenize(sentence, start_token=True, end_token=True):
            sentence_word_indicies = [self.language_to_index[token] for token in list(sentence)]
            if start_token:
                sentence_word_indicies.insert(0, self.language_to_index[self.START_TOKEN])
            if end_token:
                sentence_word_indicies.append(self.language_to_index[self.END_TOKEN])
            for _ in range(len(sentence_word_indicies), self.max_sequence_length):
                sentence_word_indicies.append(self.language_to_index[self.PADDING_TOKEN])
            return torch.tensor(sentence_word_indicies)

        tokenized = []
        for sentence_num in range(len(batch)):
            tokenized.append( tokenize(batch[sentence_num], start_token, end_token) )
        tokenized = torch.stack(tokenized)
        return tokenized.to(get_device())
    
    def forward(self, x, end_token=True): # sentence
        x = self.batch_tokenize(x ,end_token)
        x = self.embedding(x)
        pos = self.position_encoder().to(get_device())
        x = self.dropout(x + pos)
        return x