In [1]:
from transformers import AutoTokenizer
import torch

In [2]:
# To get lang_id use any of ['<2as>', '<2bn>', '<2en>', '<2gu>', '<2hi>', '<2kn>', '<2ml>', '<2mr>', '<2or>', '<2pa>', '<2ta>', '<2te>']
def token_to_ids():
    bos_id = tokenizer._convert_token_to_id_with_added_voc("<s>") #beginning of sentence
    eos_id = tokenizer._convert_token_to_id_with_added_voc("</s>") #end of sentence
    pad_id = tokenizer._convert_token_to_id_with_added_voc("<pad>") #padding id
    eng_id = tokenizer._convert_token_to_id_with_added_voc("<2en>") #language id
    hindi_id= tokenizer._convert_token_to_id_with_added_voc("<2hi>") #language id

    return (bos_id, eos_id, pad_id, eng_id, hindi_id)



In [3]:
#reading english hindi files containing 6,59,083 lines
# https://www.cfilt.iitb.ac.in/iitb_parallel/iitb_corpus_download/parallel.zip
def read_files():
    with open("./parallel-n/IITB_english.txt","r") as file1:
        eng_file= file1.readlines()

    with open("./parallel-n/IITB_hindi.txt") as file2:
        hindi_file= file2.readlines()

    return (eng_file, hindi_file)


In [4]:
'''
the format of input should be {text} {eos token} {language id}
"I am a boy </s> <2en>"
'''
def change_eng_text(input):
    input= input.split("\n")[0]
    data= input + " </s> " + "<2en>"
    return data


'''
the format of output will be {language id} {text} {eos token}
"<2hi> मैं  एक लड़का हूँ </s>" '''
def change_hindi_text(input):
    input= input.split("\n")[0]
    data= "<2hi> " + input + " </s>"
    return data

In [5]:
def save_tensor(num_of_lines):
    eng_file, hindi_file= read_files()
    #tokenization of both files containing english and hindi text
    tokenizer= AutoTokenizer.from_pretrained("ai4bharat/IndicBART", do_lower_case=True, use_fast=False, keep_accents=True)

    eng_tokens=[]
    hindi_tokens=[]
    #num_of_lines= 659083

    for i in range(num_of_lines):
        eng_text= eng_file[i]
        hindi_text= hindi_file[i]
        
        eng_tokens.append(tokenizer(change_eng_text(eng_text), 
                                    padding=True, truncation=True, 
                                    return_tensors="pt")["input_ids"])
        
        hindi_tokens.append(tokenizer(change_hindi_text(hindi_text), 
                                      padding=True, truncation=True, 
                                      return_tensors="pt")["input_ids"])
        

    torch.save(eng_tokens,"split_eng_tensors.pt")
    torch.save(hindi_tokens,"split_hindi_tensors.pt")

In [6]:
def load_tensors(eng_file, hindi_file):
    eng_tensors= torch.load(eng_file)
    hindi_tensors= torch.load(hindi_file)
    return (eng_tensors, hindi_tensors)

In [7]:
def splitting_data(eng_tensor, hindi_tensor):
    #n= int(0.8*659083)
    #b= int(0.9*659083)

    n= 80
    b= 90

    training_eng_data= eng_tensor[:n]
    val_eng_data= eng_tensor[n:b]
    test_eng_data= eng_tensor[b:]

    training_hindi_data= hindi_tensor[:n]
    val_hindi_data= hindi_tensor[n:b]
    test_hindi_data= hindi_tensor[b:]

    return (training_eng_data, val_eng_data, test_eng_data,
        training_hindi_data, val_hindi_data, test_hindi_data)


In [8]:
#save_tensor(100)

eng, hindi= load_tensors("split_eng_tensors.pt", "split_hindi_tensors.pt")

for i in splitting_data(eng, hindi):
    print(i[:2],"\n")


[tensor([[    2,  7338,  3163, 25690,   494, 12698, 37568,  2781,  6386, 64001,
         64004,     3]]), tensor([[    2, 18941,   536,  5129,    36,   536, 12698, 37568, 42033,  1035,
         64001, 64004,     3]])] 

[tensor([[    2, 31423,    67, 11341, 29783, 64001, 64004,     3]]), tensor([[    2, 30097, 16526, 64001, 64004,     3]])] 

[tensor([[    2,  4440, 64001, 64004,     3]]), tensor([[    2,  4636,  3153, 36355,  2588,  6724, 64001, 64004,     3]])] 

[tensor([[    2, 64006,   452, 51891,    45,  4959,  9760,    86,  8103,    44,
           442,  4150, 64001,     3]]), tensor([[    2, 64006,  4049,    12,  1544,   268,  1394,  4959,  9760,    86,
         36655,    20, 64001,     3]])] 

[tensor([[    2, 64006, 17209,  5618, 64001,     3]]), tensor([[    2, 64006, 31849,   802, 64001,     3]])] 

[tensor([[    2, 64006,  1838, 64001,     3]]), tensor([[    2, 64006,   409,   378, 15212,  1423,  1256,   122, 64001,     3]])] 

