In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from multi_stage import LLM
from prep_data import get_eng_hi_dataset
from transformers import GPT2LMHeadModel, MT5Tokenizer

device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
device
# device = torch.device('cpu')

# torch.backends.cudnn.enabled = True
# torch.backends.cudnn.benchmark = True

device(type='cuda', index=1)

<h2>Obtain parallel data (EN-HI)</h2>
<h5>Data is in the form of dictionary with 'en' and 'hi' keys corresponding to english and hindi sentences respectively</h5>

In [2]:
val_split = 0.8
train_data, val_data, test_data = get_eng_hi_dataset(val_split=val_split)

Found cached dataset parquet (/home1/tejomay/.cache/huggingface/datasets/cfilt___parquet/cfilt--iitb-english-hindi-911387c6837f8b91/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
len(train_data), len(val_data), len(test_data)

(1319836, 331787, 2507)

In [4]:
class ParallelCorpus(Dataset):
    def __init__(self, data, src_lang='en', tgt_lang='hi') -> None:
        super(ParallelCorpus, self).__init__()
        self.src = []
        self.tgt = []
        for pair in data:
            self.src.append(pair[src_lang])
            self.tgt.append(pair[tgt_lang])
        
    def __len__(self):
        return len(self.src)
    
    def __getitem__(self, index):
        return self.src[index], self.tgt[index]

train_pc = ParallelCorpus(train_data, src_lang='en', tgt_lang='hi')
test_pc = ParallelCorpus(test_data, src_lang='en', tgt_lang='hi')

<h2>Hyperparameters</h2>

In [2]:
len_prefix = 100
lr = 0.001
beta1 = 0.9
beta2 = 0.98
batch_size = 1
num_epochs = 100
token_limit = (1023 - len_prefix) // 2

In [6]:
train_loader = DataLoader(dataset=train_pc, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset=test_pc, batch_size=batch_size, shuffle=False)

In [3]:
tokenizer = MT5Tokenizer.from_pretrained("THUMT/mGPT")
model = GPT2LMHeadModel.from_pretrained("THUMT/mGPT")
for param in model.parameters():
    param.requires_grad_(False)

MT_model = LLM(model, len_prefix).to(device)
optimizer = torch.optim.Adam(params=MT_model.prefix.parameters(),lr=lr, betas=(beta1, beta2), eps=1e-9)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'T5Tokenizer'.


<h2>Training</h2>

In [10]:
for epoch in range(num_epochs):
    print(f"------------------------EPOCH {epoch + 1}-------------------------------")
    for i, (src, tgt) in enumerate(train_loader):
        MT_model.zero_grad()
        
        max_src_len = min(token_limit, max([len(s) for s in src]))
        max_tgt_len = min(token_limit, max([len(s) for s in tgt]))
        inputs = tokenizer(src, padding='max_length', truncation=True, max_length=max_src_len)
        targets = tokenizer(tgt, padding='max_length', truncation=True, max_length=max_tgt_len)
        input_ids, input_masks = inputs['input_ids'], inputs['attention_mask']
        target_ids, target_masks = targets['input_ids'], targets['attention_mask']
        for j in range(len(target_ids)):
            target_ids[j].insert(0, 1)
            target_masks[j].insert(0, 1)
#         print(len(input_ids[0]))
#         print(MT_model._model.config.max_position_embeddings)
#         print(tgt[0], target_ids[0])
        
        input_ids, input_masks = torch.tensor(input_ids).to(device), torch.tensor(input_masks).to(device)
        target_ids, target_masks = torch.tensor(target_ids).to(device), torch.tensor(target_masks).to(device)
        loss = MT_model(input_ids, input_masks, target_ids, target_masks)
        loss.backward()
        optimizer.step()
        if (i+1)%500 == 0:
            print(f'Step {i+1} | Loss: {loss.item():.5f}')
                

------------------------EPOCH 1-------------------------------
Step 500 | Loss: 2.21830
Step 1000 | Loss: 1.36049
Step 1500 | Loss: 1.52940
Step 2000 | Loss: 1.50761
Step 2500 | Loss: 0.87320
Step 3000 | Loss: 1.94154
Step 3500 | Loss: 0.76754
Step 4000 | Loss: 1.29425
Step 4500 | Loss: 1.22058


KeyboardInterrupt: 

In [4]:
sent = ["How can I help you"]
# tokenizer(sent)
MT_model.translate(device, tokenizer, sent)   



torch.Size([1, 250100])


RuntimeError: The size of tensor a (108) must match the size of tensor b (107) at non-singleton dimension 3

In [17]:
a = ["आप कैसे हैं"]
b = tokenizer(a)["input_ids"]
b

[[3186, 12604, 2205, 1896, 1]]

In [16]:
tokenizer.decode(b[0])

'आप कैसे हैं</s>'

In [None]:
for src, tgt in train_loader:
    print(src)