In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
import numpy as np
from transformers import AdamW, AutoModelForSeq2SeqLM, AutoTokenizer, T5ForConditionalGeneration, T5Tokenizer
from transformers import get_linear_schedule_with_warmup
from tqdm import notebook
import gc
# Make sure Python's automatic garbage collection is switched on.
gc.enable()
# Apply seaborn's default styling to subsequent plots.
sns.set()

In [None]:
# Parallel corpus; later cells read its 'Title' and 'Urdu' columns.
df = pd.read_csv(filepath_or_buffer="Combined.csv")

In [None]:
# Source texts (X) and their Urdu counterparts (Y), row-aligned in `df`.
X, Y = df['Title'], df['Urdu']

In [None]:
def encode_input_str(text, tokenizer, seq_len):
    """Tokenize a source string into a fixed-length row of token ids.

    Args:
        text: Source text handed to ``tokenizer.encode``.
        tokenizer: Object exposing an ``encode`` method that accepts the
            keyword arguments used below.
        seq_len: Forwarded as ``max_length``; together with
            ``padding='max_length'`` and ``truncation=True`` the tokenizer is
            asked to pad or cut the text to this length.

    Returns:
        The first element of whatever ``tokenizer.encode`` returns, i.e. the
        single row of the batch-of-one result.
    """
    encode_options = {
        'return_tensors': 'pt',
        'padding': 'max_length',
        'truncation': True,
        'max_length': seq_len,
    }
    batch_of_one = tokenizer.encode(text=text, **encode_options)
    return batch_of_one[0]

def encode_target_str(text, tokenizer, seq_len):
    """Tokenize a target string into a fixed-length row of token ids.

    Uses the same tokenizer settings as the source-side encoder: PyTorch
    tensors requested, padding to ``max_length``, truncation enabled and
    ``max_length`` set to ``seq_len``.

    Args:
        text: Target text handed to ``tokenizer.encode``.
        tokenizer: Object exposing a compatible ``encode`` method.
        seq_len: Length every encoded target is padded or truncated to.

    Returns:
        The first (and only) row of the tokenizer's batch-of-one output.
    """
    return tokenizer.encode(
        text=text,
        max_length=seq_len,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )[0]

def _is_missing_text(value):
    """Return True for ``None`` or a float NaN (pandas' marker for an empty cell)."""
    # NaN is the only float that is not equal to itself.
    return value is None or (isinstance(value, float) and value != value)


def format_translation_data(x, y, tokenizer, seq_len=128):
    """Encode one (source, target) text pair into token-id sequences.

    Args:
        x: Source text.
        y: Target text.
        tokenizer: Tokenizer forwarded to the encode helpers.
        seq_len: Fixed length each side is padded/truncated to.

    Returns:
        ``(input_token_ids, target_token_ids)``, or ``None`` when either text
        is missing. Missing means ``None`` or float NaN: columns loaded with
        pandas hold NaN, not ``None``, for empty cells, and passing a NaN on
        to the tokenizer would fail instead of skipping the row.
    """
    if _is_missing_text(x) or _is_missing_text(y):
        return None

    input_token_ids = encode_input_str(x, tokenizer, seq_len)
    target_token_ids = encode_target_str(y, tokenizer, seq_len)

    return input_token_ids, target_token_ids

def transform_batch(batch_x, batch_y, tokenizer):
    """Encode a batch of text pairs and stack them into two id tensors.

    Args:
        batch_x: Iterable of source texts (list, pandas Series slice, ...).
        batch_y: Iterable of target texts, aligned with ``batch_x``.
        tokenizer: Tokenizer forwarded to ``format_translation_data``.

    Returns:
        ``(batch_input_ids, batch_target_ids)``: one row per pair that
        ``format_translation_data`` did not reject. The tensors are placed on
        the GPU when CUDA is available and stay on the CPU otherwise.

    Note:
        If every pair is rejected there is nothing to stack and torch raises
        an error; callers should not pass batches made only of missing rows.
    """
    inputs = []
    targets = []
    # Walk the values positionally. A slice of a pandas Series keeps its
    # original index labels, so ``batch_x[i]`` with i starting at 0 is a label
    # lookup that raises KeyError for every batch after the first.
    for source_text, target_text in zip(batch_x, batch_y):
        formatted_data = format_translation_data(source_text, target_text, tokenizer)
        if formatted_data is None:
            continue

        input_ids, target_ids = formatted_data
        inputs.append(input_ids)
        targets.append(target_ids)

    # Pick the device automatically instead of hard-coding .cuda().
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    batch_input_ids = torch.stack(inputs).to(device)
    batch_target_ids = torch.stack(targets).to(device)

    return batch_input_ids, batch_target_ids

def get_data_generator(train_x, train_y, tokenizer, batch_size=16):
    """Lazily yield transformed batches over two aligned sequences.

    Args:
        train_x: Sliceable sequence of source texts.
        train_y: Sliceable sequence of target texts, aligned with ``train_x``.
        tokenizer: Tokenizer forwarded to ``transform_batch``.
        batch_size: Number of consecutive items per slice; the last slice may
            be shorter.

    Yields:
        The result of ``transform_batch`` for each consecutive slice, in order.
    """
    n_examples = len(train_x)
    for start in range(0, n_examples, batch_size):
        stop = start + batch_size
        x_chunk, y_chunk = train_x[start:stop], train_y[start:stop]
        yield transform_batch(x_chunk, y_chunk, tokenizer)
        
def eval_model(model, x, y, tokenizer, max_iters=8):
    """Return the mean loss of ``model`` over the first batches of ``(x, y)``.

    Args:
        model: Module called with ``input_ids``/``labels`` whose output
            exposes a scalar ``loss`` tensor.
        x: Source texts.
        y: Target texts aligned with ``x``.
        tokenizer: Tokenizer forwarded to ``get_data_generator``.
        max_iters: Maximum number of batches to evaluate.

    Returns:
        Mean of the per-batch losses, or NaN when no batch was evaluated.

    The model is switched to eval mode and autograd is disabled for the
    duration of the call, so no dropout is applied and no graph is kept in
    memory. The model's previous train/eval mode is restored afterwards, even
    if evaluation raises.
    """
    test_generator = get_data_generator(x, y, tokenizer)
    eval_losses = []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # zip() stops as soon as range() is exhausted, so no batch beyond
            # max_iters is tokenized and moved to the device only to be dropped.
            for _, (input_batch, label_batch) in zip(range(max_iters), test_generator):
                model_out = model(input_ids=input_batch, labels=label_batch)
                eval_losses.append(model_out.loss.item())
    finally:
        model.train(was_training)

    if not eval_losses:
        # Avoid numpy's "mean of empty slice" warning; the value is still NaN.
        return float('nan')
    return np.mean(eval_losses)

In [None]:
import os

# Location used by the training cell for checkpoints and the exported model.
PATH = 'Models'
# os.mkdir(PATH)

# Optimisation settings.
n_epochs, batch_size = 20, 8
lr = 5e-4

# Logging / checkpoint cadence, counted in batches within an epoch.
print_freq, checkpoint_freq = 50, 25

# Batches per epoch are sized for 80% of the rows in `df`; the scheduler
# length and warm-up (1% of all steps) are derived from that.
n_batches = int(np.ceil(0.8 * len(df) / batch_size))
total_steps = n_batches * n_epochs
n_warmup_steps = int(0.01 * total_steps)

# Per-batch training losses, appended to by the training loop.
losses = list()

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6b")

# The encode helpers call tokenizer.encode(..., padding='max_length'), which
# raises a ValueError when the tokenizer defines no padding token (GPT-2-style
# tokenizers typically ship without one). Reusing the EOS token adds no new
# vocabulary entry, so len(tokenizer) is unchanged.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6b")

In [None]:
# Ask PyTorch to release its unused cached GPU memory before the training cell runs.
torch.cuda.empty_cache()

In [None]:
torch.cuda.empty_cache()
model_path = PATH
saving_path = PATH
max_seq_len = 20
model = model.cuda()

# Resize the embedding matrix BEFORE the optimizer is built. If the resize
# changes the vocabulary size the model gets new embedding parameters, and an
# optimizer created earlier would still hold the replaced ones.
model.resize_token_embeddings(len(tokenizer))
optimizer = AdamW(model.parameters(), lr=lr)
scheduler = get_linear_schedule_with_warmup(optimizer, n_warmup_steps, total_steps)

# The loop used to read lowercase `x`/`y`, which no cell defines (the data
# cell creates `X` and `Y`), so it failed with a NameError on a fresh kernel.
# NOTE(review): `n_batches` is sized for 80% of the rows, so the first 80% is
# used for training and the rest is held out for the periodic "test loss".
# Confirm this is the split you intended.
# NOTE(review): `model` is loaded as a causal LM while the batches are
# (source ids, target ids) pairs as for a seq2seq model - confirm the
# training objective is what you intend.
# Plain lists are used so batches are indexed by position, not by label.
all_x, all_y = list(X), list(Y)
n_train = int(np.ceil(len(all_x) * 0.8))  # mirrors the n_batches formula
train_x, train_y = all_x[:n_train], all_y[:n_train]
test_x, test_y = all_x[n_train:], all_y[n_train:]

# Seeded so the per-epoch shuffling is reproducible across kernel restarts.
shuffle_rng = np.random.default_rng(42)

for epoch_idx in range(n_epochs):
    # from_pretrained() hands the model back in eval mode; enable training
    # behaviour (e.g. dropout) explicitly at the start of every epoch.
    model.train()

    # Randomize data order
    order = shuffle_rng.permutation(n_train)
    epoch_x = [train_x[j] for j in order]
    epoch_y = [train_y[j] for j in order]
    data_generator = get_data_generator(epoch_x, epoch_y, tokenizer, batch_size)

    for batch_idx, (input_batch, label_batch) in notebook.tqdm(enumerate(data_generator), total=n_batches):
        optimizer.zero_grad()

        # Forward pass
        model_out = model(input_ids=input_batch, labels=label_batch)

        # Calculate loss and update weights
        loss = model_out.loss
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print training update info
        if (batch_idx + 1) % print_freq == 0:
            avg_loss = np.mean(losses[-print_freq:])
            print('Epoch: {} | Step: {} | Avg. loss: {:.3f} | lr: {}'.format(
                epoch_idx+1, batch_idx+1, avg_loss, scheduler.get_last_lr()[0]))

        # Periodic checkpoint, scored on the held-out rows rather than on the
        # data the model is being trained on.
        if (batch_idx + 1) % checkpoint_freq == 0:
            test_loss = eval_model(model, test_x, test_y, tokenizer)
            print('Saving model with test loss of {:.3f}'.format(test_loss))
            torch.save(model.state_dict(), model_path+'t5')
            model.save_pretrained(saving_path)

# Write the final weights to the same state-dict file the checkpoints use.
# `model_path` itself is the directory save_pretrained() writes into, so it
# cannot also be opened as a file.
torch.save(model.state_dict(), model_path+'t5')
window_size = 50


#Emptying cache
torch.cuda.empty_cache()

In [None]:

# Export the model under PATH via its save_pretrained() method.
model.save_pretrained(PATH)