In [1]:
# Importing libraries
import json
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import os

# Importing the BART modules from huggingface/transformers
from transformers import BartTokenizer, BartForConditionalGeneration

# rich: for a better display on terminal
from rich.table import Column, Table
from rich import box
from rich.console import Console

# define a rich console logger
# record=True keeps a copy of everything printed through this console so it
# can be exported later (T5Trainer writes it out with console.save_text).
console = Console(record=True)

# to display dataframe in ASCII format
def display_df(df):
    """Print the first two columns of a dataframe as an ASCII table.

    Args:
        df (pandas.DataFrame): Frame with at least two columns. The first is
            shown under "source_text" and the second under "target_text";
            any further columns are ignored.
    """

    # A fresh Console (created without record=True) is used for the preview,
    # distinct from the module-level recording `console`.
    preview_console = Console()
    table = Table(
        Column("source_text", justify="center"),
        Column("target_text", justify="center"),
        title="Sample Data",
        pad_edge=False,
        box=box.ASCII,
    )

    def _as_cell(value):
        # rich only accepts renderables (e.g. str) or None as table cells, so
        # numeric / NaN values are cast to text instead of raising.
        return value if value is None else str(value)

    for row in df.values.tolist():
        table.add_row(_as_cell(row[0]), _as_cell(row[1]))

    preview_console.print(table)

# training logger to log training progress
# train() appends one (epoch, step, loss) row at each logging step and then
# prints the whole table again.
training_logger = Table(
    Column("Epoch", justify="center"),
    Column("Steps", justify="center"),
    Column("Loss", justify="center"),
    title="Training Status",
    pad_edge=False,
    box=box.ASCII,
)




In [2]:
!nvidia-smi

Wed May  4 01:11:38 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  On   | 00000000:01:00.0 Off |                    0 |
| N/A   27C    P0    52W / 400W |      3MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM...  On   | 00000000:41:00.0 Off |                    0 |
| N/A   59C    P0   311W / 400W |  39212MiB / 40536MiB |     99%      Default |
|       

In [3]:
# Setting up the device for GPU usage
from torch import cuda
# Use the first CUDA device (index 0) when CUDA is available, otherwise the CPU.
device = 'cuda:0' if cuda.is_available() else 'cpu'

In [4]:
class YourDataSetClass(Dataset):
    """
    Custom dataset that reads (source, target) text pairs from a dataframe
    and tokenizes them on access, so it can be wrapped in a DataLoader and
    passed to the network for finetuning.

    """

    def __init__(
        self, dataframe, tokenizer, source_len, target_len, source_text, target_text
    ):
        """
        Initializes a Dataset class

        Args:
            dataframe (pandas.DataFrame): Input dataframe
            tokenizer (transformers.tokenizer): Transformers tokenizer
            source_len (int): Max length of source text
            target_len (int): Max length of target text
            source_text (str): column name of source text
            target_text (str): column name of target text
        """
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = target_len
        self.target_text = self.data[target_text]
        self.source_text = self.data[source_text]

    def __len__(self):
        """returns the length of dataframe"""

        return len(self.target_text)

    def _encode(self, text, max_length):
        """Tokenize one string, truncated and padded to exactly ``max_length``."""
        # padding="max_length" already pads to max_length; the deprecated
        # pad_to_max_length flag that used to be passed alongside it is dropped.
        return self.tokenizer.batch_encode_plus(
            [text],
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

    def __getitem__(self, index):
        """
        Return the tokenized pair at position ``index``.

        Args:
            index (int): Positional index in ``[0, len(self))``.

        Returns:
            dict: ``source_ids``, ``source_mask``, ``target_ids`` and
            ``target_ids_y`` (same values as ``target_ids``), each a 1-D
            ``torch.long`` tensor.
        """

        # Samplers hand out positions, not labels: .iloc keeps lookups correct
        # even when the dataframe index is not 0..n-1 (e.g. after filtering).
        source_text = str(self.source_text.iloc[index])
        target_text = str(self.target_text.iloc[index])

        # collapse runs of whitespace (including newlines) into single spaces
        source_text = " ".join(source_text.split())
        target_text = " ".join(target_text.split())

        source = self._encode(source_text, self.source_len)
        target = self._encode(target_text, self.summ_len)

        # squeeze(0) removes only the batch dimension; a bare squeeze() would
        # also collapse the sequence dimension when the max length is 1.
        source_ids = source["input_ids"].squeeze(0)
        source_mask = source["attention_mask"].squeeze(0)
        target_ids = target["input_ids"].squeeze(0)

        return {
            "source_ids": source_ids.to(dtype=torch.long),
            "source_mask": source_mask.to(dtype=torch.long),
            "target_ids": target_ids.to(dtype=torch.long),
            "target_ids_y": target_ids.to(dtype=torch.long),
        }

In [6]:
def train(epoch, tokenizer, model, device, loader, optimizer):

    """
    Run one pass over ``loader``, updating ``model`` after every batch.

    Args:
        epoch (int): Epoch number, used only for the progress table.
        tokenizer: Tokenizer providing ``pad_token_id``.
        model: Sequence-to-sequence model whose forward pass accepts
            ``input_ids``, ``attention_mask``, ``decoder_input_ids`` and
            ``labels`` and returns the loss as its first output.
        device: Device the batch tensors are moved to.
        loader: Iterable of batches with ``source_ids``, ``source_mask`` and
            ``target_ids`` tensors.
        optimizer: Optimizer stepping the model parameters.
    """

    model.train()
    for step, batch in enumerate(loader, 0):
        y = batch["target_ids"].to(device, dtype=torch.long)
        # Teacher forcing: the decoder is fed every target token but the last
        # and is trained to predict the sequence shifted left by one.
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        # Padding positions are set to -100 so they are excluded from the loss.
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = batch["source_ids"].to(device, dtype=torch.long)
        mask = batch["source_mask"].to(device, dtype=torch.long)

        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            decoder_input_ids=y_ids,
            labels=lm_labels,
        )
        loss = outputs[0]

        if step % 500 == 0:
            # Log a plain number; str(loss) would dump the whole tensor repr
            # (device, grad_fn) into the table.
            training_logger.add_row(str(epoch), str(step), f"{loss.item():.4f}")
            console.print(training_logger)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [7]:
def validate(epoch, tokenizer, model, device, loader):
    """
    Generate text for every batch in ``loader`` and collect it together with
    the decoded inputs and reference targets.

    Every 50th batch a teacher-forced forward pass is also run and its loss
    is printed as a progress indicator.

    Args:
        epoch (int): Current epoch; accepted but not used in the body.
        tokenizer: Tokenizer used for decoding and for ``pad_token_id``.
        model: Sequence-to-sequence model exposing ``generate``.
        device: Device the batch tensors are moved to.
        loader: Iterable of batches with ``source_ids``, ``source_mask`` and
            ``target_ids`` tensors.

    Returns:
        tuple[list[str], list[str], list[str]]: decoded inputs, generated
        predictions and decoded reference targets, in loader order.
    """

    def _decode_batch(token_rows):
        # One decoded string per row, special tokens stripped.
        return [
            tokenizer.decode(row, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for row in token_rows
        ]

    model.eval()
    inputs = []
    predictions = []
    actuals = []
    with torch.no_grad():
        for step, batch in enumerate(loader, 0):
            y = batch['target_ids'].to(device, dtype=torch.long)
            ids = batch['source_ids'].to(device, dtype=torch.long)
            mask = batch['source_mask'].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=ids,
                attention_mask=mask,
                max_length=512,
                min_length=250,
                num_beams=5,
                no_repeat_ngram_size=5,
                repetition_penalty=5.8,
                length_penalty=1,
                early_stopping=True,
            )

            if step % 50 == 0:
                # The shifted decoder inputs / labels are only needed for this
                # periodic loss report, so they are built here rather than for
                # every batch.
                y_ids = y[:, :-1].contiguous()
                lm_labels = y[:, 1:].clone().detach()
                lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
                outputs = model(
                    input_ids=ids,
                    attention_mask=mask,
                    decoder_input_ids=y_ids,
                    labels=lm_labels,
                )
                loss = outputs[0]
                console.print(f'Completed {step}')
                # Print the scalar value instead of the tensor repr.
                console.print(f'loss: {loss.item():.4f}')

            predictions.extend(_decode_batch(generated_ids))
            actuals.extend(_decode_batch(y))
            inputs.extend(_decode_batch(ids))
    return inputs, predictions, actuals


In [8]:
def generate(tokenizer, model, device, loader):
    """
    Generate text for every batch in ``loader``.

    Args:
        tokenizer: Tokenizer used to decode token ids back to text.
        model: Sequence-to-sequence model exposing ``generate``.
        device: Device the batch tensors are moved to.
        loader: Iterable of batches with ``source_ids`` and ``source_mask``
            tensors. Targets are not needed for generation and are not read.

    Returns:
        tuple[list[str], list[str]]: decoded inputs and generated predictions,
        in loader order.
    """

    def _decode_batch(token_rows):
        # One decoded string per row, special tokens stripped.
        return [
            tokenizer.decode(row, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for row in token_rows
        ]

    # The requested cap of 5120 tokens is larger than the position table of
    # models with learned absolute positions (BART among them); decoding past
    # that table fails, so the cap is clamped when the model config reports one.
    max_length = 5120
    position_limit = getattr(getattr(model, "config", None), "max_position_embeddings", None)
    if position_limit is not None:
        max_length = min(max_length, position_limit)
    min_length = min(250, max_length)

    model.eval()
    inputs = []
    predictions = []
    with torch.no_grad():
        for step, batch in enumerate(loader, 0):
            ids = batch['source_ids'].to(device, dtype=torch.long)
            mask = batch['source_mask'].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=ids,
                attention_mask=mask,
                max_length=max_length,
                min_length=min_length,
                num_beams=4,
                no_repeat_ngram_size=5,
                repetition_penalty=5.8,
                length_penalty=1,
                early_stopping=True,
            )

            if step % 50 == 0:
                console.print(f'Completed {step}')

            predictions.extend(_decode_batch(generated_ids))
            inputs.extend(_decode_batch(ids))
    return inputs, predictions


In [9]:
def T5Trainer(
    dataframe, source_text, target_text, model_params, model, tokenizer, output_dir="./outputs/"
):

    """
    Fine-tune ``model`` on the text pairs in ``dataframe`` and save the results.

    The name is historical: the function works with whatever
    sequence-to-sequence ``model`` / ``tokenizer`` pair is passed in.

    Args:
        dataframe (pandas.DataFrame): Data containing the two text columns.
        source_text (str): Column name of the model input text.
        target_text (str): Column name of the text to be generated.
        model_params (dict): Must provide "SEED", "MODEL",
            "MAX_SOURCE_TEXT_LENGTH", "MAX_TARGET_TEXT_LENGTH",
            "TRAIN_BATCH_SIZE", "VALID_BATCH_SIZE", "LEARNING_RATE" and
            "TRAIN_EPOCHS". An optional "TRAIN_SIZE" (fraction of rows used
            for training) defaults to 0.998.
        model: Already-loaded model, expected to be on the module-level ``device``.
        tokenizer: Tokenizer matching ``model``.
        output_dir (str): Directory receiving ``predictions<epoch>.csv``,
            ``model_files/`` and ``logs.txt``. Created if missing.
    """

    # Set random seeds and deterministic pytorch for reproducibility
    seed = model_params["SEED"]
    torch.manual_seed(seed)  # pytorch random seed
    np.random.seed(seed)  # numpy random seed
    torch.backends.cudnn.deterministic = True

    # Every artifact below is written under output_dir, so make sure it exists
    # (the default "./outputs/" is not created anywhere else).
    os.makedirs(output_dir, exist_ok=True)

    # logging
    console.log(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

    # logging
    console.log(f"[Data]: Reading data...\n")

    # Keep only the two relevant columns and show a small sample
    dataframe = dataframe[[source_text, target_text]]
    display_df(dataframe.head(2))

    # Creation of Dataset and Dataloader
    # train_size is the fraction of rows used for training; the rest is held
    # out for validation.
    train_size = model_params.get("TRAIN_SIZE", 0.998)
    train_dataset = dataframe.sample(frac=train_size, random_state=seed)
    val_dataset = dataframe.drop(train_dataset.index).reset_index(drop=True)
    train_dataset = train_dataset.reset_index(drop=True)

    console.print(f"FULL Dataset: {dataframe.shape}")
    console.print(f"TRAIN Dataset: {train_dataset.shape}")
    console.print(f"TEST Dataset: {val_dataset.shape}\n")

    # Creating the Training and Validation dataset for further creation of Dataloader
    training_set = YourDataSetClass(
        train_dataset,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text,
        target_text,
    )
    val_set = YourDataSetClass(
        val_dataset,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text,
        target_text,
    )

    # Defining the parameters for creation of dataloaders
    train_params = {
        "batch_size": model_params["TRAIN_BATCH_SIZE"],
        "shuffle": True,
        "num_workers": 0,
    }

    val_params = {
        "batch_size": model_params["VALID_BATCH_SIZE"],
        "shuffle": False,
        "num_workers": 0,
    }

    # Dataloaders for the training and validation stages
    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)

    # Defining the optimizer that will be used to tune the weights of the network in the training session.
    optimizer = torch.optim.Adam(
        params=model.parameters(), lr=model_params["LEARNING_RATE"]
    )

    # Training loop
    console.log(f"[Initiating Fine Tuning]...\n")

    for epoch in range(model_params["TRAIN_EPOCHS"]):
        train(epoch, tokenizer, model, device, training_loader, optimizer)
        console.log(f"[Initiating Validation]...\n")
        inputs, predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
        # One predictions file per epoch, so earlier epochs are not overwritten
        final_df = pd.DataFrame({'Input': inputs, "Generated Text": predictions, "Actual Text": actuals})
        final_df.to_csv(os.path.join(output_dir, "predictions"+str(epoch)+".csv"))

    console.log(f"[Saving Model]...\n")
    # Saving the model after training
    path = os.path.join(output_dir, "model_files")
    model.save_pretrained(path)
    tokenizer.save_pretrained(path)

    # Export everything recorded by the console so far
    console.save_text(os.path.join(output_dir, "logs.txt"))

    console.log(f"[Validation Completed.]\n")
    console.print(
        f"""[Model] Model saved @ {os.path.join(output_dir, "model_files")}\n"""
    )
    # Report the per-epoch file pattern actually written by the loop above
    # (there is no single predictions.csv).
    console.print(
        f"""[Validation] Generation on Validation data saved @ {os.path.join(output_dir,'predictions<epoch>.csv')}\n"""
    )
    console.print(f"""[Logs] Logs saved @ {os.path.join(output_dir,'logs.txt')}\n""")


In [10]:
# Load the stories: the JSON document is parsed from the first line of the file.
# The context manager closes the file handle as soon as the lines are read
# (the bare open(...).readlines() left it open).
with open('all_stories_14_lines_new_mix_.json', errors='ignore') as stories_file:
    f = stories_file.readlines()
all_scary =  json.loads(f[0])

In [11]:
# Sanity check: number of records parsed from the JSON file.
len(all_scary)

19267

In [12]:
# Alias only (no copy): the cells below refer to the loaded records by this name.
all_poetry_foundation = all_scary

In [13]:
# Same object as all_scary, so the count is unchanged.
len(all_poetry_foundation)

19267

In [17]:
# Text markers used when serialising keyword groups.
special_token = '.'
mask_token = '<MASK>'
eos_token = '</s>'

prompt = 'Generate keywords for the title: '
# A keyword group is masked with as many placeholders as it has entries (up to three).
template = [mask_token] * 3

X_titles, y_keywords, title_set = [], [], []

for story in all_poetry_foundation:
    theme = story['Theme']
    title_set.append(theme)
    title = prompt + theme

    # Paragraph-break markers are skipped; the remaining groups are numbered from 1.
    groups = [group for group in story['keywords'] if group != ['<paragraph>']]

    masked_parts = []
    filled_parts = []
    for number, group in enumerate(groups, start=1):
        label = 'Keywords ' + str(number) + ': '
        masked_parts.append(label + str(template[:len(group)]))
        filled_parts.append(label + str(group))

    joiner = " " + special_token + " "
    masked_text = joiner.join(masked_parts).replace('<paragraph> ', '')
    filled_text = joiner.join(filled_parts).replace('<paragraph> ', '')

    # Source: prompt + title followed by the masked skeleton; target: the filled-in groups.
    X_titles.append(title + '. ' + masked_text + " " + eos_token)
    y_keywords.append(filled_text + " " + eos_token)

In [18]:
# Assemble the (source, target) pairs into a two-column frame.
# The frame is built from a column mapping rather than via np.array(data).T:
# a NumPy string array is fixed-width, so it reserves the longest string's
# width for every cell before pandas converts it back.
data = [X_titles, y_keywords]
df = pd.DataFrame(dict(zip(['title', 'keywords'], data)))
df.head()

Unnamed: 0,title,keywords
0,Generate keywords for the title: The end of re...,"Keywords 1: ['worked', 'reddit', 'long'] . Key..."
1,Generate keywords for the title: No one mourns...,"Keywords 1: ['October', 'silence', 'rode'] . K..."
2,Generate keywords for the title: `` Captain 's...,"Keywords 1: ['Captain', 'Log', 'Recorded'] . K..."
3,Generate keywords for the title: You develop r...,"Keywords 1: ['magic', 'quantum-mechanical', 'm..."
4,Generate keywords for the title: You 've just ...,"Keywords 1: ['hall', 'walk', 'quickly'] . Keyw..."


In [19]:
# Inspect one source string (row label 10005).
df['title'][10005]

"Generate keywords for the title: Wiggle your big toe.... Keywords 1: ['<MASK>', '<MASK>', '<MASK>'] . Keywords 2: ['<MASK>', '<MASK>', '<MASK>'] . Keywords 3: ['<MASK>', '<MASK>', '<MASK>'] . Keywords 4: ['<MASK>', '<MASK>', '<MASK>'] . Keywords 5: ['<MASK>', '<MASK>', '<MASK>'] . Keywords 6: ['<MASK>', '<MASK>', '<MASK>'] . Keywords 7: ['<MASK>', '<MASK>', '<MASK>'] . Keywords 8: ['<MASK>', '<MASK>', '<MASK>'] . Keywords 9: ['<MASK>', '<MASK>', '<MASK>'] . Keywords 10: ['<MASK>', '<MASK>', '<MASK>'] . Keywords 11: ['<MASK>', '<MASK>', '<MASK>'] . Keywords 12: ['<MASK>', '<MASK>', '<MASK>'] . Keywords 13: ['<MASK>', '<MASK>', '<MASK>'] . Keywords 14: ['<MASK>', '<MASK>', '<MASK>'] </s>"

In [20]:
# Inspect the target string of the same row.
df['keywords'][10005]

"Keywords 1: ['paralysis', 'suffer', 'pretty'] . Keywords 2: ['sleep', 'anxiety', 'high'] . Keywords 3: ['quick', 'times', 'crushing'] . Keywords 4: ['awake', 'hallucinating', 'calm'] . Keywords 5: ['Panic', 'made', 'harder'] . Keywords 6: ['Tarantino', 'back', 'picked'] . Keywords 7: ['sweat', 'glued', 'bed'] . Keywords 8: ['Starting', 'move', 'toe'] . Keywords 9: ['tonight', 'woke', 'freezing'] . Keywords 10: ['covers', 'air', 'sleep'] . Keywords 11: ['mattress', 'feel', 'usual'] . Keywords 12: ['instantly', 'anxiety', 'rushed'] . Keywords 13: ['nose', 'shut', 'pulled'] . Keywords 14: ['Wiggle', 'toe', 'big'] </s>"

In [22]:

# let's define model parameters specific to bart
model_params = {
    "TASK" : "0503-mix",  # run tag; appended to the output directory name
    "MODEL": "facebook/bart-large",  # Hugging Face model id passed to from_pretrained
    "TRAIN_BATCH_SIZE": 8,  # training batch size
    "VALID_BATCH_SIZE": 8,  # validation batch size
    "TRAIN_EPOCHS": 6,  # number of training epochs
    "VAL_EPOCHS": 1,  # number of validation epochs (not read by the code visible in this notebook)
    "LEARNING_RATE": 3e-6,  # learning rate
    "MAX_SOURCE_TEXT_LENGTH": 512,  # max length of source text
    "MAX_TARGET_TEXT_LENGTH": 512,  # max length of target text
    "SEED": 42,  # set seed for reproducibility
}


In [15]:
# tokenizer for encoding the text, loaded for the model id in model_params
tokenizer = BartTokenizer.from_pretrained(model_params["MODEL"])



In [16]:
# Show the tokenizer's end-of-sequence token.
tokenizer.eos_token

'</s>'

In [23]:
# Load the pretrained weights for the model id in model_params, then move
# the model to `device` so it runs on the selected hardware.
model = BartForConditionalGeneration.from_pretrained(model_params["MODEL"])
model = model.to(device)

In [24]:
# Token count of one source example
# (presumably a check against MAX_SOURCE_TEXT_LENGTH — confirm).
tokens = tokenizer(df['title'][15005])

len(tokens['input_ids'])

337

In [26]:
# Inspect the target string for row label 15005.
df['keywords'][15005]

"Keywords 1: ['wrong', 'things', 'presence'] . Keywords 2: ['Silhouettes', 'yard', 'steps'] . Keywords 3: ['safe', 'feel', 'cozy'] . Keywords 4: ['air', 'lifted', 'today'] . Keywords 5: ['grounds', 'spirits', 'inhabit'] . Keywords 6: ['mist', 'wrong', 'deeply'] . Keywords 7: ['mind', 'cottage', 'engulfed'] . Keywords 8: ['legs', 'dog', 'tail'] . Keywords 9: ['reached', 'house', 'wall'] . Keywords 10: ['silence', 'air', 'shook'] . Keywords 11: ['rumble', 'thunder', 'localized'] . Keywords 12: ['animal', 'slumber', 'horrible'] . Keywords 13: ['cottage', 'glance', 'dashed'] . Keywords 14: ['ran', 'life', 'cabin'] </s>"

In [25]:
# Token count of the matching target example
# (presumably a check against MAX_TARGET_TEXT_LENGTH — confirm).
# Note: this rebinds `tokens` from the source-side cell.
tokens = tokenizer(df['keywords'][15005])

len(tokens['input_ids'])

224

In [27]:
# Per-run output directory, named after the model id, batch size, learning
# rate and task tag.
# NOTE(review): the model id contains "/", so this is a nested path, and there
# is no separator between the learning rate and the task tag — kept as-is so
# existing run directories keep their names.
output_dir= model_params['MODEL']+"_batch_"+ str(model_params['TRAIN_BATCH_SIZE']) + "_lr_"+ str(model_params['LEARNING_RATE'])+ model_params['TASK']
# exist_ok=True replaces the check-then-create pair: no race between the check
# and the creation, and the cell is safe to re-run.
os.makedirs(output_dir, exist_ok=True)

In [None]:
#GPU usage: 37186 MB for T5 large, batch_size 3
#GPU usage:  MB for T5 base, batch_size 12
# NOTE(review): the figures above mention T5, while model_params["MODEL"] is a
# BART checkpoint — confirm or refresh them.
# Launch fine-tuning: "title" is the model input column, "keywords" the target.
T5Trainer(
    dataframe=df,
    source_text="title",
    target_text="keywords",
    model_params=model_params,
    model = model,
    tokenizer = tokenizer,
    output_dir = output_dir
)
