In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import os

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

from rich.table import Column, Table
from rich import box
from rich.console import Console

from torch import cuda

# Run on the GPU when torch reports one, otherwise stay on the CPU.
device = 'cpu'
if cuda.is_available():
  device = 'cuda'

# Shared rich console. record=True keeps what is printed so that it can be
# written out later with console.save_text().
console = Console(record=True)

def display_df(df):
  """Print the first two columns of ``df`` as an ASCII table titled "Sample Data"."""

  # A fresh, non-recording console is used here instead of the module-level one.
  local_console = Console()
  columns = (
      Column("source_text", justify="center"),
      Column("target_text", justify="center"),
  )
  table = Table(*columns, title="Sample Data", pad_edge=False, box=box.ASCII)

  for row in df.values.tolist():
    source_cell, target_cell = row[0], row[1]
    table.add_row(source_cell, target_cell)

  local_console.print(table)

# Module-level table that train() appends a row to on every logged step.
training_logger = Table(
    Column("Epoch", justify="center"),
    Column("Steps", justify="center"),
    Column("Loss", justify="center"),
    title="Training Status",
    pad_edge=False,
    box=box.ASCII,
)

class YourDataSetClass(Dataset):
  """
  Dataset that tokenizes (source, target) text pairs from a dataframe on demand.

  Each item is a dict with the keys ``source_ids``, ``source_mask``,
  ``target_ids`` and ``target_ids_y`` (the last two hold the same target token
  ids), all converted to ``torch.long``.

  Args:
    dataframe: table holding the text columns (typically a pandas DataFrame).
    tokenizer: object exposing ``batch_encode_plus`` (e.g. a T5 tokenizer).
    source_len: ``max_length`` used when encoding the source text.
    target_len: ``max_length`` used when encoding the target text.
    source_text: name of the column holding the source text.
    target_text: name of the column holding the target text.
  """

  def __init__(self, dataframe, tokenizer, source_len, target_len, source_text, target_text):
    self.tokenizer = tokenizer
    self.data = dataframe
    self.source_len = source_len
    self.summ_len = target_len
    self.target_text = self.data[target_text]
    self.source_text = self.data[source_text]

  def __len__(self):
    """Return the number of (source, target) pairs."""
    return len(self.target_text)

  @staticmethod
  def _value_at(column, position):
    """Return the entry of ``column`` at ``position`` (positional, not by index label)."""
    # A DataLoader hands out positions 0..len-1. Looking those up as index
    # *labels* breaks on frames that were sampled, filtered or sorted without a
    # reset_index(); ``.iloc`` is positional. Plain sequences have no ``.iloc``
    # and are indexed directly.
    return getattr(column, "iloc", column)[position]

  def _encode(self, text, max_length):
    """Tokenize one string, padded/truncated to ``max_length``, as 'pt' tensors."""
    # Only ``padding="max_length"`` is passed: the older ``pad_to_max_length``
    # flag is deprecated and redundant next to it.
    return self.tokenizer.batch_encode_plus(
        [text],
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors='pt',
    )

  def __getitem__(self, index):
    """Tokenize and return the pair stored at position ``index``."""
    source_text = str(self._value_at(self.source_text, index))
    target_text = str(self._value_at(self.target_text, index))

    # Collapse every run of whitespace (including newlines) into a single space.
    source_text = ' '.join(source_text.split())
    target_text = ' '.join(target_text.split())

    source = self._encode(source_text, self.source_len)
    target = self._encode(target_text, self.summ_len)

    # Drop only the leading batch axis of size one added by batch_encode_plus;
    # a bare squeeze() would also collapse a sequence axis of length one.
    source_ids = source['input_ids'].squeeze(0)
    source_mask = source['attention_mask'].squeeze(0)
    target_ids = target['input_ids'].squeeze(0)

    return {
        'source_ids': source_ids.to(dtype=torch.long),
        'source_mask': source_mask.to(dtype=torch.long),
        'target_ids': target_ids.to(dtype=torch.long),
        'target_ids_y': target_ids.to(dtype=torch.long)
    }

def train(epoch, tokenizer, model, device, loader, optimizer):

  """
  Run one training epoch over ``loader``, updating ``model`` in place.

  On every 10th step (starting with step 0) a row with the epoch, the step and
  the scalar loss is added to the module-level ``training_logger`` table, which
  is then printed on the module-level ``console``.

  Args:
    epoch: epoch number, used only for logging.
    tokenizer: provides ``pad_token_id`` for masking padded label positions.
    model: model called with ``input_ids``, ``attention_mask`` and ``labels``;
      the first element of its output is used as the loss.
    device: device the batch tensors are moved to.
    loader: iterable of batches with ``source_ids``, ``source_mask`` and
      ``target_ids`` entries.
    optimizer: optimizer stepped once per batch.
  """

  model.train()
  for step, batch in enumerate(loader):
    input_ids = batch['source_ids'].to(device, dtype=torch.long)
    attention_mask = batch['source_mask'].to(device, dtype=torch.long)

    # clone() so the batch tensor itself is never modified (``.to`` may return
    # the very same tensor when no conversion is needed).
    labels = batch['target_ids'].to(device, dtype=torch.long).clone()
    # -100 marks padded positions as ignored by the loss.
    labels[labels == tokenizer.pad_token_id] = -100

    # Only ``labels`` is passed: T5 derives the decoder inputs itself by
    # shifting the labels right and prepending the decoder start token. Slicing
    # the targets by hand instead left the first target token unsupervised and
    # trained without the start token that generate() begins with.
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs[0]

    if step % 10 == 0:
      # Log the plain number rather than the tensor repr (device, grad_fn, ...).
      training_logger.add_row(str(epoch), str(step), str(loss.item()))
      console.print(training_logger)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

def validate(epoch, tokenizer, model, device, loader, num_return_sequences=10):

  """
  Generate text for every batch in ``loader`` and decode it next to the references.

  Args:
    epoch: accepted for call compatibility; not used.
    tokenizer: used to decode generated and reference token ids.
    model: model exposing ``eval()`` and ``generate()``.
    device: device the batch tensors are moved to.
    loader: iterable of batches with ``source_ids``, ``source_mask`` and
      ``target_ids`` entries.
    num_return_sequences: candidates generated per input (default 10, the value
      previously hard-coded). Must not exceed the beam count of 20.

  Returns:
    ``(predictions, actuals)``: two lists of equal length. Each reference text
    is repeated ``num_return_sequences`` times so that it lines up with every
    candidate generated for the same input.
  """
  model.eval()
  predictions = []
  actuals = []

  def decode(token_ids):
    return tokenizer.decode(token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

  with torch.no_grad():
    for step, batch in enumerate(loader):
      y = batch['target_ids'].to(device, dtype=torch.long)
      ids = batch['source_ids'].to(device, dtype=torch.long)
      mask = batch['source_mask'].to(device, dtype=torch.long)

      generated_ids = model.generate(
          input_ids=ids,
          attention_mask=mask,
          max_length=150,
          num_beams=20,
          repetition_penalty=2.5,
          length_penalty=1.0,
          early_stopping=True,
          num_return_sequences=num_return_sequences,
      )
      preds = [decode(g) for g in generated_ids]
      targets = [decode(t) for t in y]
      if step % 10 == 0:
        console.print(f'Completed {step}')

      predictions.extend(preds)
      # generate() yields num_return_sequences consecutive rows per input.
      # Without repeating the references the two lists differ in length and
      # cannot be placed side by side in one table.
      for reference in targets:
        actuals.extend([reference] * num_return_sequences)
  return predictions, actuals

def T5Trainer(train_dataset, val_dataset, source_text, target_text, model_params, output_dir="./outputs/"):

  """
  Fine-tune a T5 model on ``train_dataset`` and generate on ``val_dataset``.

  Writes three artefacts below ``output_dir``: ``model_files/`` (model and
  tokenizer), ``predictions.csv`` (generated vs. actual text) and ``logs.txt``
  (the recorded console output).

  Args:
    train_dataset: dataframe with the training rows.
    val_dataset: dataframe with the rows to generate predictions for.
    source_text: name of the column holding the input text.
    target_text: name of the column holding the target text.
    model_params: dict with the keys ``MODEL``, ``SEED``,
      ``MAX_SOURCE_TEXT_LENGTH``, ``MAX_TARGET_TEXT_LENGTH``,
      ``TRAIN_BATCH_SIZE``, ``VALID_BATCH_SIZE``, ``LEARNING_RATE``,
      ``TRAIN_EPOCHS`` and ``VAL_EPOCHS``.
    output_dir: directory the artefacts are written to; created if missing.
  """

  # Make sure the output location exists before anything is written into it.
  if output_dir:
    os.makedirs(output_dir, exist_ok=True)
  model_dir = os.path.join(output_dir, "model_files")
  predictions_path = os.path.join(output_dir, 'predictions.csv')
  logs_path = os.path.join(output_dir, 'logs.txt')

  # Set random seeds and deterministic pytorch for reproducibility
  torch.manual_seed(model_params["SEED"]) # pytorch random seed
  np.random.seed(model_params["SEED"]) # numpy random seed
  torch.backends.cudnn.deterministic = True

  # logging
  console.log(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

  # tokenizer for encoding the text
  tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

  # Load the pretrained checkpoint named by model_params["MODEL"] and move it
  # to the module-level ``device``.
  model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
  model = model.to(device)

  # logging
  console.log("[Data]: Reading data...\n")

  # The dataset wrapper is indexed with positions 0..len-1, so give both frames
  # a clean positional index (callers may pass sampled or sorted frames).
  train_dataset = train_dataset.reset_index(drop=True)
  val_dataset = val_dataset.reset_index(drop=True)

  console.print(f"TRAIN Dataset: {train_dataset.shape}")
  console.print(f"TEST Dataset: {val_dataset.shape}\n")

  # Creating the Training and Validation dataset for further creation of Dataloader
  training_set = YourDataSetClass(train_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"], source_text, target_text)
  val_set = YourDataSetClass(val_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"], source_text, target_text)

  # Defining the parameters for creation of dataloaders
  train_params = {
      'batch_size': model_params["TRAIN_BATCH_SIZE"],
      'shuffle': True,
      'num_workers': 0
      }

  val_params = {
      'batch_size': model_params["VALID_BATCH_SIZE"],
      'shuffle': False,
      'num_workers': 0
      }

  # Dataloaders for the training and validation stages.
  training_loader = DataLoader(training_set, **train_params)
  val_loader = DataLoader(val_set, **val_params)

  # Defining the optimizer that will be used to tune the weights of the network in the training session.
  optimizer = torch.optim.Adam(params=model.parameters(), lr=model_params["LEARNING_RATE"])

  # Training loop
  console.log('[Initiating Fine Tuning]...\n')

  for epoch in range(model_params["TRAIN_EPOCHS"]):
    train(epoch, tokenizer, model, device, training_loader, optimizer)

  console.log("[Saving Model]...\n")
  # Saving the model after training
  model.save_pretrained(model_dir)
  tokenizer.save_pretrained(model_dir)

  # evaluating test dataset
  console.log("[Initiating Validation]...\n")
  for epoch in range(model_params["VAL_EPOCHS"]):
    predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
    final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
    # Each validation epoch overwrites the file written by the previous one.
    final_df.to_csv(predictions_path)

  console.log("[Validation Completed.]\n")
  console.print(f"""[Model] Model saved @ {model_dir}\n""")
  console.print(f"""[Validation] Generation on Validation data saved @ {predictions_path}\n""")

  # Export the recorded console output only now, so that the completion
  # messages above are part of logs.txt as well.
  console.save_text(logs_path)
  console.print(f"""[Logs] Logs saved @ {logs_path}\n""")

In [3]:
# Check whether torch can see a CUDA device in this session.
from torch import cuda
cuda.is_available()

False

In [9]:
import json
from nltk import tokenize
def split_into_sentences(text):
    """
    Split ``text`` into its first sentence and the remainder.

    Returns a ``(first_sentence, rest)`` tuple, where ``rest`` is every later
    sentence joined with single spaces. Raises IndexError when the sentence
    tokenizer returns no sentences.
    """
    sentences = tokenize.sent_tokenize(text)
    first_sentence = sentences[0]
    remainder = ' '.join(sentences[1:])
    return first_sentence, remainder

def wiki_to_fine_tune_data(wiki_path):
    df = pd.DataFrame()
    f = open(wiki_path)
    for line in f:
        try:
            line = json.loads(line)
            first_sentence, completion = split_into_sentences(line['text'])
            prompt = line['title'] + '\n' + first_sentence
            dic = {'prompt': prompt, 'completion': completion}
            df = df.append(dic, ignore_index=True)
        except Exception as e:
            print(e)
    print(len(df))
    df = df.sort_values(by='completion', key = lambda x: x.str.len(), ascending= False)
    return df

In [2]:
# Hyper-parameters consumed by T5Trainer.
model_params = dict(
    MODEL="t5-base",              # checkpoint name, e.g. t5-base / t5-large
    TRAIN_BATCH_SIZE=4,           # batch size used while training
    VALID_BATCH_SIZE=16,          # batch size used while validating
    TRAIN_EPOCHS=10,              # passes over the training data
    VAL_EPOCHS=1,                 # passes over the validation data
    LEARNING_RATE=1e-4,           # optimizer learning rate
    MAX_SOURCE_TEXT_LENGTH=100,   # max token length of the source text
    MAX_TARGET_TEXT_LENGTH=512,   # max token length of the target text
    SEED=42,                      # random seed for reproducibility
)

# The CSV files loaded below were produced once with the following steps:
# train_data_path = '/scratch/yd2481/wiki/wiki_data/wiki_fine_tune_2.json'
# test_data_path = '/scratch/yd2481/wiki/wiki_data_final_3000.json'
# train_data = wiki_to_fine_tune_data(train_data_path)[:10000]
# test_data = wiki_to_fine_tune_data(test_data_path)[:2000]
# train_data.to_csv('/scratch/yd2481/wiki/wiki_data/wiki_fine_tune.csv')
# test_data.to_csv('/scratch/yd2481/wiki/wiki_data/wiki_test.csv')

train_data = pd.read_csv('/scratch/yd2481/wiki/wiki_data/wiki_fine_tune.csv')
test_data = pd.read_csv('/scratch/yd2481/wiki/wiki_data/wiki_test.csv')
print('data processed')

# Fine-tune on the "prompt" -> "completion" pairs and write all artefacts to the run directory.
T5Trainer(
    train_data,
    test_data,
    source_text="prompt",
    target_text="completion",
    model_params=model_params,
    output_dir="/scratch/yd2481/wiki/t5/t5_trained_3rd_time",
)

data processed


KeyboardInterrupt: 

In [5]:
# Load a predictions CSV for manual inspection.
# NOTE(review): this path is not inside the output_dir passed to T5Trainer in
# the run cell - confirm which run produced this file.
df = pd.read_csv('/scratch/yd2481/wiki/t5/predictions.csv')

In [31]:
# Show the generated text stored in row 3 of the predictions file.
df['Generated Text'][3]

'on Romanticism were characterized by the Marxist-Leninist view of capitalism. Marx and Engels considered the criticisms of capitalism to be "half lamentation, half lampoon; at other times it is a mockery of the present, striking the bourgeoisie to the very heart." Marx and Engels called the criticisms of capitalism "feudal socialism", which they described as "the most virulent critique of capitalism in history": "The most virulent critique of capitalism was that of the past, but also of the present," and thus of the future. Marx and Engels rejected the notions of feudal socialism as an'

In [32]:
# Show the reference text stored in row 3 of the predictions file.
df['Actual Text'][3]

'Pyotr Semyonovich Kogan, on the other hand, believed that the Romantics "were, thanks to the strength of their criticism, able to discover many errors of the Enlightenment, which forced progressive writers to proceed more cautiously and not repeat the mistakes of the past." For A. Vishnevsky, "the pathos of Romantic art lies in exposing the disharmony of the modern world, in an unaccountable striving for the integrity of human development and harmonious social relations. However, the struggle against the ugliness and philistinism of capitalist civilization takes on a reactionary-utopian character among the Romantics; illusory dreaminess and inability to sober objective study and depiction of reality are typical of Romantic art in general. These features of art show Romanticism\'s departure from the tasks of realistic art, from the demand for artistic reflection of the real conditions of human historical activity. Due to this, the irrational and religious-mystical principle becomes an 

In [33]:
# Show the prompt of test row 3.
# NOTE(review): presumably the input behind predictions row 3 - confirm the rows line up.
test_data['prompt'][3]

'Marxist-Leninist views on Romanticism\nKarl Marx and Friedrich Engels considered the Romantic-aristocratic critiques of capitalism as belonging to the current they called feudal socialism: "half lamentation, half lampoon; half an echo of the past, half menace of the future; at times, by its bitter, witty and incisive criticism, striking the bourgeoisie to the very heart’s core; but always ludicrous in its effect, through total incapacity to comprehend the march of modern history."'

In [22]:
# Show the prompt of the first training row.
train_data['prompt'][0]

'Barrett House (Poughkeepsie, New York)\nBarrett House is a historic home located at Poughkeepsie, Dutchess County, New York, today home to Barrett Art Center.'

In [35]:
# Show the completion of the first training row.
train_data['completion'][0]

"This triple-landmark  (National, State, and municipal) Greek Revival brick townhouse was built in the early 1840s. The Barrett House reflects three phases of construction. The original building is a ca. 1842 three-story, three-bay by four-bay Greek Revival brick house with a side-gabled, stepped roof. A two-story, three-bay by two-bay, front-gabled brick addition was constructed to its rear ca. 1867. In the twentieth century, Barrett House achieved notoriety as the family home of Poughkeepsie-born WPA muralist Thomas Weeks Barrett. Jr. (1902-1947), who founded the Dutchess County Art Association (DCAA) in 1935 and lived there until his death in 1947. His artwork, family archive, and DCAA records remain in the house today. Thomas W. Barrett, Jr. graduated from the School of the Museum of Fine Arts, Boston in 1926, but his energies and artwork centered on the Hudson Valley. Barrett worked professionally as a designer, painter, printmaker, and as a muralist for the Treasury Relief Art Pr