In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import os
import re
import pandas as pd
import xml.etree.ElementTree as ET
import numpy as np
import torch
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
import time
import random

In [None]:
def set_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch with the same value.

    When a CUDA device is available, the CUDA generators are seeded too.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed every RNG once, before anything stochastic runs in the notebook.
set_seed(42)

In [2]:
# Load the tokenizer and the seq2seq model for the "google/mt5-base" checkpoint.
# `tokenizer` is the object later handed to the `webnlg` dataset class.
tokenizer = AutoTokenizer.from_pretrained("google/mt5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-base")

In [3]:
class webnlg(Dataset):
    """Torch ``Dataset`` of (RDF triples, reference sentence) pairs built from WebNLG XML files.

    Each item is stored as ``[source_encoding, target_encoding]`` where both
    encodings come from ``tokenizer.encode_plus(..., max_length=512,
    padding="max_length", truncation=True, return_tensors='pt')``.

    Parameters
    ----------
    tokenizer : object exposing ``encode_plus`` (e.g. a transformers tokenizer).
    path_data : str
        Directory holding one sub-directory per triple count, each with XML files.
        Also used as the prefix of the saved ``_dataframe.csv`` / ``_dataset.npy``.
    save_df, save_dataset : bool
        Whether to write the computed DataFrame / dataset next to ``path_data``.
    load_path_df, load_path_dataset : str
        When non-empty, load the DataFrame (CSV) / dataset (``.npy``) from there
        instead of recomputing it.
    keep_df : bool
        When False, ``self.df`` is reset to ``None`` once the dataset is ready.
    verbose : bool
        Print progress information.
    """

    def __init__(self, tokenizer, path_data,
                 save_df=True, save_dataset=True,
                 load_path_df="", load_path_dataset="",
                 keep_df=True, verbose=True):
        self.verbose = verbose
        self.tokenizer = tokenizer
        # Define both attributes up front: the original ran `del self.df` even
        # when `df` had never been assigned (load_path_dataset set and
        # keep_df=False), which raised AttributeError.
        self.df = None
        self.dataset = None

        if load_path_df:
            self.df = pd.read_csv(load_path_df)
        elif load_path_dataset and not keep_df:
            pass  # no need to load nor compute df
        else:
            self.make_df(path_data, save_df)

        if load_path_dataset:
            # NOTE(security): allow_pickle=True unpickles arbitrary objects;
            # only load dataset files that this notebook produced itself.
            self.dataset = np.load(load_path_dataset, allow_pickle=True)
        else:
            self.make_dataset(path_data, save_dataset)
        if not keep_df:
            self.df = None

    def __len__(self):
        """Return the number of (source, target) pairs."""
        return len(self.dataset)

    def _tokenize(self, text):
        """Encode ``text`` with the instance tokenizer, padded/truncated to 512 tokens."""
        return self.tokenizer.encode_plus(text,
                                          return_tensors='pt',
                                          max_length=512,
                                          padding="max_length",
                                          truncation=True)

    def clean_text(self, text, type=""):
        """Normalise a raw WebNLG string.

        With ``type == "RDF"`` the text is prefixed with ``<S> `` and its first
        two ``|`` separators become ``<P>`` and ``<O>``. In every case double
        quotes are dropped, underscores become spaces, whitespace runs are
        collapsed and camelCase boundaries are split ("birthPlace" ->
        "birth Place").
        """
        if type == "RDF":
            text = "<S> " + text
            text = text.replace("|", "<P>", 1)
            text = text.replace("|", "<O>", 1)
        text = text.replace('"', "")
        text = text.replace("_", " ")
        text = re.sub(r"\s+", " ", text)
        # Insert a space at each lower->Upper+lower boundary. The original loop
        # re-substituted *every* match with the letters of the first match,
        # corrupting later words ("leaderName" -> "leadeh Plme").
        text = re.sub(r"(?<=[a-z])(?=[A-Z][a-z])", " ", text)
        return text

    def make_df(self, path_data, save_df=True):
        """Parse every XML file under ``path_data`` into ``self.df``.

        The frame has columns ``RDF`` (prompt + cleaned triples), ``sequence``
        (cleaned reference text) and ``RDF_seq_tokenized`` (the pair of
        encodings). When ``save_df`` is true it is written to
        ``path_data + "_dataframe.csv"``.
        """
        if self.verbose:
            print("Make DataFrame")
        t_total = time.time()
        # Sorted listings make the row order reproducible across runs and
        # machines; non-directory entries (stray files) are skipped.
        subdirs = sorted(name for name in os.listdir(path_data)
                         if os.path.isdir(os.path.join(path_data, name)))
        total_count_1 = len(subdirs)
        # Collect rows in a list and build the frame once: appending with
        # df.loc[len(df)] re-allocates the frame on every row.
        records = []
        for count_1, ktriples in enumerate(subdirs, start=1):
            path_ktriples = os.path.join(path_data, ktriples)
            t_sub = time.time()
            files = sorted(os.listdir(path_ktriples))
            total_count_2 = len(files)
            for count_2, file in enumerate(files, start=1):
                tree = ET.parse(os.path.join(path_ktriples, file))
                root = tree.getroot()
                for entry in root[0]:
                    triple_all = "Generate in English:"
                    for modifiedtripletset in entry.findall("modifiedtripleset"):
                        for mtriple in modifiedtripletset.findall("mtriple"):
                            triple_all += " " + self.clean_text(mtriple.text, "RDF")
                    # Use the tokenizer given to this instance, not a notebook global.
                    triple_all_tokenized = self._tokenize(triple_all)
                    for lex in entry.findall("lex"):
                        seq = self.clean_text(lex.text)
                        records.append({
                            "RDF": triple_all,
                            "sequence": seq,
                            "RDF_seq_tokenized": [triple_all_tokenized, self._tokenize(seq)],
                        })
                if self.verbose:
                    print("\t{}/{} - {:.1f}% - elapsed : {:.1f}s - total : {:.1f}s".format(count_1,
                                                                                         total_count_1,
                                                                                         count_2 / total_count_2 * 100,
                                                                                         time.time() - t_sub,
                                                                                         time.time() - t_total),
                          end="\r")
            if self.verbose:
                print("")
        df = pd.DataFrame(records, columns=["RDF", "sequence", "RDF_seq_tokenized"])
        if save_df:
            if self.verbose:
                print("")
                print("Saving... ",end="")
            df.to_csv(path_data + "_dataframe.csv", index=False)
            if self.verbose:
                print("DataFrame {} saved.".format(path_data + "_dataframe.csv"))
        self.df = df
        if self.verbose:
            print("\tDone. elapsed : {:.1f}s\n".format(time.time()-t_total))

    def make_dataset(self, path_data, save_dataset):
        """Build ``self.dataset`` (object array of encoding pairs) from ``self.df``.

        When ``save_dataset`` is true the array is written to
        ``path_data + "_dataset.npy"``.
        """
        t = time.time()
        if self.verbose:
            print("Make Dataset")
        pairs = self.df["RDF_seq_tokenized"].to_numpy()
        if len(pairs) and isinstance(pairs[0], str):
            # A frame read back from CSV only holds the string repr of the
            # encodings, so re-tokenize from the text columns instead.
            rdf_texts = self.df["RDF"].fillna("").astype(str)
            seq_texts = self.df["sequence"].fillna("").astype(str)
            rebuilt = np.empty(len(self.df), dtype=object)
            for i, (rdf, seq) in enumerate(zip(rdf_texts, seq_texts)):
                rebuilt[i] = [self._tokenize(rdf), self._tokenize(seq)]
            pairs = rebuilt
        self.dataset = pairs
        if save_dataset:
            if self.verbose:
                print("")
                print("Saving... ",end="")
            np.save(path_data + "_dataset.npy", self.dataset)
            if self.verbose:
                print("Dataset {} saved.".format(path_data + "_dataset.npy"))
        if self.verbose:
            print("\tDone. elapsed : {:.1f}s\n".format(time.time()-t))

    def convert_to_features(self, example):
        """Split one stored dataset entry into its (source, target) encodings.

        ``__getitem__`` called this hook but the class never defined it;
        entries are stored as ``[source_encoding, target_encoding]``.
        """
        source, targets = example
        return source, targets

    def __getitem__(self, index):
        """Return the ids and attention masks of item ``index`` as a dict of tensors."""
        source, targets = self.convert_to_features(self.dataset[index])
        return {
            "source_ids": source["input_ids"].squeeze(),
            "source_mask": source["attention_mask"].squeeze(),
            "target_ids": targets["input_ids"].squeeze(),
            "target_mask": targets["attention_mask"].squeeze(),
        }

Sources:
    https://shivanandroy.com/fine-tune-t5-transformer-with-pytorch/
    https://github.com/Shivanandroy/T5-Finetuning-PyTorch

class T5_fine_tuned_dtt(torch.nn):
    def __init__(self)
    def train(epoch, tokenizer, model, device, loader, optimizer)
    def validate(epoch, tokenizer, model, device, loader)
    def T5Trainer(dataframe, source_text, target_text, model_params, output_dir="./outputs/")

In [4]:
# Build the training set from the WebNLG v3.0 English "train" directory.
# With the default save_df/save_dataset flags, webnlg also writes
# "<path>_dataframe.csv" and "<path>_dataset.npy" next to that directory.
train_dataset = webnlg(tokenizer, "./webnlg-dataset-master-release_v3.0/en/train", verbose=True)

Make DataFrame
	1/7 - 100.0% - elapsed : 14.9s - total : 14.9s
	2/7 - 100.0% - elapsed : 1.2s - total : 16.1s
	3/7 - 100.0% - elapsed : 17.4s - total : 33.5s
	4/7 - 100.0% - elapsed : 1.4s - total : 34.9s
	5/7 - 100.0% - elapsed : 14.8s - total : 49.7s
	6/7 - 100.0% - elapsed : 26.0s - total : 75.7s
	7/7 - 100.0% - elapsed : 27.4s - total : 103.1s

Saving... DataFrame ./webnlg-dataset-master-release_v3.0/en/train_dataframe.csv saved.
	Done. elapsed : 230.6s

Make Dataset

Saving... Dataset ./webnlg-dataset-master-release_v3.0/en/train_dataset.npy saved.
	Done. elapsed : 6.4s



# WORK IN PROGRESS

In [None]:
class T5_fine_tuned_dtt(torch.nn.Module):
    """Work-in-progress wrapper around a T5 fine-tuning loop.

    Structural fixes relative to the first draft:
    * the base class is ``torch.nn.Module`` (``torch.nn`` is a module and
      cannot be subclassed);
    * ``train`` / ``validate`` / ``T5Trainer`` take no ``self``, so they are
      declared ``@staticmethod`` and called through the class;
    * ``device`` and the T5 classes are resolved inside ``T5Trainer``.

    NOTE(review): ``df``, ``console``, ``training_logger``, ``display_df`` and
    ``YourDataSetClass`` are not defined in the visible notebook cells; they
    must exist as globals before this class is used -- confirm where they are
    meant to come from.
    NOTE(review): the static ``train`` below shadows ``torch.nn.Module.train``,
    so calling ``.train()`` / ``.eval()`` on an *instance* of this class will
    not behave like a regular module -- consider renaming once the design
    settles.
    """

    def __init__(self):
        super().__init__()
        # let's define model parameters specific to T5
        model_params = {
            "MODEL": "t5-base",  # model_type: t5-base/t5-large
            "TRAIN_BATCH_SIZE": 8,  # training batch size
            "VALID_BATCH_SIZE": 8,  # validation batch size
            "TRAIN_EPOCHS": 3,  # number of training epochs
            "VAL_EPOCHS": 1,  # number of validation epochs
            "LEARNING_RATE": 1e-4,  # learning rate
            "MAX_SOURCE_TEXT_LENGTH": 512,  # max length of source text
            "MAX_TARGET_TEXT_LENGTH": 50,  # max length of target text
            "SEED": 42,  # set seed for reproducibility
        }

        # NOTE(review): `df` and the "text"/"headlines" column names are not
        # defined anywhere in the visible notebook -- confirm before running.
        T5_fine_tuned_dtt.T5Trainer(
            dataframe=df,
            source_text="text",
            target_text="headlines",
            model_params=model_params,
            output_dir="outputs",
        )

    @staticmethod
    def train(epoch, tokenizer, model, device, loader, optimizer):
        """Run one training epoch over ``loader``.

        Each batch must provide "source_ids", "source_mask" and "target_ids".
        The loss of every 10th step is added to the global ``training_logger``
        and printed through the global ``console``.
        """
        model.train()
        for step, data in enumerate(loader, 0):
            y = data["target_ids"].to(device, dtype=torch.long)
            # Decoder input is the target without its last token; labels are
            # the target shifted left by one position.
            y_ids = y[:, :-1].contiguous()
            lm_labels = y[:, 1:].clone().detach()
            # Padding positions are set to -100 (presumably so that the loss
            # ignores them -- confirm against the transformers documentation).
            lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
            ids = data["source_ids"].to(device, dtype=torch.long)
            mask = data["source_mask"].to(device, dtype=torch.long)

            outputs = model(
                input_ids=ids,
                attention_mask=mask,
                decoder_input_ids=y_ids,
                labels=lm_labels,
            )
            loss = outputs[0]

            if step % 10 == 0:
                training_logger.add_row(str(epoch), str(step), str(loss))
                console.print(training_logger)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    @staticmethod
    def validate(epoch, tokenizer, model, device, loader):
        """Generate predictions for every batch of ``loader``.

        Returns
        -------
        (predictions, actuals) : two lists of decoded strings, aligned by index.
        """
        model.eval()
        predictions = []
        actuals = []
        with torch.no_grad():
            for step, data in enumerate(loader, 0):
                y = data['target_ids'].to(device, dtype=torch.long)
                ids = data['source_ids'].to(device, dtype=torch.long)
                mask = data['source_mask'].to(device, dtype=torch.long)

                generated_ids = model.generate(
                    input_ids=ids,
                    attention_mask=mask,
                    max_length=150,
                    num_beams=2,
                    repetition_penalty=2.5,
                    length_penalty=1.0,
                    early_stopping=True
                )
                preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
                target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True) for t in y]
                if step % 10 == 0:
                    console.print(f'Completed {step}')

                predictions.extend(preds)
                actuals.extend(target)
        return predictions, actuals

    @staticmethod
    def T5Trainer(dataframe, source_text, target_text, model_params, output_dir="./outputs/"):
        """Fine-tune a T5 checkpoint on ``dataframe`` and write model, predictions and logs.

        ``dataframe`` is reduced to the ``source_text`` / ``target_text``
        columns, split 80/20 into train and validation parts, trained for
        ``model_params["TRAIN_EPOCHS"]`` epochs and evaluated for
        ``model_params["VAL_EPOCHS"]`` epochs. Outputs go under ``output_dir``.
        """
        # The notebook only imports the Auto* classes at the top, so the T5
        # classes used below are imported here.
        from transformers import T5ForConditionalGeneration, T5Tokenizer

        # `device` was never defined in the notebook: pick the GPU when present.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Set random seeds and deterministic pytorch for reproducibility
        torch.manual_seed(model_params["SEED"])  # pytorch random seed
        np.random.seed(model_params["SEED"])  # numpy random seed
        torch.backends.cudnn.deterministic = True

        # logging
        console.log(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

        # tokenizer for encoding the text
        tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

        # Load the pretrained conditional-generation model and move it to the
        # selected device.
        model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
        model = model.to(device)

        # logging
        console.log(f"[Data]: Reading data...\n")

        # Importing the raw dataset
        dataframe = dataframe[[source_text, target_text]]
        display_df(dataframe.head(2))

        # Creation of Dataset and Dataloader
        # Defining the train size. So 80% of the data will be used for training and the rest for validation.
        train_size = 0.8
        train_dataset = dataframe.sample(frac=train_size, random_state=model_params["SEED"])
        val_dataset = dataframe.drop(train_dataset.index).reset_index(drop=True)
        train_dataset = train_dataset.reset_index(drop=True)

        console.print(f"FULL Dataset: {dataframe.shape}")
        console.print(f"TRAIN Dataset: {train_dataset.shape}")
        console.print(f"TEST Dataset: {val_dataset.shape}\n")

        # Creating the Training and Validation dataset for further creation of Dataloader
        # NOTE(review): `YourDataSetClass` is a placeholder that is not defined
        # in the visible notebook -- confirm which dataset class belongs here.
        training_set = YourDataSetClass(
            train_dataset,
            tokenizer,
            model_params["MAX_SOURCE_TEXT_LENGTH"],
            model_params["MAX_TARGET_TEXT_LENGTH"],
            source_text,
            target_text,
        )
        val_set = YourDataSetClass(
            val_dataset,
            tokenizer,
            model_params["MAX_SOURCE_TEXT_LENGTH"],
            model_params["MAX_TARGET_TEXT_LENGTH"],
            source_text,
            target_text,
        )

        # Defining the parameters for creation of dataloaders
        train_params = {
            "batch_size": model_params["TRAIN_BATCH_SIZE"],
            "shuffle": True,
            "num_workers": 0,
        }

        val_params = {
            "batch_size": model_params["VALID_BATCH_SIZE"],
            "shuffle": False,
            "num_workers": 0,
        }

        # Dataloaders used by the training and validation stages below.
        training_loader = DataLoader(training_set, **train_params)
        val_loader = DataLoader(val_set, **val_params)

        # Defining the optimizer that will be used to tune the weights of the network in the training session.
        optimizer = torch.optim.Adam(
            params=model.parameters(), lr=model_params["LEARNING_RATE"]
        )

        # Training loop
        console.log(f"[Initiating Fine Tuning]...\n")

        for epoch in range(model_params["TRAIN_EPOCHS"]):
            T5_fine_tuned_dtt.train(epoch, tokenizer, model, device, training_loader, optimizer)

        console.log(f"[Saving Model]...\n")
        # Saving the model after training
        path = os.path.join(output_dir, "model_files")
        model.save_pretrained(path)
        tokenizer.save_pretrained(path)

        # evaluating test dataset
        console.log(f"[Initiating Validation]...\n")
        for epoch in range(model_params["VAL_EPOCHS"]):
            predictions, actuals = T5_fine_tuned_dtt.validate(epoch, tokenizer, model, device, val_loader)
            final_df = pd.DataFrame({"Generated Text": predictions, "Actual Text": actuals})
            final_df.to_csv(os.path.join(output_dir, "predictions.csv"))

        console.save_text(os.path.join(output_dir, "logs.txt"))

        console.log(f"[Validation Completed.]\n")
        console.print(
            f"""[Model] Model saved @ {os.path.join(output_dir, "model_files")}\n"""
        )
        console.print(
            f"""[Validation] Generation on Validation data saved @ {os.path.join(output_dir,'predictions.csv')}\n"""
        )
        console.print(f"""[Logs] Logs saved @ {os.path.join(output_dir,'logs.txt')}\n""")