In [1]:
!pip install -r requirements.txt

Collecting transformers
  Using cached transformers-4.20.1-py3-none-any.whl (4.4 MB)
Collecting pytorch-lightning==1.2.7
  Using cached pytorch_lightning-1.2.7-py3-none-any.whl (830 kB)
Collecting seaborn
  Using cached seaborn-0.11.2-py3-none-any.whl (292 kB)
Collecting wandb
  Using cached wandb-0.12.21-py2.py3-none-any.whl (1.8 MB)
Collecting sklearn
  Using cached sklearn-0.0.tar.gz (1.1 kB)
Collecting pandas
  Using cached pandas-1.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.6 MB)
[31mERROR: Could not find a version that satisfies the requirement json (from versions: none)[0m
[31mERROR: No matching distribution found for json[0m
You should consider upgrading via the '/home/users1/musenips/master_venv/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
import json
import pandas as pd
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.loggers import WandbLogger
from sklearn.model_selection import train_test_split
import textwrap

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer
)
from tqdm.auto import tqdm

In [3]:
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

%matplotlib inline
%config InlineBackend.figure_format='retina'
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
rcParams['figure.figsize'] = 16, 10

In [4]:
pl.seed_everything(42)

Global seed set to 42


42

Commands for preparing data

In [5]:
# !git clone https://gitlab.com/shimorina/webnlg-dataset.git

In [6]:
# !mv 'webnlg-dataset/release_v3.0/en' 'webnlg_data'

In [7]:
# !rm -r --interactive=never 'webnlg-dataset'

Commands for removing data

In [8]:
# !rm -r --interactive=never 'webnlg_data'

In [9]:
# !rm web*\.csv

In [10]:
import glob
import os
import re
import xml.etree.ElementTree as ET
import pandas as pd

train_path = "train/**/*"
dev_path = "dev/**/*"
test_path = "test/rdf-to-text-generation-test-data-with-refs-en"
sets = [train_path, dev_path, test_path]

for s in sets:
    files = glob.glob(os.getcwd() + "/webnlg_data/" + s + ".xml", recursive=True)    
    inputs = list()
    outputs = list()
    for file in files:
        tree = ET.parse(file)
        root = tree.getroot()    
        for sub_root in root:
            for entry in sub_root:
                for element in entry:
                    if "modifiedtripleset" in str(element):
                        inp = " && ".join([triple.text for triple in element])
                        # inp = " && ".join(["|".join([e.strip("\"") for e in triple.text.split("|")]) for triple in element])
                    if "lex" in str(element):
                        out = element.text
                        inputs.append(inp)
                        outputs.append(out)

    mdata_dct={"input_text":[], "target_text":[]}
    for i, _ in enumerate(inputs):
        mdata_dct['input_text'].append(inputs[i])
        mdata_dct['target_text'].append(outputs[i])


    df=pd.DataFrame(mdata_dct)
    df.to_csv('webNLG2020_' + s.split("/")[0] + '.csv', index=False)

In [11]:
TRAIN_PATH = 'webNLG2020_train.csv'
DEV_PATH = 'webNLG2020_dev.csv'
TEST_PATH = 'webNLG2020_test.csv'

In [12]:
train_df = pd.read_csv(TRAIN_PATH)
dev_df = pd.read_csv(DEV_PATH)
test_df = pd.read_csv(TEST_PATH)

In [13]:
full_data = pd.concat([train_df, dev_df, test_df])
full_data.head()

Unnamed: 0,input_text,target_text
0,AmeriGas | country | United_States && AmeriGas...,AmeriGas was founded in the United States on 1...
1,AmeriGas | country | United_States && AmeriGas...,"AmeriGas, founded on 01-01-1959, works in the ..."
2,AmeriGas | country | United_States && AmeriGas...,"AmeriGas, whose home country is the United Sta..."
3,AmeriGas | country | United_States && AmeriGas...,AmeriGas is based in the United States and emp...
4,AmeriGas | country | United_States && AmeriGas...,"AmeriGas, whose current employment is 8,500, h..."


In [14]:
print(train_df.shape, dev_df.shape, test_df.shape)

(35426, 2) (4464, 2) (5150, 2)


In [15]:
MODEL_NAME = "t5-small"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

In [16]:
rdf_lens = list()
summary_lens = list()
for idx, row in full_data.iterrows():
    rdf_encoding = tokenizer(row["input_text"])
    rdf_lens.append(len(rdf_encoding.tokens()))
    summary_encoding = tokenizer(row["target_text"])
    summary_lens.append(len(summary_encoding.tokens()))

In [17]:
# print(max(rdf_lens),max(summary_lens))

In [18]:
!wandb login '47616056d2ebbf7dea86db60a5fc58145cd234fc'  ## andere Lösung finden

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/users1/musenips/.netrc


In [19]:
class WebNLGDataset(Dataset):
    
    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: T5Tokenizer,
        text_max_token_len: int = 263,
        summary_max_token_len: int = 147
    ):
        
        self.tokenizer = tokenizer
        self.data = data
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index: int):
        data_row = self.data.iloc[index]
        
        text = data_row['input_text']
        
        text_encoding = tokenizer(
            text,
            max_length=self.text_max_token_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )
        
        summary_encoding = tokenizer(
            data_row["target_text"],
            max_length=self.text_max_token_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )
        
        labels = summary_encoding["input_ids"]
        labels[labels == 0] = -100
        
        return dict(
            text=text,
            summary=data_row["target_text"],
            text_input_ids=text_encoding["input_ids"].flatten(),
            text_attention_mask=text_encoding["attention_mask"].flatten(),
            labels=labels.flatten(),
            labels_attention_mask=summary_encoding["attention_mask"].flatten()
        )

In [20]:
class WebNLGDataModule(pl.LightningDataModule):
    
    def __init__(
        self,
        train_df: pd.DataFrame,
        dev_df: pd.DataFrame,
        test_df: pd.DataFrame,
        tokenizer: T5Tokenizer,
        batch_size: int = 8,
        text_max_token_len: int = 263,
        summary_max_token_len: int = 147
    ):
        
        super().__init__()
        
        self.train_df = train_df
        self.dev_df = dev_df
        self.test_df = test_df
        
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len
        
    def setup(self, stage=None):
        self.train_dataset = WebNLGDataset(
            self.train_df,
            self.tokenizer,
            self.text_max_token_len,
            self.summary_max_token_len
        )
        self.test_dataset = WebNLGDataset(
            self.test_df,
            self.tokenizer,
            self.text_max_token_len,
            self.summary_max_token_len
        )
        self.dev_dataset = WebNLGDataset(
            self.dev_df,
            self.tokenizer,
            self.text_max_token_len,
            self.summary_max_token_len
        )
        
    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=56
        )
        
    def val_dataloader(self):
        return DataLoader(
            self.dev_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=56
        )

    def test_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=56
        )

In [21]:
N_EPOCHS = 2
BATCH_SIZE = 8


data_module = WebNLGDataModule(train_df, test_df, dev_df, tokenizer, batch_size=BATCH_SIZE)

In [22]:
class WebNLGModel(pl.LightningModule):
    
    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)
        
    def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
        
        output = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
            decoder_attention_mask=decoder_attention_mask
        )
        
        return output.loss, output.logits
    
    def training_step(self, batch, batch_idx):
        input_ids = batch["text_input_ids"]
        attention_mask = batch["text_attention_mask"]
        labels = batch["labels"]
        labels_attention_mask = batch["labels_attention_mask"]
        
        loss, outputs = self(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=labels_attention_mask,
            labels=labels
        )
        
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        input_ids = batch["text_input_ids"]
        attention_mask = batch["text_attention_mask"]
        labels = batch["labels"]
        labels_attention_mask = batch["labels_attention_mask"]
        
        loss, outputs = self(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=labels_attention_mask,
            labels=labels
        )
        
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss
    
    def test_step(self, batch, batch_idx):
        input_ids = batch["text_input_ids"]
        attention_mask = batch["text_attention_mask"]
        labels = batch["labels"]
        labels_attention_mask = batch["labels_attention_mask"]
        
        loss, outputs = self(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=labels_attention_mask,
            labels=labels
        )
        
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss
    
    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=0.0001)         ####

In [23]:
model = WebNLGModel()

In [24]:
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min"
)

logger = WandbLogger('WebNLG')

trainer = pl.Trainer(
    logger=logger,
    checkpoint_callback=checkpoint_callback,
    max_epochs=N_EPOCHS,
    gpus=1,                                     ###############
    progress_bar_refresh_rate=30
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [25]:
trainer.fit(model, data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
[34m[1mwandb[0m: Currently logged in as: [33mpavl_os[0m. Use [1m`wandb login --relogin`[0m to force relogin



  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 258.00 MiB (GPU 0; 11.93 GiB total capacity; 2.14 GiB already allocated; 123.56 MiB free; 2.40 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# "model.h5" is saved in wandb.run.dir & will be uploaded at the end of training
model.save(os.path.join(wandb.run.dir, "model.h5"))

In [None]:
wandb.finish()

In [None]:
trained_model = WebNLGModel.load_from_checkpoint(
   trainer.checkpoint_callback.best_model_path
)

trained_model.freeze()

In [None]:
trainer.checkpoint_callback.best_model_path

In [None]:
def summarize(text):
    text_encoding = tokenizer(
        text,
        max_length=263,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt"
    )
    
    generated_ids = trained_model.model.generate(
        input_ids=text_encoding["input_ids"],
        attention_mask=text_encoding["attention_mask"],
        max_length=147,
        num_beams=2,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True
    )
        
    preds = [
        tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for gen_id in generated_ids
    ]
        
    return "".join(preds)

In [None]:
sample_row = test_df.iloc[0]
text = sample_row["input_text"]
model_summary = summarize(text)

In [None]:
text

In [None]:
sample_row["target_text"]

In [None]:
model_summary

In [None]:
sample_row = test_df.iloc[2]
text = sample_row["input_text"]
model_summary = summarize(text)

In [None]:
text

In [None]:
sample_row["target_text"]

In [None]:
model_summary