In [1]:
import os
import json
import shutil
import pandas as pd
import numpy as np
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import torch

In [2]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from transformers import AdamW, T5ForConditionalGeneration, T5Tokenizer
from IPython.display import Markdown, display

In [4]:
torch.cuda.device_count()

1

In [5]:
torch.cuda.empty_cache()

In [6]:
pl.seed_everything(123)

Global seed set to 123


123

In [8]:
MODEL_NAME='t5-small'
EPOCHS = 1
BATCH_SIZE = 8

In [9]:
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [10]:
label_type = 'resolution-order-labels'

In [11]:
class LEGODataset(Dataset):
    def __init__(self,
        data: pd.DataFrame,
        tokenizer: T5Tokenizer,
        source_max_token_length: int=100,
        target_max_token_length: int=100,
        ):
        self.data = data
        self.tokenizer = tokenizer
        self.source_max_token_length = source_max_token_length
        self.target_max_token_length = target_max_token_length
  
    def __len__(self):
        return len(self.data)
  
    def __getitem__(self, index: int):
        row = self.data.iloc[index]
        source_encoding = self.tokenizer(
            row['input'],
            max_length=self.source_max_token_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        target_encoding = self.tokenizer(
        row[label_type],
        max_length=self.target_max_token_length,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
        )
        labels = target_encoding['input_ids']
        labels[labels==tokenizer.pad_token_id]=-100
        out = dict(
            source=row['input'],
            target=row[label_type],
            input_ids=source_encoding['input_ids'].flatten(),
            attention_mask=source_encoding['attention_mask'].flatten(),
            labels=labels.flatten(),
            len_source=len(source_encoding['input_ids'].flatten()),
            len_target=len(labels.flatten())
                   ) 
        return out

In [12]:
class LEGODataModule(pl.LightningDataModule):
    def __init__(self, 
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
        source_max_token_length: int,
        target_max_token_length: int,
        tokenizer: T5Tokenizer,
        batch_size: int=16, 
        ):
        super().__init__()
        self.train_df = train_df
        self.test_df = test_df
        self.source_max_token_length = source_max_token_length
        self.target_max_token_length = target_max_token_length
        self.tokenizer = tokenizer
        self.batch_size = batch_size
  
    def setup(self):
        self.train_dataset = LEGODataset(
            data=self.train_df, 
            tokenizer=self.tokenizer,
            source_max_token_length=self.source_max_token_length,
            target_max_token_length=self.target_max_token_length)
        self.test_dataset = LEGODataset(
            data=self.test_df, 
            tokenizer=self.tokenizer,
            source_max_token_length=self.source_max_token_length,
            target_max_token_length=self.target_max_token_length)
  
    def train_dataloader(self):
        return DataLoader(self.train_dataset,
                   batch_size=self.batch_size,
                   shuffle=True, 
                   num_workers=2)
  
    def val_dataloader(self):
        return DataLoader(self.test_dataset,
                   batch_size=1,
                   shuffle=True, 
                   num_workers=2)

In [13]:
from IPython.core.display import ProgressBar
class LEGOTask(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)

    def forward(self, input_ids, attention_mask, labels=None):
        output = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return output.loss, output.logits
  
    def training_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        loss, logits = self(input_ids, attention_mask, labels)
        self.log("Training loss", loss, prog_bar=True, logger=True)
        return loss
  
    def validation_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        loss, logits = self(input_ids, attention_mask, labels)
        self.log("Validation loss", loss, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        loss, logits = self(input_ids, attention_mask, labels)
        self.log("Test loss", loss, prog_bar=True, logger=True)
        return loss   

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=0.0002)

In [14]:
class Experimenting(object):
    def __init__(self, results_base_dir='../../lego-results', data_dict=None, source_max_token_length=100, target_max_token_length=100):
        self.results_base_dir = results_base_dir
        self.data_dict = data_dict
        self.setup_name = self.data_dict['setup_name']
        self.num_samples = len(self.data_dict['tr_paths'])
        self.source_max_token_length = source_max_token_length
        self.target_max_token_length = target_max_token_length
        self.tr_te_path_pairs = []
        self.train_dfs = []
        self.test_dfs = []
        self.checkpoints_path = None
        self.trained_models_paths = []
        
        if not os.path.isdir(self.results_base_dir):
            os.mkdir(self.results_base_dir)
        self.checkpoints_path = os.path.join(self.results_base_dir, self.setup_name + '-checkpoints')
        if os.path.isdir(self.checkpoints_path):
            shutil.rmtree(self.checkpoints_path)
            os.mkdir(self.checkpoints_path)
        else:
            os.mkdir(self.checkpoints_path)
        
        print("Experiment name: ", self.setup_name)
        print("Starting to load datasets ...")
        for i in range(self.num_samples):
            tr_path, te_path = self.data_dict['tr_paths'][i], self.data_dict['te_paths'][i]
            self.tr_te_path_pairs.append((i, tr_path, te_path))
            train_df = pd.read_csv(tr_path, encoding='utf-8', index_col=0)
            #train_df['target_text'] = train_df['target_text'].map(str)
            print("Training path: ", tr_path)
            print("Training set size: ", len(train_df))
            self.train_dfs.append(train_df)
            test_df = pd.read_csv(te_path, encoding='utf-8', index_col=0)
            #test_df['target_text'] = test_df['target_text'].map(str)
            print("Test path: ", te_path)
            print("Test set size: ", len(test_df))
            self.test_dfs.append(test_df)
            print("Done loading datasets for sample num {} ...".format(i))
        print("Done loading all datasets!")    
            
    def run_setup(self):
        for i in range(self.num_samples):
            train_df, test_df = self.train_dfs[i], self.test_dfs[i]
        
            data_module = LEGODataModule(train_df=train_df, test_df=test_df, tokenizer=tokenizer, source_max_token_length=self.source_max_token_length, target_max_token_length=self.target_max_token_length, batch_size=BATCH_SIZE)
            data_module.setup()
            
            print("A sample training data: ", data_module.train_dataset[0])
            print("A sample test data: ", data_module.test_dataset[0])
            print("A sample train data tokenized and decoded back: ")
            print("---Input size: ", len(data_module.train_dataset[0]['input_ids']))
            for j, t0 in enumerate(data_module.train_dataset[0]['input_ids']) :
                print("index: {}-token-id: {} -->token: {}".format(j+1, t0.item(), tokenizer.decode(t0, clean_up_tokenization_spaces=True))) 
            print("------------")
            print("---Label size: ", len(data_module.train_dataset[0]['labels']))
            for k, t1 in enumerate(data_module.train_dataset[0]['labels']) :
                print("index: {}-token-id: {} --> token: {}".format(k+1, t1.item(), tokenizer.decode(t1, clean_up_tokenization_spaces=True))) 
            
            final_model = LEGOTask()
            checkpoint_callback = ModelCheckpoint(
            dirpath=self.checkpoints_path,
            filename='best-checkpoint-sample-num-{}'.format(i),
            save_top_k=1,
            verbose=True,
            monitor='Validation loss',
            mode='min'
            )
            train_logs_path = os.path.join(self.checkpoints_path, "training-logs-samp-num-{}".format(i))
            logger = TensorBoardLogger(train_logs_path, name="Current task")
            trainer = pl.Trainer(
            logger=logger,
            callbacks=[checkpoint_callback],
            max_epochs=EPOCHS,
            gpus=1,
            progress_bar_refresh_rate=30
            )
            trainer.fit(final_model, data_module)
            self.trained_models_paths.append((i, os.path.join(self.checkpoints_path, 'best-checkpoint-sample-num-{}.ckpt'.format(i))))
        return self.trained_models_paths

    def get_correctness_degree(self, x, y):
        if x==y:
            return 'correct', 1.
        else:
            if len(x) < len(y):
                smaller = x
                larger = y
            else:
                smaller = y
                larger = x
            overlap = 0.
            for i in range(len(smaller)):
                if smaller[i]==larger[i]:
                    overlap+=1.
            if overlap==0.:
                return 'wrong', 0.
            else:
                return 'partial', overlap / len(larger) 
            
    def evaluate(self):
        setup_evaluation_results = []
        for i, trained_model_path in self.trained_models_paths: 
            test_df = self.test_dfs[0]

            print("Loading from best checkpoint for sample {} stored at {} ".format(i,  trained_model_path))
            trained_model = LEGOTask.load_from_checkpoint(trained_model_path)
            trained_model.freeze()
            
            val_dataset = LEGODataset(data=test_df, tokenizer=tokenizer)
            val_recs, val_targets = [], []
            for j in range(len(test_df)):
                val_recs.append(val_dataset[j]['source'])
                val_targets.append(val_dataset[j]['target'])
            print("Size of the val recs: ", len(val_recs))     
            val_recs = val_recs[:500]
            val_targets = val_targets[:500]
            val_encoding = tokenizer(val_recs, max_length=self.source_max_token_length, padding='max_length', return_attention_mask=True, add_special_tokens=True, return_tensors="pt")
            val_generated_ids = trained_model.model.generate(input_ids=val_encoding['input_ids'], attention_mask=val_encoding['attention_mask'], max_length=self.target_max_token_length)
            preds = [tokenizer.decode(gen_id, clean_up_tokenization_spaces=True, skip_special_tokens=True) for gen_id in val_generated_ids]
            #print(preds)

            correct, partially_correct, totally_wrong = 0, 0, 0
            correct_cases, partially_correct_cases, totally_wrong_cases = [], [], [] 
            for r in range(len(preds)):
                case = {'indx' : r, 'source' : val_dataset[r]['source'], 'target': val_dataset[r]['target'], 'pred': preds[r]}
                judgment, overlap_frac = self.get_correctness_degree(preds[r], val_targets[r])
                if judgment=='correct':
                    correct += 1.
                    correct_cases.append(case)
                elif judgment=='wrong':
                    totally_wrong += 1.
                    totally_wrong_cases.append(case)
                else:
                    partially_correct += 1.
                    partially_correct_cases.append(case)
        
            setup_evaluation_results.append({'setup_name' : self.setup_name, 'sample_num': i, "Accuracy" : round(float(correct) / len(preds) * 100., 1), "correct_cases" : correct_cases, "partially_correct_percentage" : round(float(partially_correct) / len(preds) * 100., 1), "partially_correct_cases" : partially_correct_cases, "totally_wrong_percentage" : round(float(totally_wrong) / len(preds) * 100., 1), "totally_wrong_cases" : totally_wrong_cases})        
            print("Accuracy for sample {}: {}%".format(i, round(float(correct) / len(preds)*100., 1)))    
            print("Finished evaluating setup {} for sample {}.".format(self.setup_name, i))
            print("---------------------------------------------------------------")
        print("Writing the results of evaluating the setup to json ...")
        output_file = open(os.path.join(self.results_base_dir, 'eval-results-for-setup-{}.json'.format(self.setup_name)), 'w', encoding='utf-8')
        for dic in setup_evaluation_results:
            json.dump(dic, output_file) 
            output_file.write('\n')
        print("Done!")     
        return setup_evaluation_results
    
    def infer(self, input_pair, path_to_model_checkpoint):
        input_df = pd.DataFrame([input_pair], columns=['input', label_type])
        trained_model = LEGOTask.load_from_checkpoint(path_to_model_checkpoint)
        trained_model.freeze()
        val_set = LEGODataset(data=input_df, tokenizer=tokenizer)
        val_rec = val_set[0]['source']
        val_encoding = tokenizer(val_rec, max_length=self.source_max_token_length, padding='max_length', return_attention_mask=True, add_special_tokens=True,return_tensors="pt")
        val_generated_ids = trained_model.model.generate(input_ids=val_encoding['input_ids'], attention_mask=val_encoding['attention_mask'], max_length=self.target_max_token_length)
        preds = [tokenizer.decode(gen_id, clean_up_tokenization_spaces=True, skip_special_tokens=True) for gen_id in val_generated_ids]
        return preds

### Model training 

In [15]:
data_dict = {'tr_paths': ['../../lego-data/lego-train.txt'],
             'te_paths': ['../../lego-data/lego-test.txt'],
             'setup_name': 'lego-default-setting'}

In [16]:
exp = Experimenting(results_base_dir='../../lego-results', data_dict=data_dict, source_max_token_length=35, target_max_token_length=25)

Experiment name:  lego-default-setting
Starting to load datasets ...
Training path:  ../../lego-data/lego-train.txt
Training set size:  6800
Test path:  ../../lego-data/lego-test.txt
Test set size:  1700
Done loading datasets for sample num 0 ...
Done loading all datasets!


In [17]:
exp.run_setup()

A sample training data:  {'source': 't=-q; r=+t; a=+p; q=+s; p=-r; s=1', 'target': 's=1;q=1;t=-1;r=-1;p=1;a=1', 'input_ids': tensor([   3,   17, 2423,   18, 1824,  117,    3,   52, 2423, 1220,   17,  117,
           3,    9, 2423, 1220,  102,  117,    3, 1824, 2423, 1220,    7,  117,
           3,  102, 2423,   18,   52,  117,    3,    7, 2423,  536,    1]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'labels': tensor([   3,    7, 2423,  536,  117, 1824, 2423,  536,  117,   17, 2423, 2292,
         117,   52, 2423, 2292,  117,  102, 2423,  536,  117,    9, 2423,  536,
           1]), 'len_source': 35, 'len_target': 25}
A sample test data:  {'source': 'n=-1; o=-n; t=+o; e=-i; i=-m; m=+t', 'target': 'n=-1;o=1;t=1;m=1;i=-1;e=1', 'input_ids': tensor([   3,   29, 2423, 2292,  117,    3,   32, 2423,   18,   29,  117,    3,
          17, 2423, 1220,   32,  117,    3,   15, 2423,   18,   23,  117, 

  f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  f"DataModule.{name} has already been called, so it will not be called again. "
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Missing logger folder: ../../lego-results/lego-default-setting-checkpoints/training-logs-samp-num-0/Current task

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
Global seed set to 123
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
Epoch 0, global step 849: Validation loss reached 0.00011 (best 0.00011), saving model to "/mnt/task_runtime/lego-results/lego-default-setting-checkpoints/best-checkpoint-sample-num-0.ckpt" as top 1


[(0,
  '../../lego-results/lego-default-setting-checkpoints/best-checkpoint-sample-num-0.ckpt')]

In [19]:
sss = exp.evaluate()

Loading from best checkpoint for sample 0 stored at ../../lego-results/lego-default-setting-checkpoints/best-checkpoint-sample-num-0.ckpt 
Size of the val recs:  1700
Accuracy for sample 0: 100.0%
Finished evaluating setup lego-default-setting for sample 0.
---------------------------------------------------------------
Writing the results of evaluating the setup to json ...
Done!


In [22]:
#!cat ../../lego-results/eval-results-for-setup-lego-default-setting.json 

In [20]:
num_cases = 20
def pprint_entry(entries, name):
    print("-"+name+":")
    if not entries:
        print("NONE")
    for i, entry in enumerate(entries):
        print("{}.".format(i+1)) 
        print('---Source:     {}'.format(entry['source']))
        print('---Target:     {}'.format(entry['target']))
        print('---Prediction: {}'.format(entry['pred'])) 
    print()
              
              
with open('../../lego-results/eval-results-for-setup-lego-default-setting.json', 'r', encoding='utf-8') as input_file:
    for line in input_file:
        dic=json.loads(line)
        printbold("Sample number: {}".format(dic['sample_num']))
        print("-Accuracy: {}%".format(dic['Accuracy']))
        print("-Partially correct percentage: {}%".format(dic['partially_correct_percentage']))
        print("-Totally wrong percentage: {}%".format(dic['totally_wrong_percentage']))
        print()
        pprint_entry(dic['correct_cases'][:num_cases], 'Correct cases')
        pprint_entry(dic['totally_wrong_cases'][:num_cases], 'Totally wrong cases')
        pprint_entry(dic['partially_correct_cases'][:num_cases], "Partially correct cases")
        print("------------------")

**Sample number: 0**

-Accuracy: 100.0%
-Partially correct percentage: 0.0%
-Totally wrong percentage: 0.0%

-Correct cases:
1.
---Source:     n=-1; o=-n; t=+o; e=-i; i=-m; m=+t
---Target:     n=-1;o=1;t=1;m=1;i=-1;e=1
---Prediction: n=-1;o=1;t=1;m=1;i=-1;e=1
2.
---Source:     l=-j; c=+y; q=+c; y=-1; j=+q; d=-l
---Target:     y=-1;c=-1;q=-1;j=-1;l=1;d=-1
---Prediction: y=-1;c=-1;q=-1;j=-1;l=1;d=-1
3.
---Source:     m=-z; p=+h; z=-1; e=-g; h=+e; g=+m
---Target:     z=-1;m=1;g=1;e=-1;h=-1;p=-1
---Prediction: z=-1;m=1;g=1;e=-1;h=-1;p=-1
4.
---Source:     s=+r; r=-g; h=+n; l=-h; g=-l; n=-1
---Target:     n=-1;h=-1;l=1;g=-1;r=1;s=1
---Prediction: n=-1;h=-1;l=1;g=-1;r=1;s=1
5.
---Source:     a=+k; b=-a; s=+l; k=+s; p=+b; l=-1
---Target:     l=-1;s=-1;k=-1;a=-1;b=1;p=1
---Prediction: l=-1;s=-1;k=-1;a=-1;b=1;p=1
6.
---Source:     f=-d; d=1; g=-f; w=-s; z=+w; s=+g
---Target:     d=1;f=-1;g=1;s=1;w=-1;z=-1
---Prediction: d=1;f=-1;g=1;s=1;w=-1;z=-1
7.
---Source:     e=+f; f=-a; d=-n; a=-1; z=+d; n=+e
---Target:     a=

In [21]:
exp.infer(('e=-q; r=+e; q=-k; i=+r; k=-h; h=-1', 'h=-1;k=1;q=-1;e=1;r=1;i=1'), '/mnt/task_runtime/lego-results/lego-default-setting-checkpoints/best-checkpoint-sample-num-0.ckpt')

['h=-1;k=1;q=-1;e=1;r=1;i=1']

In [25]:
exp.infer(('g=+e; j=-d; d=-m; m=+g; a=-t; t=-o; w=+n; c=-j; o=-q; n=-a; q=+c; e=1;', 'e=1;g=1;m=1;d=-1;j=1;c=-1;q=-1;o=1;t=-1;a=1;n=-1;w=-1'), '/mnt/task_runtime/lego-results/lego-default-setting-checkpoints/best-checkpoint-sample-num-0.ckpt')

['e=1;g=1;m=1;d=-1;j=1;c=-1']