# Import

In [1]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
from torch.optim import AdamW

show_statistics = False

# hyper parameters
epochs = 12
learning_rate = 2e-5
batch_size = 4

# summarize, -1-, summary

## Create Dataset and Dataloader

In [2]:
tokenizer = T5Tokenizer.from_pretrained("kiri-ai/t5-base-qa-summary-emotion")

class JsonlDataset(Dataset):
    def __init__(self, filename, is_test=False):
        self.data = [json.loads(line) for line in open(filename, 'r', encoding='utf-8')]
        self.is_test = is_test

    def __len__(self):
        return len(self.data)
    
    def getMerged(self, question, options, article):
        question = question.replace('@placeholder', '_1_')
        options_str = ', '.join(f'{options[i]}' for i in range(5))     
        return f'summarize and cloze: Summary<{question}>\nGiven Options<{options_str}>\nAnd Article<{article}>'
    
    def getLabel(self, question, options, label):
        answer = f"{options[label]}"
        return question.replace('@placeholder', answer)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        article = item['article']
        question = item['question']
        options = [item[f'option_{i}'] for i in range(5)]
        
        merged = self.getMerged(question, options, article)
        
        in_tokenize = tokenizer(
            merged,
            add_special_tokens=True,
            max_length=512,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors="pt"
        )
        
        input_ids = in_tokenize['input_ids'].flatten()
        attention_mask = in_tokenize["attention_mask"].flatten()
        
        if self.is_test:
            return {
                'article': article,
                'question': question,
                'options': options,
                'input_ids': input_ids,
                'attention_mask': attention_mask,
                'merged': merged
            }
        else:
            out_tokenize = tokenizer(
                self.getLabel(question, options, item['label']),
                add_special_tokens=True,
                max_length=150,
                return_tensors="pt",
                padding='max_length'
            )
            
            label = out_tokenize['input_ids'].flatten()
            
            return {
                'article': article,
                'question': question,
                'options': options,
                'label': label,
                'input_ids': input_ids,
                'attention_mask': attention_mask, 
                'label': label,
                'merged': merged
            }

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
task1_data_path = {
    'train_data_path': '../input/semevaldataset/training_data/Task_1_train.jsonl',
    'dev_data_path': '../input/semevaldataset/training_data/Task_1_dev.jsonl',
    'test_data_path': '../input/semevaldataset/trail_data/Task_1_Imperceptibility.jsonl'
}

task2_data_path = {
    'train_data_path': '../input/semevaldataset/training_data/Task_2_train.jsonl',
    'dev_data_path': '../input/semevaldataset/training_data/Task_2_dev.jsonl',
    'test_data_path': '../input/semevaldataset/trail_data/Task_2_Nonspecificity.jsonl'
}

# Task 1
task1_train_dataset = JsonlDataset(task1_data_path['train_data_path'])
task1_train_loader = DataLoader(task1_train_dataset, batch_size=batch_size, shuffle=True)

task1_dev_dataset = JsonlDataset(task1_data_path['dev_data_path'])
task1_dev_loader = DataLoader(task1_dev_dataset, batch_size=batch_size, shuffle=True)

# task1_test_dataset = JsonlDataset(task1_data_path['test_data_path'])
# task1_test_loader = DataLoader(task1_test_dataset, batch_size=batch_size, shuffle=True)

# Task 2
task2_train_dataset = JsonlDataset(task2_data_path['train_data_path'])
task2_train_loader = DataLoader(task2_train_dataset, batch_size=batch_size, shuffle=True)

task2_dev_dataset = JsonlDataset(task2_data_path['dev_data_path'])
task2_dev_loader = DataLoader(task2_dev_dataset, batch_size=batch_size, shuffle=True)

## Check token length distribution

In [4]:
import matplotlib.pyplot as plt

def show_token_length_statistics(dataloader, tokenizer):
    token_length = []
    str_length = []

    for i, batch in enumerate(dataloader):
        if i == 0:
            print(batch['merged'][0])
                
        for article in batch['article']:
            str_length.append(len(article))
            tokenize = tokenizer(
                article,
                add_special_tokens=True,
                max_length=4096,
                truncation=True,
                return_attention_mask=True,
                return_tensors="pt"
            )
            token_length.append(tokenize['input_ids'].shape[1])

    if show_statistics:
        plt.hist(token_length, bins=50)
        plt.xlabel('Token length')
        plt.ylabel('Frequency')
        plt.title('Token Length Distribution')
        plt.show()

        print('number of instances: ', len(dataloader.dataset))
        print('avg token length: ', sum(token_length) / len(token_length))

In [5]:
if show_statistics:
    show_token_length_statistics(task1_train_loader, tokenizer)

In [6]:
if show_statistics:
    show_token_length_statistics(task1_dev_loader, tokenizer)

In [7]:
if show_statistics:
    show_token_length_statistics(task1_test_loader, tokenizer)

In [8]:
if show_statistics:
    show_token_length_statistics(task2_train_loader, tokenizer)

In [9]:
if show_statistics:
    show_token_length_statistics(task2_dev_loader, tokenizer)

In [10]:
if show_statistics:
    show_token_length_statistics(task2_test_loader, tokenizer)

## fine tuning

In [11]:
def train(model, train_loader, dev_loader, output_path):
    best_val_loss = float('inf')
    
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    model = model.to(device)
    
    for epoch in range(epochs):
        total_loss = 0

        model.train()
        for step, batch in enumerate(train_loader):
            optimizer.zero_grad()

            # progress update after every 100 batches.
            if step % 100 == 0 and not step == 0:
                print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_loader)))

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            label = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=label)

            loss = outputs.loss
            loss.backward()

            optimizer.step()
            # scheduler.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        print(f'Training loss: {avg_train_loss}')

        model.eval()
        total_val_loss = 0
        for batch in dev_loader:
            with torch.no_grad():
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                label = batch['label'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask, labels=label)
                loss = outputs.loss
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(dev_loader)
        print(f'Validation loss: {avg_val_loss}')

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), output_path)
            print(f'{output_path} saved.')
            
#             with open('../working/best_val_loss.txt', "w") as file:
#             file.write(str(best_val_loss))
#             print('best_val_loss saved.')

In [12]:
# train on task1
model = T5ForConditionalGeneration.from_pretrained("kiri-ai/t5-base-qa-summary-emotion")
train(model, task1_train_loader, task1_dev_loader, '../working/task1_1_summary_best.bin')

# train on task2
model = T5ForConditionalGeneration.from_pretrained("kiri-ai/t5-base-qa-summary-emotion")
train(model, task2_train_loader, task2_dev_loader, '../working/task2_1_summary_best.bin')

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

  Batch   100  of    807.
  Batch   200  of    807.
  Batch   300  of    807.
  Batch   400  of    807.
  Batch   500  of    807.
  Batch   600  of    807.
  Batch   700  of    807.
  Batch   800  of    807.
Training loss: 0.17594331164083812
Validation loss: 0.06088148872589781
../working/task1_1_summary_best.bin saved.
  Batch   100  of    807.
  Batch   200  of    807.
  Batch   300  of    807.
  Batch   400  of    807.
  Batch   500  of    807.
  Batch   600  of    807.
  Batch   700  of    807.
  Batch   800  of    807.
Training loss: 0.05222276082121602
Validation loss: 0.02471574614534066
../working/task1_1_summary_best.bin saved.
  Batch   100  of    807.
  Batch   200  of    807.
  Batch   300  of    807.
  Batch   400  of    807.
  Batch   500  of    807.
  Batch   600  of    807.
  Batch   700  of    807.
  Batch   800  of    807.
Training loss: 0.030927273364624534
Validation loss: 0.018915974674746393
../working/task1_1_summary_best.bin saved.
  Batch   100  of    807.
  B

# cloze, @placehodler, i-option[i]

In [13]:
class JsonlDataset2(JsonlDataset):
    def __init__(self, filename, is_test=False):
        super().__init__(filename, is_test)

    def getMerged(self, question, options, article):
        options_str = ', '.join(f'{i}-{options[i]}' for i in range(5))     
        return f'cloze: Summary<{question}>\nGiven Options<{options_str}>\nAnd Article<{article}>'
        
    def getLabel(self, question, options, label):
        label = f"{label}-{options[label]}"
        return label

In [14]:
# Task 1
task1_train_dataset = JsonlDataset2(task1_data_path['train_data_path'])
task1_train_loader = DataLoader(task1_train_dataset, batch_size=batch_size, shuffle=True)

task1_dev_dataset = JsonlDataset2(task1_data_path['dev_data_path'])
task1_dev_loader = DataLoader(task1_dev_dataset, batch_size=batch_size, shuffle=True)

# Task 2
task2_train_dataset = JsonlDataset2(task2_data_path['train_data_path'])
task2_train_loader = DataLoader(task2_train_dataset, batch_size=batch_size, shuffle=True)

task2_dev_dataset = JsonlDataset2(task2_data_path['dev_data_path'])
task2_dev_loader = DataLoader(task2_dev_dataset, batch_size=batch_size, shuffle=True)

In [15]:
# train on task1
model = T5ForConditionalGeneration.from_pretrained("kiri-ai/t5-base-qa-summary-emotion")
train(model, task1_train_loader, task1_dev_loader, '../working/task1_cloze_option_best.bin')

# train on task2
model = T5ForConditionalGeneration.from_pretrained("kiri-ai/t5-base-qa-summary-emotion")
train(model, task2_train_loader, task2_dev_loader, '../working/task2_cloze_option_best.bin')

  Batch   100  of    807.
  Batch   200  of    807.
  Batch   300  of    807.
  Batch   400  of    807.
  Batch   500  of    807.
  Batch   600  of    807.
  Batch   700  of    807.
  Batch   800  of    807.
Training loss: 0.32514502745135565
Validation loss: 0.018206581624136086
../working/task1_cloze_option_best.bin saved.
  Batch   100  of    807.
  Batch   200  of    807.
  Batch   300  of    807.
  Batch   400  of    807.
  Batch   500  of    807.
  Batch   600  of    807.
  Batch   700  of    807.
  Batch   800  of    807.
Training loss: 0.023390606635445187
Validation loss: 0.012549221910358895
../working/task1_cloze_option_best.bin saved.
  Batch   100  of    807.
  Batch   200  of    807.
  Batch   300  of    807.
  Batch   400  of    807.
  Batch   500  of    807.
  Batch   600  of    807.
  Batch   700  of    807.
  Batch   800  of    807.
Training loss: 0.014869195189848635
Validation loss: 0.01149981389975264
../working/task1_cloze_option_best.bin saved.
  Batch   100  of 