In [None]:
# Mount Google Drive when the notebook runs on Colab. Off Colab the
# `google.colab` package is not installed, so skip the mount instead of
# aborting on the very first cell.
try:
    from google.colab import drive
except ImportError:
    drive = None  # sentinel: there is no Drive to mount outside Colab
else:
    drive.mount('/content/drive')

In [None]:
import os
import sys

# Notebook-level configuration.
QPATH = "Quantlet/Domain_PreTraining"  # interpolated into the Drive path below
MODE = 'test'                          # compared against 'test' when the eval split is chosen

# True when the Colab package is already loaded in this interpreter.
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    # Work from the project folder on the mounted Drive so the notebook's
    # relative paths resolve.
    colab_project_dir = f'/content/drive/MyDrive/ColabNotebooks/IRTG/Encode_the_Qode/Encode-the-Qode/{QPATH}'
    os.chdir(colab_project_dir)

# Make modules that live in ../src importable.
sys.path.append('../src')

In [None]:
# Notebook dependencies, installed into the running kernel via %pip.
# Only `datasets` is pinned; the other packages resolve to whatever version is
# current at install time, so results may drift between runs.
#%pip install protobuf==3.20.1
%pip install transformers[torch]
%pip install -q sentencepiece
%pip install datasets==2.13.1
%pip install evaluate
%pip install rouge_score
#%pip install wandb
#%pip install git+https://github.com/huggingface/nlp.git@fix-bad-type-in-overflow-check

In [None]:
import numpy as np
import pandas as pd
import re
import os
import sys
import json
import ast
import pickle
import random

# Put the Domain_PreTraining folder on the import path so the local `abstracts`
# module resolves. Its `abstracts` object is later split on a marker string and
# regex-parsed, i.e. it is handled as one long text.
sys.path.append('../../Quantlet/Domain_PreTraining/')
from abstracts import abstracts

import importlib


#import preprocessing_utils
#importlib.reload(preprocessing_utils)

import torch
import numpy as np
import datasets

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)

import nltk
from datetime import datetime

import evaluate
# Fetch the "punkt" resource quietly; nltk.sent_tokenize is called in
# postprocess_text — presumably this is the data it relies on.
nltk.download("punkt", quiet=True)


from datasets import Dataset
from datasets import load_dataset


import evaluate
# Module-level ROUGE metric.
# NOTE(review): compute_metrics binds its own local `metric`, so this object is
# not read by any code visible in this notebook — confirm before relying on it.
metric = evaluate.load("rouge")

from sklearn.model_selection import train_test_split, KFold

# Presumably switches off Weights & Biases reporting in the trainer — confirm
# against the installed transformers version.
os.environ["WANDB_DISABLED"] = "true"
# Restrict CUDA to device 0.
os.environ["CUDA_VISIBLE_DEVICES"]="0"


# Single random seed: used for the torch/random/numpy seeding calls and passed
# to the training arguments as `seed`.
RS = 42

In [None]:
# cuBLAS workspace setting that goes with the deterministic-algorithms switch below.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ':16:8'

# Seed PyTorch, Python's `random` and NumPy with the same value, in that order.
for seed_rng in (torch.manual_seed, random.seed, np.random.seed):
    seed_rng(RS)

# Ask PyTorch to use deterministic algorithms.
torch.use_deterministic_algorithms(True)

In [None]:
# Text between the literal headings "Abstract" and "Keywords". DOTALL lets '.'
# cross line breaks; the greedy '.+' runs up to the last "Keywords" in the text.
abstract_pattern = re.compile(
    r'(?<=Abstract)(.+)(?=Keywords)',
    flags=re.DOTALL,
)

# Everything that follows the first "Keywords" heading, to the end of the text.
keywords_pattern = re.compile(
    r'(?<=Keywords)(.+)',
    flags=re.DOTALL,
)

In [None]:
# Split the corpus into per-paper chunks on the 'IRTG1792DP' marker and keep
# only the chunks from which both an abstract and a keyword list can be
# extracted. Each kept paper becomes {'keywords': ..., 'abstract': ...}.
papers = []
for paper in abstracts.split('IRTG1792DP'):
    keywords_match = keywords_pattern.search(paper)
    abstract_match = abstract_pattern.search(paper)
    if keywords_match is None or abstract_match is None:
        # Incomplete chunk (e.g. text before the first marker): skip it.
        continue

    papers.append({
        # Keep only what precedes the 'JEL Classification' section, if present.
        'keywords': keywords_match.group(1).split('JEL Classification')[0],
        'abstract': abstract_match.group(1),
    })

In [None]:
# Columns selected from the trainer.evaluate() result frames, in display order.
EVAL_COLUMNS = [
    'eval_loss',
    'eval_rouge1',
    'eval_rouge2',
    'eval_rougeL',
    'eval_rougeLsum',
    'eval_bleu',
    'eval_gen_len',
]

In [None]:
# Share of papers held out for testing (rounded down to a whole paper count).
TEST_FRACTION = 0.3

n_papers = len(papers)
# Sampling uses Python's `random` module, so the split follows its seed.
test_idx = random.sample(range(n_papers), k=int(TEST_FRACTION * n_papers))

# Everything not held out is training data. Walking the indices in order makes
# the training order explicit instead of depending on how a set iterates.
held_out = set(test_idx)
train_idx = [i for i in range(n_papers) if i not in held_out]

In [None]:
def _as_dataset_json(indices):
    """Wrap the papers at `indices` as {'version', 'data'} with description/summary records."""
    records = [
        {'description': papers[idx]['abstract'], 'summary': papers[idx]['keywords']}
        for idx in indices
    ]
    return {'version': '0.1.0', 'data': records}


train_dataset_json = _as_dataset_json(train_idx)
test_dataset_json = _as_dataset_json(test_idx)

# Persist both splits: train first, then test.
for split_path, split_json in (
    ('../../data/preprocessed/Quantlet/domain_pretrain_train.json', train_dataset_json),
    ('../../data/preprocessed/Quantlet/domain_pretrain_test.json', test_dataset_json),
):
    with open(split_path, 'w') as f:
        json.dump(split_json, f)

In [None]:
def _load_split(json_path):
    """Load the records under the file's "data" field; the result is taken from the 'train' key."""
    return load_dataset("json", data_files=json_path, field="data")['train']


train_dataset = _load_split("../../data/preprocessed/Quantlet/domain_pretrain_train.json")
test_dataset = _load_split("../../data/preprocessed/Quantlet/domain_pretrain_test.json")

In [None]:
# Token-length caps applied when texts are tokenized.
encoder_max_length = 512  # demo
decoder_max_length = 10

# Checkpoint to fine-tune; tokenizer and model are loaded from the same name.
model_name = "sshleifer/distilbart-xsum-12-3"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
def batch_tokenize_preprocess(batch, tokenizer, max_source_length, max_target_length):
    """Tokenize a batch of description/summary pairs into model inputs with labels.

    Args:
        batch: mapping with "description" (source texts) and "summary" (target texts).
        tokenizer: callable tokenizer that also exposes ``pad_token_id``.
        max_source_length: length the sources are padded/truncated to.
        max_target_length: length the targets are padded/truncated to.

    Returns:
        dict holding every field of the tokenized sources plus "labels": the
        target token ids with each pad id replaced by -100.
    """
    descriptions, summaries = batch["description"], batch["summary"]

    shared_kwargs = {"padding": "max_length", "truncation": True}
    encoded_sources = tokenizer(descriptions, max_length=max_source_length, **shared_kwargs)
    encoded_targets = tokenizer(summaries, max_length=max_target_length, **shared_kwargs)

    features = dict(encoded_sources.items())

    # Pad positions become -100 so they are ignored in the loss.
    pad_id = tokenizer.pad_token_id
    features["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in target_ids]
        for target_ids in encoded_targets["input_ids"]
    ]
    return features


def _tokenize_batch(batch):
    """Run batch_tokenize_preprocess with the notebook's tokenizer and length caps."""
    return batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    )


# Tokenize both splits in batches; the raw columns are dropped so only the
# tokenized features remain.
train_data = train_dataset.map(
    _tokenize_batch,
    batched=True,
    remove_columns=train_dataset.column_names,
)
test_data = test_dataset.map(
    _tokenize_batch,
    batched=True,
    remove_columns=test_dataset.column_names,
)

In [None]:
def postprocess_text(preds, labels):
    """Strip each text and re-join its sentences with newlines.

    Args:
        preds: list of decoded prediction strings.
        labels: list of decoded reference strings.

    Returns:
        (preds, labels): two new lists, each text stripped and laid out with
        one sentence per line.
    """
    def _one_sentence_per_line(texts):
        # rougeLSum expects newline after each sentence
        stripped = [text.strip() for text in texts]
        return ["\n".join(nltk.sent_tokenize(text)) for text in stripped]

    return _one_sentence_per_line(preds), _one_sentence_per_line(labels)

In [None]:
# Metric modules already loaded by compute_metrics, keyed by metric name, so
# each one is loaded once instead of on every evaluation pass.
_LOADED_METRICS = {}


def compute_metrics(eval_preds, metrics_list=['rouge', 'bleu']):
    """Decode predictions/labels and score them with the requested metrics.

    Relies on the module-level ``tokenizer`` and on ``postprocess_text``.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays; when
            ``predictions`` is a tuple its first element is used.
        metrics_list: metric names to compute; 'rouge' and 'bleu' are supported.
            The default list is never mutated.

    Returns:
        dict of metric values rounded to 4 decimals (BLEU's per-order
        'precisions' list is dropped) plus 'gen_len', the mean number of
        non-pad tokens per prediction.

    Raises:
        ValueError: if ``metrics_list`` contains an unsupported metric name.
    """
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    # -100 cannot be decoded. Labels use it for ignored positions, and gathered
    # predictions may be padded with it as well, so map it back to the pad id
    # on both sides before decoding.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # POST PROCESSING
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Independent of the metric, so computed once rather than per iteration.
    gen_len = np.mean([np.count_nonzero(pred != pad_id) for pred in preds])

    results_dict = {}
    for m in metrics_list:
        if m == 'bleu':
            extra_kwargs = {}
        elif m == 'rouge':
            extra_kwargs = {'use_stemmer': True}
        else:
            # Previously an unknown name silently reused the prior iteration's
            # result (or hit an unbound variable on the first iteration).
            raise ValueError(
                f"Unsupported metric {m!r}; expected 'rouge' or 'bleu'."
            )

        if m not in _LOADED_METRICS:
            _LOADED_METRICS[m] = evaluate.load(m)

        result = _LOADED_METRICS[m].compute(
            predictions=decoded_preds, references=decoded_labels, **extra_kwargs
        )
        # 'precisions' is a list and cannot be rounded like the scalar entries.
        result = {key: value for key, value in result.items() if key != 'precisions'}
        result["gen_len"] = gen_len
        results_dict.update({k: round(v, 4) for k, v in result.items()})
    return results_dict

In [None]:
# Choose the split the trainer evaluates on. Only 'test' is wired up; stop here
# with a clear message instead of a NameError when `eval_data` is first used.
if MODE == 'test':
    eval_data = test_data
else:
    raise ValueError(f"Unsupported MODE {MODE!r}: only 'test' is implemented.")

In [None]:
training_args = Seq2SeqTrainingArguments(
    # --- outputs, logging, reproducibility ---
    output_dir="results",
    logging_dir="logs",
    logging_steps=100,
    # NOTE(review): depending on the transformers version, None may fall back to
    # the library default rather than disabling reporting — confirm.
    report_to=None,
    seed=RS,
    # --- what to run and how much ---
    do_train=True,
    do_eval=True,
    num_train_epochs=5,  # demo
    per_device_train_batch_size=16,  # demo
    per_device_eval_batch_size=16,
    # --- optimisation ---
    # learning_rate=3e-05,
    warmup_steps=500,
    weight_decay=0.1,
    label_smoothing_factor=0.1,
    # --- evaluation and checkpointing: both once per epoch ---
    predict_with_generate=True,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
    load_best_model_at_end=True,
)

# Collator built from the same tokenizer and model the trainer uses.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Baseline: evaluate before trainer.train() has been called.
results_zero_shot = trainer.evaluate()

# One-row frame restricted to the reported columns, rounded to 3 decimals.
results_zero_shot_df = pd.DataFrame(data=results_zero_shot, index=[0])[EVAL_COLUMNS]
rounded_zero_shot = results_zero_shot_df.loc[0, :].apply(lambda value: round(value, 3))
results_zero_shot_df.loc[0, :] = rounded_zero_shot

display(results_zero_shot_df)

In [None]:
# Run fine-tuning with the datasets and arguments the trainer was built with.
trainer.train()

In [None]:
# Evaluate again now that training has run.
results_fine_tune = trainer.evaluate()

# One-row frame restricted to the reported columns, rounded to 3 decimals.
results_fine_tune_df = pd.DataFrame(data=results_fine_tune, index=[0])[EVAL_COLUMNS]
rounded_fine_tune = results_fine_tune_df.loc[0, :].apply(lambda value: round(value, 3))
results_fine_tune_df.loc[0, :] = rounded_fine_tune

display(results_fine_tune_df)

In [None]:
# Checkpoint path recorded in the trainer state as the best model.
best_ckpt_path = trainer.state.best_model_checkpoint

In [None]:
# Bare expression so the notebook displays the checkpoint path.
best_ckpt_path

In [None]:
def generate_summary(test_samples, model):
    """Generate summaries for the "description" texts of ``test_samples``.

    Uses the module-level ``tokenizer`` and ``encoder_max_length``; the encoded
    inputs are moved to ``model.device`` before generation.

    Args:
        test_samples: object whose "description" entry holds the source texts.
        model: model exposing ``device`` and ``generate``.

    Returns:
        (outputs, output_str): what ``model.generate`` returned, and the same
        sequences decoded with special tokens skipped.
    """
    encoded = tokenizer(
        test_samples["description"],
        return_tensors="pt",
        max_length=encoder_max_length,
        truncation=True,
        padding="max_length",
    )

    token_ids = encoded.input_ids.to(model.device)
    mask = encoded.attention_mask.to(model.device)

    outputs = model.generate(token_ids, attention_mask=mask)
    return outputs, tokenizer.batch_decode(outputs, skip_special_tokens=True)


# Fresh copy of the pretrained checkpoint, used as the "before tuning" baseline.
model_before_tuning = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Compare on at most this many test papers. The count is clamped to the size
# of the test split so `select` is never asked for out-of-range indices.
N_COMPARISON_SAMPLES = 20
test_samples = test_dataset.select(
    range(min(N_COMPARISON_SAMPLES, len(test_dataset)))
)

# generate_summary returns (token ids, decoded strings); keep the strings.
summaries_before_tuning = generate_summary(test_samples, model_before_tuning)[1]
summaries_after_tuning = generate_summary(test_samples, model)[1]

In [None]:
# Print each reference summary next to the model output before and after tuning.
separator = '_'*10
reference_summaries = test_samples["summary"]
for idx in range(len(reference_summaries)):
    print(separator)
    print(f'Original: {reference_summaries[idx]}')
    print(f'Summary before Tuning: {summaries_before_tuning[idx]}')
    print(f'Summary after Tuning: {summaries_after_tuning[idx]}')
    print(separator)
    print('\n')