# Multimodal RoBERTa Training

Used for all 3 methods:
* Text concatenation with [SEP] token.
* Text combination as a sentence.
* Vector concatenation.

## Run this if you are using Google Colab

In [None]:
!pip install multimodal_transformers

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Import packages and read data

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import torch
import re
import seaborn as sns
from torch.optim import AdamW
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from transformers import RobertaTokenizer, RobertaForSequenceClassification, get_linear_schedule_with_warmup, AutoConfig, EvalPrediction
from collections import defaultdict, Counter
from sklearn.preprocessing import LabelEncoder

from multimodal_transformers.model import TabularConfig
from multimodal_transformers.model import AutoModelWithTabular
from multimodal_transformers.data import load_data


# Apply seaborn's default theme to plots created after this point.
sns.set_theme()


# Torch: use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device  # bare expression so the notebook displays the selected device

device(type='cuda')

In [3]:
# train_df = pd.read_json(path_or_buf=r'./drive/MyDrive/scicite/train.jsonl', lines=True)
# test_df = pd.read_json(path_or_buf=r'./drive/MyDrive/scicite/test.jsonl', lines=True)
# val_df = pd.read_json(path_or_buf=r'./drive/MyDrive/scicite/dev.jsonl', lines=True)

# Read the train / test / dev splits (JSON Lines: one record per line) in one go.
train_df, test_df, val_df = (
    pd.read_json(path_or_buf=rf'../scicite/{split}.jsonl', lines=True)
    for split in ('train', 'test', 'dev')
)

## Preprocessing

In [4]:
def parse_section_name(raw_section_name):
    """Map a raw section heading to one of a few discrete section categories.

    Leading non-alphabetic characters (for example numbering such as "3.1 ")
    are dropped, the remainder is lower-cased and stripped, and the result is
    looked up in a fixed table of known headings.

    Args:
        raw_section_name: The heading as read from the data. Any value that is
            not a string (``None``, float ``NaN``) is treated as missing.

    Returns:
        "discussion", "introduction", "method", "results", "background" or
        "material and methods" for a known heading; "unknown" when the heading
        is missing, blank, contains no letters, or is not in the table.
    """
    # Missing values may arrive as None or as float NaN; neither can be
    # stripped, so reject every non-string up front.
    if not isinstance(raw_section_name, str) or not raw_section_name.strip():
        return "unknown"

    # Position of the first letter. A heading without any letter (e.g. "4.1")
    # cannot match a known name, so bail out instead of relying on the loop
    # index left over from a search that never succeeded.
    first_alpha = next(
        (i for i, c in enumerate(raw_section_name) if c.isalpha()), None
    )
    if first_alpha is None:
        return "unknown"
    section_name = raw_section_name[first_alpha:].lower().strip()

    # Map to discrete categories
    sn_mappings = {"discussion": {"discussion", "evaluation", "general discussion", "discussion and conclusions", "technical considerations",
                                  "discussion, limitations and conclusion",
                                 },
                   "introduction": {"introduction", "related work", "related works"},
                   "method": {"methods", "experiments", "methodology", "implementation", "experimental setup", "experimental design",
                              "implementation details", "experiment", "numerical experiments", "setup", "experimental settings", "experimental setting",
                              "experiment setup",
                             },
                   "results": {"results", "results and discussion", "conclusions", "experimental results", "conclusion", "results & discussion",
                               "findings",
                              },
                   "background": {"background", "present address:"},
                   "material and methods": {"materials and methods"}
                  }
    # The loop variable is deliberately not called `sns`, which is the
    # notebook's seaborn alias.
    for category, known_names in sn_mappings.items():
        if section_name in known_names:
            return category
    return "unknown"

# Replace the raw section heading of every split with its parsed category.
for split_df in (train_df, test_df, val_df):
    split_df["sectionName"] = split_df["sectionName"].apply(parse_section_name)

In [None]:
# Either bin label confidence or use it as it is
def _bin_label_confidence(confidence):
    """Turn a label-confidence column into string bin ids "0".."4".

    The confidence is rounded to one decimal, shifted so that 0.6 becomes bin
    0, and scaled so that each 0.1 step is one bin. Missing values get bin "1".

    Args:
        confidence: pandas Series of float confidences (NaN where missing).

    Returns:
        pandas Series of strings, each one of "0", "1", "2", "3", "4".
    """
    scaled = (confidence.round(1) - 0.6) * 10
    # Round before truncating: in floating point (0.7 - 0.6) * 10 is
    # 0.9999999999999998, which int() would turn into bin 0 instead of 1.
    # Clip so that a confidence outside the expected 0.6-1.0 range cannot
    # produce a bin id that the five-entry name table does not cover.
    bins = scaled.round().clip(lower=0, upper=4)
    return bins.apply(lambda x: "1" if pd.isnull(x) else str(int(x)))


train_df["label_confidence"] = _bin_label_confidence(train_df["label_confidence"])
test_df["label_confidence"] = _bin_label_confidence(test_df["label_confidence"])
val_df["label_confidence"] = _bin_label_confidence(val_df["label_confidence"])
# train_df["label_confidence"] = train_df["label_confidence"].fillna(0.87)  # Replace nans with mean of label confidence.

In [6]:
# Label Encoder
# The class -> integer mapping is learned from the training labels only and
# then reused unchanged for the other two splits.
label_encoder = LabelEncoder()
train_df['label'] = label_encoder.fit_transform(train_df['label'].values)
for split_df in (test_df, val_df):
    split_df['label'] = label_encoder.transform(split_df['label'].values)

In [None]:
def get_label_confidence(lc_bin):
    """Translate a label-confidence bin id into its descriptive name.

    Args:
        lc_bin: Bin id, as an int or anything ``int()`` accepts (e.g. "3").

    Returns:
        "very low", "low", "medium", "high" or "very high" for bins 0..4.

    Raises:
        IndexError: If the bin id is outside 0..4. Negative ids are rejected
            explicitly, because plain tuple indexing would silently count
            them from the end (-1 would become "very high").
    """
    mapper = ("very low", "low", "medium", "high", "very high")
    index = int(lc_bin)
    if not 0 <= index < len(mapper):
        raise IndexError(f"label confidence bin out of range: {lc_bin!r}")
    return mapper[index]

# Swap the bin ids of every split for their descriptive names.
for split_df in (train_df, test_df, val_df):
    split_df["label_confidence"] = split_df["label_confidence"].apply(get_label_confidence)

In [None]:
def get_proper_source(raw_source):
    """Convert a camelCase source tag into a plain-English phrase.

    Args:
        raw_source: The source tag as read from the data, or a missing value
            (``None`` or float ``NaN``).

    Returns:
        The spaced-out lower-case phrase for a known tag, or "unknown" for a
        missing value.

    Raises:
        KeyError: If ``raw_source`` is a tag that is not in the table.
    """
    # A missing value may be None or float NaN. NaN is the only value that is
    # not equal to itself, which avoids needing an extra import to detect it.
    if raw_source is None or (isinstance(raw_source, float) and raw_source != raw_source):
        return "unknown"
    mapper = {"explicit": "explicit",
              "properNoun": "proper noun",
              "acronym": "acronym",
              "acronymParen": "acronym paren",
              "andPhrase": "and phrase",
              "etAlPhrase": "et al phrase",
              }
    return mapper[raw_source]

# Rewrite the source tag of every split as a plain-English phrase.
for split_df in (train_df, test_df, val_df):
    split_df["source"] = split_df["source"].apply(get_proper_source)

In [None]:
# Remove citations
def remove_cites(text):
    """Strip bracketed and parenthesised spans (typically citations) from text.

    Every ``[...]`` span is removed first, then every ``(...)`` span, each
    together with any whitespace directly in front of it.

    Limitation: citations that are not fully enclosed survive in part. E.g. in
    "In the study by Hickey et al. (2012), spikes were sampled from the field
    at the..." only "(2012)" is removed and the author names remain.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with the bracketed and parenthesised spans removed.
    """
    # Raw strings: the backslash escapes belong to the regex, not to Python,
    # and writing them in a normal string literal triggers invalid-escape
    # warnings on newer interpreters.
    text = re.sub(r"\s*\[[^\[]*\]", "", text)
    text = re.sub(r"\s*\([^\(]*\)", "", text)
    return text

# train_df["string"] = train_df["string"].apply(remove_cites)
# test_df["string"] = test_df["string"].apply(remove_cites)
# val_df["string"] = val_df["string"].apply(remove_cites)

In [None]:
def get_full_prompt(row):
    """Render one record as a single natural-language sentence prompt.

    The prompt states the citation source, whether the record is a key
    citation, and the section it comes from, followed by the citation text.
    Label confidence is not part of this prompt, so that column is not read.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            "source", "sectionName", "isKeyCitation" and "string".

    Returns:
        The assembled prompt string.
    """
    source = row["source"]
    section_name = row["sectionName"]
    is_key_citation = "is" if row["isKeyCitation"] else "is not"
    text = row["string"]
    prompt = f"This text is from the source '{source}'. It {is_key_citation} a key citation and is from the section '{section_name}'. This is the text: {text}."
    return prompt


def concat_SEP(row, sep_tok="</s>"):
    """Join a record's metadata and text into one separator-delimited string.

    Fields appear in the order: source, key-citation flag, section name,
    label confidence, citation text. Neighbouring fields are separated by
    ``sep_tok`` with a single space on each side.

    Args:
        row: Mapping-like record with the keys "source", "sectionName",
            "isKeyCitation", "label_confidence" and "string".
        sep_tok: Separator placed between fields.

    Returns:
        The joined prompt string.
    """
    source, section_name = row["source"], row["sectionName"]
    key_flag = str(row["isKeyCitation"])
    confidence, text = row["label_confidence"], row["string"]

    pieces = (source, key_flag, section_name, confidence, text)
    return f" {sep_tok} ".join(f"{piece}" for piece in pieces)


# Build the separator-delimited prompt column for every split.
for split_df in (train_df, test_df, val_df):
    split_df["prompt"] = split_df.apply(concat_SEP, axis=1)

# train_df["prompt"] = train_df.apply(get_full_prompt, axis=1)
# test_df["prompt"] = test_df.apply(get_full_prompt, axis=1)
# val_df["prompt"] = val_df.apply(get_full_prompt, axis=1)

## Setup

In [None]:
# Column whose text is tokenized and passed to the transformer.
text_cols = ['prompt']
# The label col is expected to contain integers from 0 to N_classes - 1
label_col = 'label'
# Tabular columns handed to load_data as categorical / numerical features.
# NOTE(review): when the label-confidence cells earlier in this notebook have
# been run, 'label_confidence' holds strings such as "high" at this point —
# confirm it is still valid to list it as a numerical column.
categorical_cols = ['source', 'sectionName', 'isKeyCitation']
numerical_cols = ['label_confidence']
label_list = ['background', 'method', 'result'] # what each label class represents

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [8]:
def _load_split(split_df):
    """Build the multimodal dataset for one split using the shared column setup."""
    return load_data(
        split_df,
        text_cols,
        tokenizer,
        categorical_cols=categorical_cols,
        numerical_cols=numerical_cols,
        label_col=label_col,
        sep_text_token_str=tokenizer.sep_token
    )


# NOTE(review): every split goes through its own load_data call — confirm that
# the categorical / numerical feature encoding this produces is consistent
# across splits before switching combine_feat_method away from "text_only".
train_ds = _load_split(train_df)
val_ds = _load_split(val_df)
test_ds = _load_split(test_df)

batch_size = 8
# Only the training data is shuffled.
train_dataloader = torch.utils.data.DataLoader(train_ds,
                                               sampler=torch.utils.data.RandomSampler(train_ds),
                                               batch_size=batch_size)
# Evaluation loaders iterate in a fixed order: the metrics do not depend on
# batch order, and a sequential sampler keeps evaluation from drawing on the
# global RNG that also drives the training shuffle.
test_dataloader = torch.utils.data.DataLoader(test_ds,
                                              sampler=torch.utils.data.SequentialSampler(test_ds),
                                              batch_size=batch_size)
val_dataloader = torch.utils.data.DataLoader(val_ds,
                                             sampler=torch.utils.data.SequentialSampler(val_ds),
                                             batch_size=batch_size)

In [9]:
# Number of target classes; matches the three entries of label_list.
num_labels = 3
# config = AutoConfig.from_pretrained('bert-base-uncased')
config = AutoConfig.from_pretrained('roberta-base')
# Tabular feature widths are taken from the training dataset's feature arrays.
tabular_config = TabularConfig(
    num_labels=num_labels,
    cat_feat_dim=train_ds.cat_feats.shape[1],
    numerical_feat_dim=train_ds.numerical_feats.shape[1],
    # "text_only" here vs. the alternative "concat" noted at the end of the line;
    # presumably this selects whether the tabular features are combined with the
    # text representation — verify against the multimodal_transformers docs.
    combine_feat_method="text_only"  # "concat"
)
# Attach the tabular settings to the transformer config handed to the model.
config.tabular_config = tabular_config

## Train

In [None]:
def evaluate(model, val_dataloader, val_size=None):
    """Compute the mean cross-entropy loss and macro F1 of ``model`` on a loader.

    Batches are moved to the module-level ``device``. Gradients are disabled
    and the model is put in eval mode for the duration of the call; afterwards
    the model is returned to whichever mode it was in before.

    Args:
        model: Model called as ``model(**batch)`` and returning a 3-tuple
            whose second element is the logits.
        val_dataloader: Iterable of batches; each batch is a dict of tensors
            that includes a "labels" entry.
        val_size: Number the summed per-batch loss is divided by. Defaults to
            ``len(val_dataloader)``, i.e. the number of batches.

    Returns:
        Tuple ``(mean_loss, macro_f1)``.
    """
    if val_size is None:
        val_size = len(val_dataloader)

    # One loss module for the whole pass instead of a new one per batch.
    criterion = torch.nn.CrossEntropyLoss()

    # Remember the current mode so that evaluating an inference-only model
    # does not silently leave it in train mode (dropout enabled).
    was_training = model.training
    model.eval()

    val_loss = 0
    y_pred = []
    y_true = []
    try:
        with torch.no_grad():
            for batch in val_dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}
                labels = batch["labels"]

                # The loss returned by the model is ignored; an unweighted
                # cross-entropy is recomputed from the logits instead.
                _, logits, _ = model(**batch)
                val_loss += criterion(logits, labels).item()

                y_pred.append(logits.argmax(dim=-1).detach().cpu().numpy())
                y_true.append(labels.detach().cpu().numpy())
    finally:
        model.train(was_training)

    y_pred = np.concatenate(y_pred)
    y_true = np.concatenate(y_true)
    val_loss = val_loss / val_size
    val_f1 = f1_score(y_true, y_pred, average="macro")
    return val_loss, val_f1


def train(model,
          optimizer,
          train_dataloader,
          val_dataloader,
          scheduler = None,
          num_epochs = 5,
          checkpoint_path = './drive/MyDrive/multimodal_roberta_text_only_best.pt',
         ):
    """Fine-tune ``model``, checkpointing it whenever validation macro F1 improves.

    After every epoch the model is evaluated on ``val_dataloader`` and on the
    module-level ``test_dataloader``; the test numbers are only reported and
    never used for checkpoint selection. Batches are moved to the module-level
    ``device``.

    Args:
        model: Model called as ``model(**batch)`` and returning a 3-tuple
            whose second element is the logits.
        optimizer: Optimizer stepped once per batch.
        train_dataloader: Loader of training batches (dicts of tensors that
            include a "labels" entry).
        val_dataloader: Loader of validation batches.
        scheduler: Optional learning-rate scheduler stepped once per batch.
        num_epochs: Number of passes over ``train_dataloader``.
        checkpoint_path: File the whole model object is saved to each time
            the validation macro F1 reaches a new best.

    Returns:
        Dict of per-epoch histories with the keys "train_loss", "val_loss",
        "test_loss", "train_f1", "val_f1" and "test_f1".
    """
    # One loss module for the whole run instead of a new one per batch.
    criterion = torch.nn.CrossEntropyLoss()

    history = {
        "train_loss": [],
        "val_loss": [],
        "test_loss": [],
        "train_f1": [],
        "val_f1": [],
        "test_f1": [],
    }

    best_val_f1 = 0

    # Summed per-batch losses are averaged over the number of batches of the
    # loader they were computed on.
    train_size = len(train_dataloader)
    val_size = len(val_dataloader)
    test_size = len(test_dataloader)

    model.train()

    # Train loop
    for epoch in range(num_epochs):
        train_loss = 0
        y_pred = []
        y_true = []
        show_samples = True  # print predictions vs. labels for the first batch only
        for batch in tqdm(train_dataloader):
            # NOTE(review): emptying the CUDA cache on every batch is kept from
            # the original; it is likely a memory workaround — confirm it is
            # still needed, since it adds work to every step.
            torch.cuda.empty_cache()
            batch = {k: v.to(device) for k, v in batch.items()}
            labels = batch["labels"]

            # The loss returned by the model is ignored; an unweighted
            # cross-entropy is recomputed from the logits instead.
            _, logits, _ = model(**batch)
            loss = criterion(logits, labels)

            loss.backward()

            # Optimizer and scheduler step
            optimizer.step()
            if scheduler is not None:
                scheduler.step()

            optimizer.zero_grad()

            train_loss += loss.item()

            batch_pred = logits.argmax(dim=-1).detach().cpu().numpy()
            batch_true = labels.detach().cpu().numpy()
            y_pred.append(batch_pred)
            y_true.append(batch_true)
            if show_samples:
                show_samples = False
                print("Samples: ")
                print("Pred: ", batch_pred[:20])
                print("Labels: ", batch_true[:20])
                print()

        y_pred = np.concatenate(y_pred)
        y_true = np.concatenate(y_true)
        train_loss = train_loss / train_size
        train_f1 = f1_score(y_true, y_pred, average="macro")
        history["train_loss"].append(train_loss)
        history["train_f1"].append(train_f1)

        # Validation
        val_loss, val_f1 = evaluate(model, val_dataloader, val_size)
        history["val_loss"].append(val_loss)
        history["val_f1"].append(val_f1)

        # Test (reported only). Normalised by the test loader's own batch
        # count, not the validation loader's.
        test_loss, test_f1 = evaluate(model, test_dataloader, test_size)
        history["test_loss"].append(test_loss)
        history["test_f1"].append(test_f1)

        # Print summary
        print(f"Epoch {epoch}:")
        print(f"Train loss: {train_loss:.4f}, Validation loss: {val_loss:.4f}, Test loss: {test_loss:.4f}")
        print(f"Train Macro F1: {train_f1:.4f}, Validation Macro F1: {val_f1:.4f}, Test Macro F1: {test_f1:.4f}")

        # checkpoint
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            torch.save(model, checkpoint_path)
            print(f"Model saved at epoch {epoch}.")

        # Make sure the next epoch starts in train mode regardless of what
        # evaluation left behind.
        model.train()

    print('Training done!')
    return history


# model = AutoModelWithTabular.from_pretrained('bert-base-uncased', config=config).to(device)
# Load pretrained roberta-base weights with the tabular-aware config and move the model to `device`.
model = AutoModelWithTabular.from_pretrained('roberta-base', config=config).to(device)

NUM_EPOCHS = 10 #15
print("======================= Start training =================================")
# NOTE(review): lr = 1e-6 is a small step size — confirm it is intentional.
optimizer = AdamW(model.parameters(), lr = 1e-6, betas=(0.9, 0.98), eps=1e-6)
# Linear schedule with no warmup, spanning every optimizer step of the run
# (batches per epoch * number of epochs).
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(train_dataloader) * NUM_EPOCHS)
torch.cuda.empty_cache()
train(model=model,
      train_dataloader=train_dataloader,
      val_dataloader=val_dataloader,
      optimizer=optimizer,
      scheduler=scheduler,
      num_epochs=NUM_EPOCHS)