In [None]:
# Requirements
# Comment if already satisfied
#!pip install -U jupyter
#!pip install datasets transformers accelerate torch torchinfo xlrd seaborn scikit-learn torchmetrics
#!pip install --upgrade --quiet jupyter_client ipywidgets
#!jupyter nbextension enable --py widgetsnbextension

In [None]:
import os
import random

import numpy as np
import pandas as pd
import torch

# Filesystem layout: DATA_DIR holds the input files, LM_DIR is read later as
# the source checkpoint, OUT_DIR receives every artefact this notebook writes.
DATA_DIR = "../data/"
LM_DIR = "./out/language_model"
OUT_DIR = "./out/classification_model"

os.makedirs(OUT_DIR, exist_ok=True)

# Seed every random number generator the notebook relies on (Python, NumPy,
# torch) so that a "Restart & Run All" reproduces the same shuffling and the
# same synonym augmentation. SEED is also passed explicitly to the pandas
# sampling calls further down.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
import matplotlib.pyplot as plt
def plot_training_history(train_history, val_history, model_name, phase_name, out_dir):
    """Plot train/validation loss and accuracy curves and save them as a PNG.

    Args:
        train_history: mapping with "loss" and "accuracy" sequences (one value per epoch).
        val_history: same structure as `train_history`, for the validation split.
        model_name: used in the figure title and in the output file name.
        phase_name: used in the figure title and in the output file name.
        out_dir: directory the PNG is written to.

    Returns:
        The matplotlib figure holding both panels.
    """
    fig, (loss_ax, acc_ax) = plt.subplots(figsize=(12, 8), nrows=2)
    fig.suptitle(f"{model_name} - {phase_name} History")

    # One panel per metric; each panel gets its own legend (the accuracy panel
    # previously had none because the loss axis was given the legend twice).
    panels = (
        (loss_ax, "Cross Entropy Loss", "loss"),
        (acc_ax, "Classification accuracy", "accuracy"),
    )
    for ax, title, key in panels:
        ax.set_title(title)
        ax.plot(train_history[key], label="train")
        ax.plot(val_history[key], label="val")
        ax.legend(loc="best")

    plt.tight_layout()

    fig.savefig(f"{out_dir}/{model_name}_{phase_name}_history.png", dpi=150, bbox_inches='tight')

    return fig

# Load Dataset

In [None]:
# Load the annotated ADUs from DATA_DIR. The Excel variant is kept commented out;
# the CSV export is the one actually read.
#articles = pd.read_excel(os.path.join(DATA_DIR, "OpArticles_ADUs.xlsx"))
adus = pd.read_csv(os.path.join(DATA_DIR, "OpArticles_ADUs.csv")) # If the installed pandas cannot read .xlsx, use this instead

adus.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Relative frequency of every label over the whole corpus.
print("Label Ratios")
print(adus["label"].value_counts().div(len(adus["label"])))

# The same information as a bar chart, also written to OUT_DIR.
fig, ax = plt.subplots()
sns.countplot(data=adus, x="label", ax=ax)
ax.set(title="Distribution of labels on ADUs", xlabel="Label", ylabel="Count")
fig.savefig(f"{OUT_DIR}/label_distribution_full.png", dpi=150, bbox_inches='tight')
plt.show()
plt.clf()

Encode Labels

In [None]:
from sklearn.preprocessing import LabelEncoder

# Canonical label set of the task; N_CLASSES sizes the classifier head and the
# HF ClassLabel feature defined further down.
classes = ["Fact", "Policy", "Value", "Value(+)", "Value(-)"]
N_CLASSES = len(classes)

# Fit on the canonical list rather than on the data so that `le.classes_`
# always has exactly N_CLASSES entries; an unexpected label in the data then
# raises here instead of silently shifting the label ids.
le = LabelEncoder()
le.fit(classes)

# Encode only once: re-running the cell on the already-encoded integer column
# would otherwise fail on (or re-fit to) the integers and lose the names.
if not pd.api.types.is_integer_dtype(adus["label"]):
    adus["label"] = le.transform(adus["label"])

adus.head()

# Create Train, Validation and Test splits

In [None]:
# Maintain the ratios of labels across splits
# Per label, 80% of the rows go to train+validation; the rows whose index was
# not sampled form the test split.
train_val_adus = adus.groupby("label").sample(frac=0.8, random_state=SEED)
test_adus = adus.loc[adus.index.difference(train_val_adus.index)]

# Same procedure again inside train+validation: 80% train, remainder validation.
train_adus = train_val_adus.groupby("label").sample(frac=0.8, random_state=SEED)
val_adus = train_val_adus.loc[train_val_adus.index.difference(train_adus.index)]

# One count plot per split (labels decoded back to their names) so the
# distributions can be compared side by side; the ratios are printed as well.
fig, (train_ax, val_ax, test_ax) = plt.subplots(figsize=(16, 4), ncols=3)
for split, df, ax in zip(["Train", "Validation", "Test"], [train_adus, val_adus, test_adus], [train_ax, val_ax, test_ax]):
    df_labels = le.inverse_transform(df["label"])
    sns.countplot(x=df_labels, ax=ax, order=["Value", "Fact", "Value(-)", "Value(+)", "Policy"])
    ax.set_title(f"Distribution of labels on {split} ADUs")
    ax.set_xlabel("Label")
    ax.set_ylabel("Count")
    
    print(f"{split} Label Ratios")
    print(pd.Series(df_labels).value_counts() / len(df_labels))
fig.savefig(f"{OUT_DIR}/label_distribution_splits.png", dpi=150, bbox_inches='tight')
plt.show()
plt.clf()

In [None]:
# Turn the row labels inherited from `adus` into a regular "index" column so
# every example keeps a reference to its original row after the splits.
train_adus = train_adus.reset_index()
val_adus = val_adus.reset_index()
test_adus = test_adus.reset_index()

train_adus

# Convert Dataset to HF Dataset

In [None]:
from datasets.dataset_dict import DatasetDict
from datasets import Dataset, DatasetInfo, Features, Value, ClassLabel
# Explicit schema for the HF datasets: "label" becomes a ClassLabel whose names
# come from the fitted label encoder, every other column keeps a plain type.
info = DatasetInfo(
    features=Features({
        "index": Value("int64"),
        "tokens": Value("string"),
        "label": ClassLabel(num_classes=N_CLASSES, names=le.classes_.tolist()),
        "article_id": Value("string"),
        "annotator": Value("string"),
        "node": Value("int64"),
        "ranges": Value("string"),
    })
)

# preserve_index=False: the original row ids already live in the "index"
# column, and forcing the pandas index to be stored would add an extra
# "__index_level_0__" column that is not part of the schema declared above.
adus_dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame, preserve_index=False, info=info)
    for split_name, frame in (
        ("train", train_adus),
        ("val", val_adus),
        ("test", test_adus),
    )
})

print(adus_dataset)

In [None]:
# Display the feature schema attached to the train split.
adus_dataset["train"].features

Remove unnecessary columns

In [None]:
# Drop the annotation metadata columns from every split.
# NOTE(review): the name is rebound, so a second run of this cell presumably
# fails because the columns are already gone — confirm before re-running.
adus_dataset = adus_dataset.remove_columns(["annotator", "node", "ranges"])
print(adus_dataset)

Base Tokenizer

In [None]:
from transformers import AutoTokenizer

# Tokenizer of the base checkpoint.
# NOTE(review): the checkpoint name says "cased" yet do_lower_case=True is
# requested — confirm this matches how the language model under LM_DIR was
# trained before changing it.
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-large-portuguese-cased', do_lower_case=True)

In [None]:
def tokenize(batch):
    """Run the notebook-level `tokenizer` over the "tokens" entry of `batch`.

    Args:
        batch: mapping whose "tokens" value is handed to the tokenizer as-is.

    Returns:
        Whatever the tokenizer call returns, unchanged.
    """
    return tokenizer(batch["tokens"])

In [None]:
# Tokenize every split in batches. "article_id" and the raw "tokens" text are
# removed; the remaining columns are kept next to whatever `tokenize` returns.
cls_dataset = adus_dataset.map(
    tokenize, batched=True, remove_columns=["article_id", "tokens"]
)

print(cls_dataset)

# Define Dataloaders

In [None]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

# Collator built from the tokenizer, configured to return PyTorch tensors; it is
# used as `collate_fn` by every loader below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

BATCH_SIZE = 64 # Tested with NVIDIA Tesla T4 16GB
# Training batches are reshuffled on every pass.
train_dataloader = DataLoader(
    dataset=cls_dataset["train"],
    shuffle=True,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator
)

# Validation keeps the dataset order.
val_dataloader = DataLoader(
    dataset=cls_dataset["val"],
    shuffle=False,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator
)

# The test split is served one example at a time, in dataset order.
test_dataloader = DataLoader(
    dataset=cls_dataset["test"],
    shuffle=False,
    batch_size=1,
    collate_fn=data_collator
)

# Model Definition

In [None]:
from transformers import AutoModelForSequenceClassification
from torchinfo import summary

# Load the checkpoint stored under LM_DIR/best as a sequence-classification
# model configured for N_CLASSES labels, then print its layer summary.
model = AutoModelForSequenceClassification.from_pretrained(f"{LM_DIR}/best", num_labels=N_CLASSES)

summary(model)

# Define Optimizer

In [None]:
from torch import optim

# Adam (AMSGrad variant) over all model parameters for the first training
# phase, with learning rate 5e-3 and weight decay 1e-4.
optimizer = optim.Adam(
    params=model.parameters(),
    lr=5e-3,
    betas=(0.9, 0.999),
    weight_decay=1e-4,
    amsgrad=True
)

# Prepare Classification Model

In [None]:
from accelerate import Accelerator

# Hand model, optimizer and the train/validation loaders to Accelerate; the
# returned (wrapped) objects replace the originals. The test loader is not
# prepared here.
accelerator = Accelerator()

model, optimizer, train_dataloader, val_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, val_dataloader
)

# Define Epoch Iteration

In [None]:
import json
# Load the word -> [synonyms] mapping stored next to the dataset. The path is
# joined with os.path.join because DATA_DIR already ends with a separator.
with open(os.path.join(DATA_DIR, "synonyms.json"), "r", encoding="utf-8") as synonyms_json:
    synonyms = json.load(synonyms_json)

# Token-level view of the mapping: token id -> candidate replacement token ids.
synonyms_tokenized = {}
for word, word_synonyms in synonyms.items():
    word_ids = tokenizer.encode(word, add_special_tokens=False)

    # Don't allow multi token for simplicity; also skip words that encode to
    # nothing, which would otherwise break the single-token lookup below.
    if len(word_ids) != 1:
        continue

    # Keep only the synonyms that are themselves a single token, so that a
    # replacement never changes the sequence length.
    candidate_ids = [tokenizer.encode(s, add_special_tokens=False) for s in word_synonyms]
    single_token_ids = [ids[0] for ids in candidate_ids if len(ids) == 1]

    if single_token_ids:
        synonyms_tokenized[word_ids[0]] = single_token_ids

    
def add_synonyms(input_ids, p = 0.05):
    """Randomly replace tokens of `input_ids` by a synonym token, in place.

    Args:
        input_ids: indexable sequence of token ids whose elements expose `.item()`;
            it is modified in place.
        p: probability with which a token that has an entry in the notebook-level
            `synonyms_tokenized` table is replaced.

    Returns:
        None.
    """
    for position in range(len(input_ids)):
        token = input_ids[position].item()
        if token not in synonyms_tokenized:
            continue
        # The coin is only flipped for tokens that actually have synonyms.
        if np.random.binomial(1, p):
            input_ids[position] = np.random.choice(synonyms_tokenized[token])

In [None]:
import torch
import torch.nn.functional as F
from tqdm import tqdm
import numpy as np

def epoch_iter(dataloader, model, is_train = True, optimizer=None, lr_scheduler=None):
    """Run one pass over `dataloader`, updating the model when `is_train` is set.

    Relies on the notebook-level `accelerator` (for the backward pass) and on
    `add_synonyms` (training-time augmentation).

    Args:
        dataloader: sized iterable of batches. Each batch must support
            `.pop("index")` and provide "input_ids" and "labels" entries.
        model: called as `model(**batch)`; its output must expose `.loss` and
            `.logits`.
        is_train: when True the model is put in train mode, the inputs are
            augmented and the optimizer is stepped after every batch; when False
            the model is put in eval mode and gradients are disabled.
        optimizer: required when `is_train` is True.
        lr_scheduler: optional; stepped once per call, after the last batch, and
            only when training.

    Returns:
        A pair `((expected_labels, preds, probs, aduIds), mean_loss)` where the
        four lists are accumulated over all batches and `mean_loss` is the sum
        of the batch losses divided by the number of batches.

    Raises:
        AssertionError: if `is_train` is True and no optimizer is given.
    """
    if is_train:
        assert optimizer is not None, "When training, please provide an optimizer."
      
    num_batches = len(dataloader)

    if is_train:
        model.train()
    else:
        model.eval()

    total_loss = 0.0
    probs = []
    preds = []
    expected_labels = []
    aduIds = []
    
    # A fresh gradient scaler is created on every call (training or not).
    scaler = torch.cuda.amp.GradScaler()

    with torch.set_grad_enabled(is_train):
        for batch in tqdm(dataloader):
            # The example ids are not a model input: take them out of the batch
            # before the forward pass and keep them for the returned lists.
            index = batch.pop("index")

            if is_train:
                # Augmentation: each sequence of the batch is modified in place.
                for b_input_ids in batch["input_ids"]:
                    add_synonyms(b_input_ids, p=0.3)
            
            # Forward pass under autocast.
            with torch.cuda.amp.autocast():
                outputs = model(**batch)
            loss = outputs.loss

            if is_train:
                optimizer.zero_grad()
                # The scaled loss is back-propagated through Accelerate; the
                # scaler then steps the optimizer and updates its scale.
                accelerator.backward(scaler.scale(loss))
                scaler.step(optimizer)
                scaler.update()
            
            total_loss += loss.item()

            # Softmax/argmax over dim=1 of the logits
            # (presumably the class dimension — TODO confirm).
            prob = F.softmax(outputs.logits, dim=1)
            final_pred = torch.argmax(prob, dim=1)

            aduIds.extend(index.detach().cpu().numpy())
            probs.extend(prob.detach().cpu().numpy())
            preds.extend(final_pred.detach().cpu().numpy())
            expected_labels.extend(batch["labels"].detach().cpu().numpy())
            
        # The learning-rate schedule advances once per pass, not per batch.
        if is_train and lr_scheduler is not None:
            lr_scheduler.step()
        
    return (expected_labels, preds, probs, aduIds), total_loss / num_batches
            

# Train Classification Model

In [None]:
from torch import optim
import torchmetrics

# Number of epochs of the first (frozen language model) training phase.
NUM_EPOCHS = 30

# Exponential decay: the learning rate is multiplied by 0.9 at every scheduler step.
# NOTE(review): `verbose` is presumably deprecated in recent torch releases — confirm
# against the installed version.
lr_scheduler = optim.lr_scheduler.ExponentialLR(
    optimizer=optimizer,
    gamma=0.9,
    verbose=True
)

# Micro-averaged accuracy over N_CLASSES classes; shared by both training phases.
# NOTE(review): newer torchmetrics releases presumably require a `task` argument
# here — confirm against the installed version.
metric_scorer = torchmetrics.Accuracy(
    threshold=0.5,
    num_classes=N_CLASSES,
    average="micro",
)

In [None]:
# Per-epoch metrics of the first training phase, consumed by plot_training_history.
train_history = {
    "loss": [],
    "accuracy": []
}

val_history = {
    "loss": [],
    "accuracy": []
}

model.bert.requires_grad_(False) # Freeze language model layer

# Best validation result seen so far; also reused as the starting threshold of
# the fine-tuning phase further down.
best_loss = np.inf
best_accuracy = 0
best_epoch = -1

print(f"Starting classification language model training...")

for epoch in range(1, NUM_EPOCHS + 1):
    print(f"Epoch[{epoch}/{NUM_EPOCHS}]")
    
    # Training pass: updates the weights and advances the LR schedule once.
    (train_target, train_preds, train_probs, _), train_loss = epoch_iter(
        dataloader=train_dataloader,
        model=model,
        is_train=True,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler
    )
    
    train_accuracy = metric_scorer(torch.tensor(np.array(train_probs)), torch.tensor(np.array(train_target))).item()
    print(f"Training loss: {train_loss:.3f}\t Training micro accuracy: {train_accuracy:.3f}")

    # Validation pass: no optimizer, no weight update.
    (val_target, val_preds, val_probs, _), val_loss = epoch_iter(
        dataloader=val_dataloader,
        model=model,
        is_train=False,
    )

    val_accuracy = metric_scorer(torch.tensor(np.array(val_probs)), torch.tensor(np.array(val_target))).item()
    print(f"Validation loss: {val_loss:.3f}\t Validation micro accuracy: {val_accuracy:.3f}")
    
    # Save best model
    # "best" is selected on validation loss, not on accuracy.
    if val_loss < best_loss:
        best_loss = val_loss
        best_accuracy = val_accuracy
        best_epoch = epoch
        
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(f"{OUT_DIR}/best", save_function=accelerator.save)

    # "latest" is overwritten at the end of every epoch.
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(f"{OUT_DIR}/latest", save_function=accelerator.save)

    train_history["loss"].append(train_loss)
    train_history["accuracy"].append(train_accuracy)

    val_history["loss"].append(val_loss)
    val_history["accuracy"].append(val_accuracy)

    print("----------------------------------------------------------------")
    
print(
    f"\nFinished training..."
    f"\nBest epoch: {best_epoch}\t Validation loss on best epoch: {best_loss}\t Accuracy on best epoch: {best_accuracy}"
)

In [None]:
# Plot and save the loss/accuracy curves of the frozen ("FREEZE") phase.
fig = plot_training_history(train_history=train_history, val_history=val_history, model_name="BERT-ADU-CLS", phase_name="FREEZE", out_dir=OUT_DIR)
fig.show()

Clear GPU memory to guarantee it is free before fine-tuning

In [None]:
import gc

# Drop every notebook-level reference to the objects of the first training
# phase. `unwrapped_model` (left over from the checkpointing code) points at the
# same model, so it has to go as well or the weights stay alive.
model = None
unwrapped_model = None
train_dataloader = None
val_dataloader = None
lr_scheduler = None
optimizer = None

# The Accelerator keeps its own references to everything passed to `prepare`;
# release them too, otherwise the collection below cannot free the model.
accelerator.free_memory()

gc.collect()
torch.cuda.empty_cache()
# Displayed as the cell output: memory information for device "cuda:0".
torch.cuda.mem_get_info("cuda:0")

In [None]:
# IPython shell escape: print the NVIDIA driver's view of the GPU after the cleanup above.
!nvidia-smi

# Fine-tune model

Load best model from first training session

In [None]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

# Smaller batches than in the first phase (64 there, 16 here).
BATCH_SIZE = 16 # Tested with NVIDIA Tesla T4 16GB

# Fresh loaders over the same tokenized splits, reusing the existing collator.
train_dataloader = DataLoader(
    dataset=cls_dataset["train"],
    shuffle=True,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator
)

val_dataloader = DataLoader(
    dataset=cls_dataset["val"],
    shuffle=False,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator
)

In [None]:
# Reload the checkpoint that the first phase saved under OUT_DIR/best.
model = AutoModelForSequenceClassification.from_pretrained(f"{OUT_DIR}/best", num_labels=N_CLASSES)

print("Loaded best model...")

summary(model)

In [None]:
# Fine-tuning optimizer: same Adam/AMSGrad setup as the first phase but with a
# much smaller learning rate (1e-5) and a larger weight decay (5e-4).
ft_optimizer = optim.Adam(
    params=model.parameters(),
    lr=1e-5,
    betas=(0.9, 0.999),
    weight_decay=5e-4,
    amsgrad=True
)

# A new Accelerator instance wraps the fine-tuning objects.
accelerator = Accelerator()

model, ft_optimizer, train_dataloader, val_dataloader = accelerator.prepare(
    model, ft_optimizer, train_dataloader, val_dataloader
)

# Created after `prepare`, so the scheduler is bound to the prepared optimizer.
ft_lr_scheduler = optim.lr_scheduler.ExponentialLR(
    optimizer=ft_optimizer,
    gamma=0.9,
    verbose=True
)

In [None]:
# Per-epoch metrics of the fine-tuning phase, consumed by plot_training_history.
ft_train_history = {
    "loss": [],
    "accuracy": []
}

ft_val_history = {
    "loss": [],
    "accuracy": []
}

FT_NUM_EPOCHS = 15
model.bert.requires_grad_(True) # Unfreeze language model layer

# Fine-tuning starts from the best result of the frozen phase: an epoch only
# counts as an improvement if it beats that validation loss.
ft_best_loss = best_loss 
ft_best_accuracy = best_accuracy
ft_best_epoch = -1

# Seed "fine_tuned/best" with the model as loaded (the best frozen-phase
# checkpoint). If no fine-tuning epoch beats `best_loss`, the directory would
# otherwise never be written and loading it for the test phase would fail.
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(f"{OUT_DIR}/fine_tuned/best", save_function=accelerator.save)

print("Starting classification language model fine-tuning...")

for epoch in range(1, FT_NUM_EPOCHS + 1):
    print(f"Epoch[{epoch}/{FT_NUM_EPOCHS}]")

    # Training pass: updates the weights and advances the LR schedule once.
    (train_target, train_preds, train_probs, _), train_loss = epoch_iter(
        dataloader=train_dataloader,
        model=model,
        is_train=True,
        optimizer=ft_optimizer,
        lr_scheduler=ft_lr_scheduler
    )

    train_accuracy = metric_scorer(torch.tensor(np.array(train_probs)), torch.tensor(np.array(train_target))).item()
    print(f"Training loss: {train_loss:.3f}\t Training micro accuracy: {train_accuracy:.3f}")

    # Validation pass: no optimizer, no weight update.
    (val_target, val_preds, val_probs, _), val_loss = epoch_iter(
        dataloader=val_dataloader,
        model=model,
        is_train=False,
    )

    val_accuracy = metric_scorer(torch.tensor(np.array(val_probs)), torch.tensor(np.array(val_target))).item()
    print(f"Validation loss: {val_loss:.3f}\t Validation micro accuracy: {val_accuracy:.3f}")

    # Save best model (selected on validation loss)
    if val_loss < ft_best_loss:
        ft_best_loss = val_loss
        ft_best_accuracy = val_accuracy
        ft_best_epoch = epoch

        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(f"{OUT_DIR}/fine_tuned/best", save_function=accelerator.save)

    # "latest" is overwritten at the end of every epoch.
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(f"{OUT_DIR}/fine_tuned/latest", save_function=accelerator.save)

    ft_train_history["loss"].append(train_loss)
    ft_train_history["accuracy"].append(train_accuracy)

    ft_val_history["loss"].append(val_loss)
    ft_val_history["accuracy"].append(val_accuracy)

    print("----------------------------------------------------------------")

# A best epoch of -1 means no fine-tuning epoch improved on the frozen phase;
# "fine_tuned/best" then still holds the starting checkpoint.
print(
    f"\nFinished fine-tuning..."
    f"\nBest epoch: {ft_best_epoch}\t Validation loss on best epoch: {ft_best_loss}\t Accuracy on best epoch: {ft_best_accuracy}"
)

In [None]:
# Plot and save the loss/accuracy curves of the fine-tuning phase.
fig = plot_training_history(train_history=ft_train_history, val_history=ft_val_history, model_name="BERT-ADU-CLS", phase_name="FINE-TUNE", out_dir=OUT_DIR)
fig.show()

In [None]:
import gc

# Drop every notebook-level reference to the fine-tuning objects. Clearing
# `model` alone is not enough: `unwrapped_model`, the optimizer and the
# scheduler all keep the weights (or their state) alive.
model = None
unwrapped_model = None
train_dataloader = None
val_dataloader = None
ft_lr_scheduler = None
ft_optimizer = None

# The Accelerator keeps its own references to everything passed to `prepare`.
accelerator.free_memory()

gc.collect()
torch.cuda.empty_cache()
# Displayed as the cell output: memory information for device "cuda:0".
torch.cuda.mem_get_info("cuda:0")

In [None]:
# IPython shell escape: print the NVIDIA driver's view of the GPU after the cleanup above.
!nvidia-smi

# Test the model

In [None]:
# Load the checkpoint stored under OUT_DIR/fine_tuned/best for evaluation.
model = AutoModelForSequenceClassification.from_pretrained(f"{OUT_DIR}/fine_tuned/best", num_labels=N_CLASSES)

print("Loaded best fine-tuned model...")

summary(model)

In [None]:
# New Accelerator for the evaluation run; only the model and the test loader are prepared.
accelerator = Accelerator()

model, test_dataloader = accelerator.prepare(model, test_dataloader)

In [None]:
# Evaluation pass over the test split (no optimizer, no weight update). Besides
# targets, predictions and probabilities, the ids of the examples are kept so
# that errors can be traced back to the original rows.
(test_target, test_preds, test_probs, test_ids), test_loss = epoch_iter(
    dataloader=test_dataloader,
    model=model,
    is_train=False,
)

In [None]:
from sklearn.metrics import classification_report
# Per-class report on the test split, labelled with the encoder's class names.
print(classification_report(y_true=test_target, y_pred=test_preds, target_names=le.classes_.tolist()))

In [None]:
def getErrors(adu_dataframe, adu_ids, y_true, y_pred):
    """Collect the misclassified examples into a DataFrame.

    Args:
        adu_dataframe: frame with a "tokens" column; rows are looked up by
            position (`.iloc`) using the ids in `adu_ids`.
        adu_ids: one row position per example.
        y_true: expected label of each example.
        y_pred: predicted label of each example.

    Returns:
        A DataFrame with columns "id", "adu", "target" and "predicted" holding
        one row per example whose prediction differs from its target. The frame
        is empty (but keeps the four columns) when there are no errors.
    """
    columns = ["id", "adu", "target", "predicted"]

    # Build all records first and create the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = [
        {
            "id": adu_id,
            "adu": adu_dataframe.iloc[adu_id]["tokens"],
            "target": correct,
            "predicted": predicted,
        }
        for adu_id, correct, predicted in zip(adu_ids, y_true, y_pred)
        if correct != predicted
    ]
    return pd.DataFrame(records, columns=columns)

# Misclassified test ADUs, with targets and predictions decoded back to label names.
errors_df = getErrors(adus, test_ids, le.inverse_transform(test_target), le.inverse_transform(test_preds))
errors_df

In [None]:
# Write the misclassified examples to OUT_DIR, without the DataFrame index.
errors_df.to_csv(f"{OUT_DIR}/test_errors.csv", index=False)