# Import libraries

In [19]:
import jsonlines
import numpy as np
import os
import pandas as pd
import random
import re
import torch
import wandb

from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from transformers import (
    BertForSequenceClassification,
    BertModel,
    BertTokenizer,
)
from transformers.modeling_outputs import SequenceClassifierOutput
from typing import Optional


In [20]:
if torch.cuda.is_available():
    print(f"GPU is available: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU available")


GPU is available: NVIDIA GeForce RTX 4070


In [21]:
# Force CUDA device if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


# Define Pipeline 

## Read data

In [22]:
def preprocess(text):
    """Preprocess"""

    stopwords = []
    extra_stopwords = [
        # 'LINK',
        # 'USER',
        "RT",
        "@",
    ]
    stopwords = list(extra_stopwords)

    pattern = r"(?i)\b(?:" + "|".join(re.escape(word) for word in stopwords) + r")\b"
    text = re.sub(pattern, "", text).strip()

    return text


In [23]:
def read_data_1b(train_file, preprocessing: bool, unique_labels):
    """Read data for the task 1B"""
    train_texts = []
    train_labels = []
    train_types = []

    mlb = MultiLabelBinarizer(classes=list(unique_labels))

    train_ids = []

    with jsonlines.open(train_file) as train_f:
        for obj in tqdm(train_f, desc="Processing", unit="line"):
            doc_id = str(obj["id"])
            labels = obj["labels"]
            labels = mlb.fit_transform([set(labels)]).tolist()[0]
            train_labels.append(labels)

            train_texts.append(
                preprocess(obj["text"]) if preprocessing else obj["text"]
            )
            types = 1 if obj["type"] == "tweet" else 0
            train_types.append(types)

            train_ids.append(obj["id"])

    return train_texts, train_labels, train_types, train_ids

# Define model and loss function


In [24]:
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, logits=False, reduce=True):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.logits = logits
        self.reduce = reduce

    def forward(self, inputs, targets):
        BCE_loss = nn.CrossEntropyLoss()(inputs, targets)
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1 - pt) ** self.gamma * BCE_loss
        if self.reduce:
            return torch.mean(F_loss)
        else:
            return F_loss

In [25]:
class BertForMultitaskClassification(BertForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.bert = BertModel(config)
        classifier_dropout = (
            config.classifier_dropout
            if config.classifier_dropout is not None
            else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.classifier2 = nn.Linear(config.hidden_size, 2)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        labels_types: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        logits2 = self.classifier2(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (
                    labels.dtype == torch.long or labels.dtype == torch.int
                ):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        ), logits2


In [26]:
class BertMultilabelClassifier:
    def __init__(
        self,
        model_path,
        num_classes,
        device="cuda" if torch.cuda.is_available() else "cpu",
    ):
        self.model = BertForMultitaskClassification.from_pretrained(
            model_path,
            num_labels=num_classes,
            problem_type="multi_label_classification",
        )
        self.tokenizer = BertTokenizer.from_pretrained(model_path)
        self.device = torch.device(device)
        self.model.to(self.device)
        self.num_classes = num_classes

    def tokenize_texts(self, texts):
        encoded_texts = self.tokenizer(
            texts, padding=True, truncation=True, return_tensors="pt"
        )
        input_ids = encoded_texts["input_ids"].to(self.device)
        attention_mask = encoded_texts["attention_mask"].to(self.device)

        return input_ids, attention_mask

    def train(
        self, texts, labels, labels_types, batch_size=8, epochs=4, learning_rate=2e-5
    ):
        input_ids, attention_mask = self.tokenize_texts(texts)

        labels = torch.tensor(labels).to(torch.float).to(self.device)
        labels_types = torch.tensor(labels_types).to(torch.long).to(self.device)

        dataset = TensorDataset(input_ids, attention_mask, labels, labels_types)

        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        optimizer = torch.optim.AdamW(self.model.parameters(), lr=learning_rate)
        loss_function = FocalLoss()
        # loss_function = BCEWithLogitsLoss()
        for epoch in tqdm(range(epochs), desc="Running epoch "):
            self.model.train()
            total_loss = 0.0
            for batch in dataloader:
                batch = tuple(t.to(self.device) for t in batch)
                input_ids, attention_mask, labels, labels_types = batch

                optimizer.zero_grad()

                outputs, logits2 = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels.float(),
                    labels_types=labels_types.float(),
                )
                logits = outputs.logits
                # logits = torch.tensor(logits).to(torch.long).to(self.device)

                loss1 = (
                    outputs.loss
                )  # loss_function(logits.view(-1, self.num_classes), labels)

                loss_fct = CrossEntropyLoss()
                loss2 = loss_fct(logits2.view(-1, 2), labels_types.view(-1))
                loss = loss1 + loss2
                # loss = loss_function(logits, labels)

                wandb.log({"loss": loss})

                total_loss += loss.item()

                loss.backward()
                optimizer.step()

            avg_loss = total_loss / len(dataloader)
            print(f"Epoch {epoch + 1}/{epochs} - Average Loss: {avg_loss}")

    def predict(self, texts, threshold=0.5):
        input_ids, attention_mask = self.tokenize_texts(texts)
        with torch.no_grad():
            logits, _ = self.model(input_ids=input_ids, attention_mask=attention_mask)
            probs = torch.sigmoid(logits.logits)

            predicted_labels = (probs > threshold).int()

            zero_rows = predicted_labels.sum(dim=1) == 0
            predicted_labels[zero_rows, -3] = 1

        return predicted_labels, probs

In [27]:
def evaluate(pred_labels, gold_labels, subtask, techniques=None):
    """
    Evaluates the predicted classes w.r.t. a gold file.
    Metrics are:  macro_f1 nd micro_f1
    :param pred_labels: a dictionary with predictions,
    :param gold_labels: a dictionary with gold labels.
    """
    pred_values, gold_values = pred_labels, gold_labels

    # We are scoring for subtask 1B
    if subtask == "1B":
        mlb = MultiLabelBinarizer()
        mlb.fit([techniques])
        gold_values = mlb.transform(gold_values)
        # pred_values = mlb.transform(pred_values)

    micro_f1 = f1_score(gold_values, pred_values, average="micro")
    macro_f1 = f1_score(gold_values, pred_values, average="macro")

    return micro_f1, macro_f1

# Define pipeline


In [28]:
def pipeline(hyper_params: dict):
    # create output directories if they don't exist
    OUTPUT_DIR = "output/task1B"

    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    wandb.init(
        project="araieval_subtaskB",
        config=hyper_params,
    )

    train_file = "task1B_train.jsonl"
    test_file = "task1B_dev.jsonl"

    # check if the data files exist
    if not os.path.exists(train_file):
        raise FileNotFoundError(f"File not found: {train_file}")
    if not os.path.exists(test_file):
        raise FileNotFoundError(f"File not found: {test_file}")

    # read the data
    df = pd.read_json(train_file, lines=True)

    unique_labels = set()
    for label_list in df["labels"]:
        unique_labels.update(label_list)

    train_texts, train_labels, train_types, trains_ids = read_data_1b(
        train_file,
        preprocessing=hyper_params["preprocessing"],
        unique_labels=unique_labels,
    )
    test_texts, test_labels, test_types, test_ids = read_data_1b(
        test_file,
        preprocessing=hyper_params["preprocessing"],
        unique_labels=unique_labels,
    )

    num_classes = len(unique_labels)

    # instantiate a model
    model_name = hyper_params["model_name"]
    classifier = BertMultilabelClassifier(model_name, num_classes)

    classifier.train(
        texts=train_texts,
        labels=train_labels,
        labels_types=train_types,
        batch_size=hyper_params["batch_size"],
        epochs=hyper_params["epochs"],
        learning_rate=hyper_params["learning_rate"],
    )

    predicted_labels, probabilities = classifier.predict(test_texts)
    micro_f1, macro_f1 = evaluate(
        predicted_labels.cpu().tolist(), test_labels, subtask="1A"
    )
    print("micro-F1={:.4f}\tmacro-F1={:.4f}".format(micro_f1, macro_f1))
    # micro_f1, macro_f1 = float(0), float(0)

    run_name = wandb.run.name

    def find_indices(list_to_check, item_to_find):
        return [idx for idx, value in enumerate(list_to_check) if value == item_to_find]

    with open(f"{OUTPUT_DIR}/task1B_output_{run_name}.tsv", "w") as file:
        file.write("id\tlabel\n")
        for id, pred in zip(test_ids, predicted_labels.cpu().tolist()):
            r = find_indices(pred, 1)

            label = [list(unique_labels)[item] for item in r]

            file.write(str(id) + "\t" + ",".join(label) + "\n")

            # file.write(str(id)+"\t"+str(label)+"\n")

    return {"micro_f1": micro_f1, "macro_f1": macro_f1}

# Run Pipeline

In [29]:
params = {
    "model_name": "UBC-NLP/MARBERT",
    "loss_type": "bce",
    "model_archi": "multitask",
    "seed": 42,
    "learning_rate": 1e-05,
    "epochs": 5,
    "batch_size": 8,
    "preprocessing": False,
    "stopwords_size": 0,
}

In [30]:
def set_seed(seed):
    """
    Set the seed for random number generation in Python's random, numpy, and torch libraries.
    If a GPU is available, it also sets the seed for random number generation on the GPU.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

In [31]:
set_seed(params["seed"])

In [32]:
metrics = pipeline(params)
wandb.log(metrics)
wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mtaqiyeddine[0m. Use [1m`wandb login --relogin`[0m to force relogin


Processing: 2427line [00:00, 17236.98line/s]
Processing: 259line [00:00, 16702.67line/s]
Some weights of BertForMultitaskClassification were not initialized from the model checkpoint at UBC-NLP/MARBERT and are newly initialized: ['classifier.bias', 'classifier.weight', 'classifier2.bias', 'classifier2.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Running epoch :  20%|██        | 1/5 [00:24<01:37, 24.41s/it]

Epoch 1/5 - Average Loss: 0.551929550833608


Running epoch :  40%|████      | 2/5 [00:48<01:12, 24.32s/it]

Epoch 2/5 - Average Loss: 0.27655986387674747


Running epoch :  60%|██████    | 3/5 [01:13<00:48, 24.33s/it]

Epoch 3/5 - Average Loss: 0.22352978567543783


Running epoch :  80%|████████  | 4/5 [01:37<00:24, 24.32s/it]

Epoch 4/5 - Average Loss: 0.21284728085524157


Running epoch : 100%|██████████| 5/5 [02:01<00:00, 24.28s/it]

Epoch 5/5 - Average Loss: 0.19957419477501198





micro-F1=0.4595	macro-F1=0.0337


0,1
loss,██▅▃▄▂▂▃▂▂▂▂▂▄▂▂▂▂▂▁▁▁▁▁▁▁▂▂▁▄▁▁▁▁▁▁▂▁▁▁
macro_f1,▁
micro_f1,▁

0,1
loss,0.14814
macro_f1,0.03372
micro_f1,0.45953
