In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [38]:
!pip install fasttext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2
  Using cached pybind11-2.10.4-py3-none-any.whl (222 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4393337 sha256=745913974314299b953ea40b47c490329f92a786b7e245b6963c19c7a432e2c1
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.10.4


In [39]:
! git clone https://github.com/eventdata/ConfliBERT.git 

fatal: destination path 'ConfliBERT' already exists and is not an empty directory.


In [41]:
import os
from string import Template
import random
import json

import torch
import torch.nn.functional as F
import numpy as np
import pandas as pd
import fasttext
from transformers import AutoTokenizer, RobertaForMaskedLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, pipeline
from huggingface_hub import hf_hub_download
from datasets import Dataset, load_dataset

In [23]:
def set_seed(seed: int = 42) -> None:
    np.random.seed(seed)
    random.seed(seed)

    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")

set_seed(5)

Random seed set as 5


In [6]:
# defining available compute device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [24]:
 # training and data config
file_path = "/content/ConfliBERT/data/BBC_News/"
template_string = "$x This news is $y"
# label_words_dict = {0:["good"], 1:["bad"]}
labels_file = "/content/binary_sentiment.json"

# model and tokenizer config
tokenizer_config = "roberta-base"
model_config = "roberta-base"

In [8]:
# load train/test data with huggingface Dataset

dataset = load_dataset("csv", data_dir=file_path, column_names=["article", "label"], sep="\t")



  0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'label'],
        num_rows: 1588
    })
    validation: Dataset({
        features: ['article', 'label'],
        num_rows: 315
    })
    test: Dataset({
        features: ['article', 'label'],
        num_rows: 322
    })
})

In [10]:
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_config,
    use_fast=False,
    truncation_side="left"
  )

In [27]:
def transform_article(batch, tokenizer, template, label_words_dict, test=False):
  # function to map text to required prompt template
  text = [
    template.substitute(
        x=article,
        y=tokenizer.mask_token
      ) for article in batch["article"]
  ]
  text_target = [
    template.substitute(
        x=article,
        y=np.random.choice(
            label_words_dict[str(batch["label"][i])]
        )
      ) for i, article in enumerate(batch["article"])
  ]

  if test:
    return {"text":text, "text_target":text_target}

  result = tokenizer(
    text=text,
    text_target=text_target,
    truncation=True,
    padding=True,
    return_tensors="pt"
  )

  # result = tokenizer(
  #     [
  #         template.substitute(
  #             x=article,
  #             y=tokenizer.mask_token
  #           ) for article in batch["article"]
  #     ],
  #     truncation=True,
  #     padding=True,
  #     return_tensors="pt"
  #   )

  masks = torch.where(result["input_ids"] == tokenizer.mask_token_id)  
  _temp_labels = result["labels"].clone()
  _temp_labels[:] = -100
  _temp_labels[masks] = result["labels"][masks]
  result["labels"] = _temp_labels
  
  # result["labels"] = tokenizer(
  #     [
  #         template.substitute(
  #             x=article,
  #             y=np.random.choice(
  #                 label_words_dict[str(batch["label"][i])]
  #               )
  #          ) for i, article in enumerate(batch["article"])
  #     ],
  #     truncation=True,
  #     padding=True
  #   )["input_ids"] 

  # result = tokenizer(batch["article"], truncation=True, padding=True)
  # result["labels"] = result["input_ids"].copy()
  return result

In [6]:
with open(labels_file) as input_file:
  label_words_dict = json.loads(input_file.read())

template = Template(template_string)

In [13]:
tokenized_train = dataset["train"].map(
    transform_article,
    num_proc=4,
    batched=True,
    fn_kwargs={"template":template, "label_words_dict":label_words_dict, "tokenizer":tokenizer},
    remove_columns=dataset.column_names["train"]
)



In [14]:
tokenized_train

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1588
})

In [15]:
tokenized_valid = dataset["validation"].map(
    transform_article,
    num_proc=4,
    batched=True,
    fn_kwargs={"template":template, "label_words_dict":label_words_dict, "tokenizer":tokenizer},
    remove_columns=dataset.column_names["validation"]
)



In [16]:
model = RobertaForMaskedLM.from_pretrained(model_config).to(device)

In [17]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [24]:
train_args = TrainingArguments(
    output_dir="mlm-"+model_config,
    logging_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=0.01,
    per_device_train_batch_size=2,
)

In [22]:
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    tokenizer=tokenizer,
    data_collator=data_collator
)

In [23]:
trainer.train()

Epoch,Training Loss,Validation Loss
0,No log,2.336831


TrainOutput(global_step=8, training_loss=2.818885326385498, metrics={'train_runtime': 960.1787, 'train_samples_per_second': 0.017, 'train_steps_per_second': 0.008, 'total_flos': 4212247412736.0, 'train_loss': 2.818885326385498, 'epoch': 0.01})

In [30]:
tokenized_test = dataset["test"].map(
    transform_article,
    num_proc=4,
    batched=True,
    fn_kwargs={"template":template, "label_words_dict":label_words_dict, "tokenizer":tokenizer, "test":True},
    remove_columns=["article"]
)

Map (num_proc=4):   0%|          | 0/322 [00:00<?, ? examples/s]

In [31]:
tokenized_test[0]

{'label': 0,
 'text': 'ocean s twelve raids box office ocean s twelve  the crime caper sequel starring george clooney  brad pitt and julia roberts  has gone straight to number one in the us box office chart.  it took $40.8m (PS21m) in weekend ticket sales  according to studio estimates. the sequel follows the master criminals as they try to pull off three major heists across europe. it knocked last week s number one  national treasure  into third place. wesley snipes  blade: trinity was in second  taking $16.1m (PS8.4m). rounding out the top five was animated fable the polar express  starring tom hanks  and festive comedy christmas with the kranks.  ocean s twelve box office triumph marks the fourth-biggest opening for a december release in the us  after the three films in the lord of the rings trilogy. the sequel narrowly beat its 2001 predecessor  ocean s eleven which took $38.1m (PS19.8m) on its opening weekend and $184m (PS95.8m) in total. a remake of the 1960s film  starring frank

In [32]:
mask_filler = pipeline("fill-mask", model, tokenizer = tokenizer)

In [33]:
mask_fills = mask_filler(tokenized_test["text"][:3])

In [35]:
predicted_tokens = [sample["token_str"] for sample in mask_fills]

TypeError: ignored

In [2]:
model_path = hf_hub_download(repo_id="facebook/fasttext-en-vectors", filename="model.bin")
model = fasttext.load_model(model_path)



In [32]:
label_vectors = torch.empty(len(label_words_dict), model.get_dimension())
for i, label in enumerate(label_words_dict):
  label_vectors[i] = torch.mean(torch.tensor(
      [
        model.get_word_vector(word) for word in label_words_dict[label]
      ]
    ),
    dim=0
  )

In [33]:
label_vectors.shape

torch.Size([2, 300])

In [58]:
word_vec = torch.tensor([ model.get_word_vector(word) for word in predicted_tokens])

In [59]:
word_vec.shape

torch.Size([3, 300])

In [63]:
def cosine_sim_matrix(a, b, eps=1e-8):
    # added eps for numerical stability
    a_n, b_n = a.norm(dim=1)[:, None], b.norm(dim=1)[:, None]
    a_norm = a / torch.max(a_n, eps * torch.ones_like(a_n))
    b_norm = b / torch.max(b_n, eps * torch.ones_like(b_n))
    sim_mt = torch.mm(a_norm, b_norm.transpose(0, 1))
    return sim_mt

In [67]:
predictions = cosine_sim_matrix(word_vec, label_vectors).argmax(1)

tensor([0, 1, 1])

In [None]:
# accuracy
correct = (predicted == test["label"]).sum()
acc = 100 * correct/len(correct)