In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install fasttext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
! git clone https://github.com/eventdata/ConfliBERT.git 

fatal: destination path 'ConfliBERT' already exists and is not an empty directory.


In [5]:
import os
from string import Template
import random
import json

import torch
import torch.nn.functional as F
import numpy as np
import pandas as pd
import fasttext
from transformers import AutoTokenizer, AutoModel, RobertaForMaskedLM, BertForMaskedLM, Trainer, GPT2LMHeadModel, TrainingArguments, DataCollatorForLanguageModeling, pipeline
from huggingface_hub import hf_hub_download
from datasets import Dataset, load_dataset

In [6]:
def set_seed(seed: int = 42) -> None:
    """Seed the Python, NumPy and PyTorch random number generators.

    Besides seeding, this sets the two cuDNN backend flags
    (``deterministic`` on, ``benchmark`` off), writes ``PYTHONHASHSEED``
    into the process environment and prints a confirmation line.

    Args:
        seed: value handed to every generator (defaults to 42).
    """
    # Seed each library-level generator with the same value.
    for seed_fn in (
        np.random.seed,
        random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    ):
        seed_fn(seed)

    # The cuDNN backend needs two further options to be set.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
    # assignment presumably only reaches processes launched afterwards - confirm
    # whether hash determinism inside this kernel is actually required.
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")

set_seed(5)

Random seed set as 5


In [7]:
# Compute device: first CUDA GPU when one is available, CPU otherwise.
device = torch.device("cpu" if not torch.cuda.is_available() else "cuda:0")

In [8]:
 # training and data config
# Prompt template: $x and $y are the placeholders filled in per article.
template_string = "$x This news is $y"
# JSON file holding the label words (presumably mirrors the earlier inline
# form, e.g. label_words_dict = {0: ["good"], 1: ["bad"]} - verify the file).
labels_file = "/content/binary_sentiment.json"
# Directory containing the BBC News data files.
file_path = "/content/ConfliBERT/data/BBC_News/"

# Model and tokenizer are loaded from the same checkpoint name.
# Set both to "roberta-base" to try the RoBERTa checkpoint instead.
tokenizer_config = model_config = "bert-base-uncased"

In [9]:
# load train/test data with huggingface Dataset

# Tab-separated files without a header row: name the two columns explicitly.
dataset = load_dataset(
    "csv",
    data_dir=file_path,
    sep="\t",
    column_names=["article", "label"],
)



  0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
# Slow tokenizer that truncates from the left, which keeps the tail of each
# prompt (the template places the $y slot at the end).
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_config, truncation_side="left", use_fast=False
)

# use for gpt2
# tokenizer = AutoTokenizer.from_pretrained(
#     tokenizer_config,
#     use_fast=False,
#     truncation_side="left",
#     padding_side="left"
#   )

In [11]:
def transform_article(batch, tokenizer, template, label_words_dict, mlm=True, test=True, model_max_dim=512):
  """Map a batch of articles onto the prompt template.

  Two prompts are rendered per article: an input prompt whose ``y`` slot holds
  the mask token (or an empty string when ``mlm`` is false) and a target prompt
  whose ``y`` slot holds a label word drawn at random with ``np.random.choice``.

  Args:
    batch: mapping with an ``"article"`` list and a parallel ``"label"`` list.
    tokenizer: object exposing ``mask_token``; on the training path it is also
      called on the prompts and its ``mask_token_id`` is read.
    template: object whose ``substitute(x=..., y=...)`` renders one prompt.
    label_words_dict: maps ``str(label)`` to a sequence of candidate label words.
    mlm: whether the input prompt carries the tokenizer's mask token.
    test: when truthy, return the raw prompt strings instead of tensors.
    model_max_dim: number of trailing characters kept per prompt on the test
      path (previously hard-coded to 512, which is still the default).

  Returns:
    On the test path, ``{"text": [...], "text_target": [...]}`` holding the
    character-truncated prompts. Otherwise the tokenizer output, with
    ``labels`` set to -100 everywhere except at the mask-token positions.
  """
  mask = tokenizer.mask_token if mlm else ""

  articles = batch["article"]
  text = [template.substitute(x=article, y=mask) for article in articles]
  text_target = [
    template.substitute(
        x=article,
        # label ids are looked up through their string form
        y=np.random.choice(label_words_dict[str(label)])
      ) for article, label in zip(articles, batch["label"])
  ]

  if test:
    # Keep only the tail of each prompt so the $y slot at the end survives.
    # This is a character-level cut, not a token-level one.
    return {"text": [prompt[-model_max_dim:] for prompt in text],
            "text_target": [target[-model_max_dim:] for target in text_target]}

  result = tokenizer(
    text=text,
    text_target=text_target,
    truncation=True,
    padding=True,
    return_tensors="pt"
  )

  # Keep the target ids only where the input holds the mask token; every other
  # position is set to -100.
  # NOTE(review): assumes input_ids and labels share the same shape and token
  # alignment, and that mask_token_id is set (i.e. mlm=True) - TODO confirm.
  masks = torch.where(result["input_ids"] == tokenizer.mask_token_id)
  _temp_labels = result["labels"].clone()
  _temp_labels[:] = -100
  _temp_labels[masks] = result["labels"][masks]
  result["labels"] = _temp_labels

  return result

In [12]:
# Read the label words from the JSON file named in the config cell.
with open(labels_file) as input_file:
  label_words_dict = json.load(input_file)

# Compile the prompt string into a string.Template for later substitution.
template = Template(template_string)

In [13]:
# Load the masked-LM checkpoint, then move it to the selected device.
# Alternatives tried with other checkpoints:
#   RobertaForMaskedLM.from_pretrained(model_config).to(device)
#   GPT2LMHeadModel.from_pretrained(model_config).to(device)
model = BertForMaskedLM.from_pretrained(model_config)
model = model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
# Render the prompts for the test split in batches across 4 worker processes.
# With test=True the mapper returns prompt strings rather than token tensors.
tokenized_test = dataset["test"].map(
    transform_article,
    batched=True,
    num_proc=4,
    fn_kwargs=dict(
        template=template,
        label_words_dict=label_words_dict,
        tokenizer=tokenizer,
        test=True,
        mlm=True,
    ),
)



In [15]:
# Fill-mask pipeline built from the already-loaded model and tokenizer.
mask_filler = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

# text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)
# text_generator.tokenizer.pad_token_id = model.config.eos_token_id

In [16]:
torch.cuda.empty_cache()

In [17]:
# Run the pipeline on the first 50 prompts only.
mask_fills = mask_filler(
    tokenized_test["text"][:50],
    batch_size=100,
)
# generations = text_generator(tokenized_test["text"], num_return_sequences=4, batch_size=20)

In [18]:
mask_fills[10]
# generations[10]

[{'score': 0.6645217537879944,
  'token': 1012,
  'token_str': '.',
  'sequence': 'defence cordon was slowly disintegrating. england prop matt stevens ran in at full steam to suck in a few more tacklers. unfortunately he ran into o connell who hit him hard - very hard - and then wrestled the ball away for a crucial turnover. that spoke volumes about ireland s back - foot display with defensive coach mike ford taking a bow at the end. to win a game like that showed that ireland have moved forward. it may be tries that win games but it is defence that wins championships. this news is.'},
 {'score': 0.1856451779603958,
  'token': 1025,
  'token_str': ';',
  'sequence': 'defence cordon was slowly disintegrating. england prop matt stevens ran in at full steam to suck in a few more tacklers. unfortunately he ran into o connell who hit him hard - very hard - and then wrestled the ball away for a crucial turnover. that spoke volumes about ireland s back - foot display with defensive coach mike

In [19]:
print(mask_fills[10][0])

{'score': 0.6645217537879944, 'token': 1012, 'token_str': '.', 'sequence': 'defence cordon was slowly disintegrating. england prop matt stevens ran in at full steam to suck in a few more tacklers. unfortunately he ran into o connell who hit him hard - very hard - and then wrestled the ball away for a crucial turnover. that spoke volumes about ireland s back - foot display with defensive coach mike ford taking a bow at the end. to win a game like that showed that ireland have moved forward. it may be tries that win games but it is defence that wins championships. this news is.'}


In [20]:
# For every prompt keep the token string of the first candidate in its result list.
predicted_tokens = [candidates[0]["token_str"] for candidates in mask_fills]

In [21]:
# Fetch the fastText English vectors from the Hugging Face Hub, then load them.
model_path = hf_hub_download(
    filename="model.bin",
    repo_id="facebook/fasttext-en-vectors",
)
ft_model = fasttext.load_model(model_path)



In [23]:
# One row per label: the mean fastText vector of that label's words.
# NOTE(review): rows follow the dict's iteration order, while the later argmax
# treats the row index as the label id - confirm the JSON keys are in label order.
label_vectors = torch.empty(len(label_words_dict), ft_model.get_dimension())
for i, label in enumerate(label_words_dict):
  # Collect the word vectors into a single ndarray first: building a tensor
  # directly from a list of ndarrays is slow and raises a UserWarning.
  # (assumes get_word_vector returns NumPy arrays - TODO confirm)
  word_matrix = np.array(
      [ft_model.get_word_vector(word) for word in label_words_dict[label]]
  )
  label_vectors[i] = torch.tensor(word_matrix).mean(dim=0)

  label_vectors[i] = torch.mean(torch.tensor(


In [25]:
# One fastText vector per predicted token. Going through a single ndarray avoids
# the slow list-of-ndarrays tensor construction (and its UserWarning).
word_vec = torch.tensor(
    np.array([ft_model.get_word_vector(word) for word in predicted_tokens])
)

In [26]:
def cosine_sim_matrix(a, b, eps=1e-8):
    """Pairwise cosine similarity between the rows of two 2-D tensors.

    Args:
        a: tensor whose rows are the first set of vectors.
        b: tensor whose rows are the second set of vectors.
        eps: lower bound applied to each row norm before dividing, so
            zero-length rows do not cause a division by zero.

    Returns:
        Tensor with one row per row of ``a`` and one column per row of ``b``.
    """
    def _unit_rows(matrix):
        # Row-wise L2 norms as a column vector, floored at eps for stability.
        lengths = matrix.norm(dim=1).unsqueeze(1)
        return matrix / lengths.clamp(min=eps)

    return torch.mm(_unit_rows(a), _unit_rows(b).t())

In [27]:
similarity = cosine_sim_matrix(word_vec, label_vectors)

In [28]:
for i,sim in enumerate(similarity):
    print(i, sim)

0 tensor([0.1330, 0.1432])
1 tensor([0.1330, 0.1432])
2 tensor([0.1330, 0.1432])
3 tensor([0.1330, 0.1432])
4 tensor([0.1330, 0.1432])
5 tensor([0.1330, 0.1432])
6 tensor([0.1330, 0.1432])
7 tensor([0.1330, 0.1432])
8 tensor([0.1330, 0.1432])
9 tensor([0.1330, 0.1432])
10 tensor([0.1330, 0.1432])
11 tensor([0.1330, 0.1432])
12 tensor([0.1330, 0.1432])
13 tensor([0.1330, 0.1432])
14 tensor([0.1330, 0.1432])
15 tensor([0.1330, 0.1432])
16 tensor([0.1330, 0.1432])
17 tensor([0.1330, 0.1432])
18 tensor([0.1330, 0.1432])
19 tensor([0.1330, 0.1432])
20 tensor([0.1330, 0.1432])
21 tensor([0.1330, 0.1432])
22 tensor([0.1330, 0.1432])
23 tensor([0.1330, 0.1432])
24 tensor([0.1330, 0.1432])
25 tensor([0.1330, 0.1432])
26 tensor([0.1330, 0.1432])
27 tensor([0.1330, 0.1432])
28 tensor([0.0693, 0.1239])
29 tensor([0.1330, 0.1432])
30 tensor([0.0693, 0.1239])
31 tensor([0.1330, 0.1432])
32 tensor([0.0536, 0.1051])
33 tensor([0.1330, 0.1432])
34 tensor([0.1330, 0.1432])
35 tensor([0.1330, 0.1432])
36

In [29]:
similarity.max(1).values.mean()

tensor(0.1417)

In [30]:
predictions = similarity.argmax(1)

In [31]:
# Accuracy in percent. The label slice is sized from `predictions`, so it stays
# aligned with however many prompts were scored (no separate hard-coded 50).
eval_labels = torch.tensor(tokenized_test["label"][:len(predictions)])
correct = (predictions == eval_labels).sum()
acc = 100 * correct / len(predictions)

In [32]:
acc

tensor(18.)

In [33]:
from sklearn.metrics import f1_score

In [34]:
# Score against exactly as many gold labels as there are predictions.
f1_score(tokenized_test["label"][:len(predictions)], predictions)

0.3050847457627119

In [35]:
tokenized_test["label"][0]

0