In [1]:
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
from atel.data import BookCollection
from bertviz import head_view
from sklearn.model_selection import KFold
from data_clean import *
import yaml
from yaml import CLoader

In [2]:
%%javascript
// Register the CDN locations RequireJS should load d3 and jQuery from.
// The library versions are pinned in the URLs (d3 3.4.8, jQuery 2.0.0).
// NOTE(review): presumably required by the bertviz visualisation further down — confirm.
require.config({
  paths: {
      d3: '//cdnjs.cloudflare.com/ajax/libs/d3/3.4.8/d3.min',
      jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
  }
});

<IPython.core.display.Javascript object>

In [3]:
def find_text_info(df, splits, text):
    """Find the single row whose text matches ``text`` and the fold validating on it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'text'`` column.
    splits : iterable of (train_idx, val_idx)
        Fold index pairs; only the validation part of each pair is inspected.
    text : str
        Pattern handed to ``Series.str.contains`` (so it is interpreted as a
        regular expression, the pandas default).

    Returns
    -------
    tuple
        ``(idx, cv)`` where ``idx`` is the index label of the matching row and
        ``cv`` is the 1-based number of the fold whose validation indices
        contain ``idx``. If several folds contain it, the last one wins.

    Raises
    ------
    AssertionError
        If the pattern matches zero rows or more than one row.
    ValueError
        If ``idx`` is not part of any fold's validation indices.
    """
    # na=False: rows with missing text count as "no match" instead of putting
    # NaN into the boolean mask, which pandas refuses to index with.
    search = df[df['text'].str.contains(text, na=False)]
    assert len(search) == 1, f'The search found {len(search)} texts, expected exactly 1'

    print(search['text'].values)

    # NOTE(review): idx is an index label; if the folds hold positional indices
    # the two only line up for a default RangeIndex — confirm against the caller.
    idx = search.index[0]

    cv = None
    for i, (_, val_idx) in enumerate(splits):
        if idx in val_idx:
            cv = i+1
            print(f'Present in CV {i+1}')

    if cv is None:
        # Previously this surfaced as an UnboundLocalError on the return line.
        raise ValueError(f'Index {idx} is not in the validation indices of any split')

    return idx, cv

In [4]:
def choose_model(target: str, cv: int=1,
                 base_dir: str='../../../../../work3/s173991/huggingface_saves/BERT_mlm_gyldendal/version_2',
                 run_name: str='BERT-BS16-BA4-ep100-seed42-WD0.01-LR2e-05',
                 checkpoint: int=1100) -> str:
    """Build (and print) the path of a saved checkpoint for one target and CV fold.

    Parameters
    ----------
    target : str
        Name of the target; used as a directory name below ``base_dir``.
    cv : int, default 1
        Fold number; selects the ``CV_{cv}`` sub-directory.
    base_dir : str
        Root directory holding one sub-directory per target.
    run_name : str
        Directory name of the training run.
    checkpoint : int, default 1100
        Step number used in the ``checkpoint-{checkpoint}`` directory name.

    Returns
    -------
    str
        ``{base_dir}/{target}/{run_name}/CV_{cv}/checkpoint-{checkpoint}/``.
        With the defaults this is the same string the function always produced.
    """
    model_path = f'{base_dir}/{target}/{run_name}/CV_{cv}/checkpoint-{checkpoint}/'

    print(model_path)

    return model_path

In [36]:
# Notebook-wide settings read by the cells below.
SEED = 42                      # passed to set_seed and used as KFold random_state
NUM_SPLITS = 10                # number of KFold splits
OUTPUT_ATTENTION = True        # forwarded as output_attentions when the model is loaded
TARGET = 'Semantisk univers'   # key into target_info.yaml and directory name in the model path

In [37]:
set_seed(SEED)

# Per-target metadata: problem type and number of labels.
# NOTE(review): CLoader is not PyYAML's safe loader; if target_info.yaml only
# holds plain mappings/scalars, CSafeLoader would be the safer choice — confirm.
with open('target_info.yaml', 'r', encoding='utf-8') as f:
    target_info = yaml.load(f, Loader=CLoader)

problem_type = target_info[TARGET]['problem_type']
NUM_LABELS   = target_info[TARGET]['num_labels']

# Pick the problem-type string handed to the model and the function that turns
# model outputs into probabilities.
if problem_type == 'multilabel':
    multilabel = True
    p_t = "multi_label_classification"
    logit_func = torch.nn.Sigmoid()

else:
    multilabel = False
    p_t = "single_label_classification"
    # Normalise over the last (label) axis explicitly. Softmax() without `dim`
    # relies on a deprecated implicit choice and emits a warning; for 2-D input
    # the implicit choice is dim=1, so results are unchanged.
    logit_func = torch.nn.Softmax(dim=-1)

Seed has been set to 42


In [38]:
# Load the book collection and turn it into a frame plus the label names for TARGET.
book_col = BookCollection(data_file="./data/book_col_271120.pkl")
df, labels = get_pandas_dataframe(book_col, TARGET)

# Seeded, shuffled K-fold splits, materialised so they can be scanned repeatedly.
kf = KFold(n_splits=NUM_SPLITS, shuffle=True, random_state=SEED)
all_splits = list(kf.split(df))

# Two-way mapping between label names and integer ids 0..NUM_LABELS-1.
label2id = {label: i for label, i in zip(labels, range(NUM_LABELS))}
id2label = {i: label for i, label in zip(range(NUM_LABELS), labels)}

# Find the example text, the fold that validates on it, and that fold's checkpoint.
idx, cv = find_text_info(df, all_splits, 'Enhjørninger er søde og flotte dyr selvom de ikke findes')
model_path = choose_model(TARGET, cv)

Loaded from disk: ./data/book_col_271120.pkl
['Enhjørninger er søde og flotte dyr selvom de ikke findes. Nogen har horn og nogen har vinger. De elsker at lege og spise.']
Present in CV 3
../../../../../work3/s173991/huggingface_saves/BERT_mlm_gyldendal/version_2/Semantisk univers/BERT-BS16-BA4-ep100-seed42-WD0.01-LR2e-05/CV_3/checkpoint-1100/


In [39]:
# Tokenizer stored alongside the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path)

def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook's tokenizer.

    Sequences are truncated and padded with ``padding="max_length"``.
    Returns whatever the tokenizer call returns.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

In [40]:
# Build a Dataset from the frame; reset_index() turns the index into a regular
# column (it shows up as 'index' in the features listed below).
dataset = Dataset.from_pandas(df.reset_index())
# Tokenize all rows in batches; the tokenizer outputs are added as new columns.
token_dataset = dataset.map(tokenize_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [41]:
# Show the dataset summary (feature names and number of rows).
token_dataset

Dataset({
    features: ['index', 'text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 778
})

In [52]:
# Load the sequence-classification model from the selected checkpoint, passing
# the label count, problem type and label<->id maps, and asking it to return
# attention weights (OUTPUT_ATTENTION) for the visualisation below.
model = AutoModelForSequenceClassification.from_pretrained(model_path, 
                                                           num_labels=NUM_LABELS, 
                                                           problem_type=p_t,
                                                           label2id=label2id,
                                                           id2label=id2label,
                                                           output_attentions=OUTPUT_ATTENTION)

In [53]:
# Input ids of the selected example only, as a tensor (the one-row selection
# keeps a leading batch axis).
inputs = torch.tensor(token_dataset.select([idx])['input_ids'])

In [54]:
## There are two ways to prepare the input: either remove the [PAD] tokens
## first (the commented-out line below), or pass an attention mask along with
## the full padded input. This notebook uses the attention mask.

# inputs = inputs[:, inputs[0] != 0]
attention_mask = torch.tensor(token_dataset.select([idx])['attention_mask'])

In [59]:
# Convert the input ids of the (single) example back to token strings.
tokens = tokenizer.convert_ids_to_tokens(inputs[0])

# Inference only: no_grad() skips building the autograd graph, so the outputs
# carry no grad_fn and the forward pass uses less memory.
with torch.no_grad():
    outputs = model(inputs, attention_mask=attention_mask)

# Last entry of the model outputs.
# NOTE(review): presumably the per-layer attention weights, since the model was
# loaded with output_attentions enabled — confirm.
attention = outputs[-1]

In [60]:
## BertViz doesn't scale well with large attention matrices, so when the padded
## input is used (attention mask instead of stripped [PAD] tokens) the tokens
## and attention tensors are cut down to the part before the first [PAD].

# Position of the first [PAD] token, or the full length when there is no
# padding. The fallback also makes this cell safe to re-run: after the first
# run `tokens` no longer contains [PAD], and a bare .index() would raise.
N = tokens.index('[PAD]') if '[PAD]' in tokens else len(tokens)
tokens = tokens[:N]

# Keep only the first N positions on the last two axes of every attention tensor.
new_att = [att[:, :, :N, :N] for att in attention]

In [61]:
head_view(new_att, tokens)  # Display BertViz head view for the truncated attention

<IPython.core.display.Javascript object>

In [48]:
# Pass the first model output through logit_func (Sigmoid or Softmax, chosen
# from problem_type earlier) and display the result.
# NOTE(review): outputs[0] is presumably the logits — confirm.
prob = logit_func(outputs[0])
prob

tensor([[0.0030, 0.9959, 0.0156, 0.0053, 0.0031]], grad_fn=<SigmoidBackward0>)

In [49]:
# Threshold every probability at 0.5 to get a 0/1 decision per label.
# NOTE(review): a per-label threshold fits the Sigmoid (multilabel) branch; for
# the Softmax branch an argmax is presumably what is wanted — confirm.
(prob>0.5).int()

tensor([[0, 1, 0, 0, 0]], dtype=torch.int32)

In [50]:
# Label names, in the same order used to build label2id / id2label.
labels

['Det nære', 'Dyr og natur', 'Menneskeliv', 'Oplevelse', 'Ubestemmelig']