In [None]:
import pandas as pd

from segtok.segmenter import split_single
from segtok.tokenizer import split_contractions, word_tokenizer
from transformers import BertTokenizer, BertModel
import torch
import pandas as pd
import csv
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.utils import data
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics import classification_report
import tqdm
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
def chad_tokenizer(sentence):
    """Tokenize raw text into a flat list of tokens.

    Round brackets are replaced by spaces first. The text is then cut into
    sentences with ``split_single``; every sentence goes through
    ``word_tokenizer`` followed by ``split_contractions``, and the resulting
    tokens are concatenated in their original order.
    """
    without_parens = sentence.replace('(', ' ').replace(')', ' ')
    return [
        token
        for chunk in split_single(without_parens)
        for token in split_contractions(word_tokenizer(chunk))
    ]

def get_pos(sent_list, aspect_list):
    """Return the indices in ``sent_list`` occupied by ``aspect_list``.

    The whole of ``aspect_list`` must appear as one contiguous run inside
    ``sent_list``; the indices of its first such occurrence are returned.
    Matching only the first aspect token is not enough: that token may also
    occur earlier in the sentence, which would yield positions that do not
    actually cover the aspect.

    Args:
        sent_list: sequence of sentence tokens (e.g. token ids).
        aspect_list: sequence of aspect tokens to locate.

    Returns:
        List of consecutive integer positions, one per aspect token.

    Raises:
        IndexError: if ``aspect_list`` is empty.
        ValueError: if ``aspect_list`` does not occur contiguously in
            ``sent_list``.
    """
    target = list(aspect_list)
    span = len(target)
    if span == 0:
        # Same exception type the previous aspect_list[0] lookup produced.
        raise IndexError("aspect_list is empty")
    for start in range(len(sent_list) - span + 1):
        if list(sent_list[start:start + span]) == target:
            return list(range(start, start + span))
    raise ValueError("aspect_list does not occur in sent_list")

class TalkLitDataset(data.Dataset):
    """Dataset of (id, sentence token ids, aspect positions) triples.

    Every row of ``tagged_sents`` is tokenized with the MacBERTh tokenizer at
    construction time. The aspect is tokenized separately and located inside
    the sentence tokens with ``get_pos``. Rows whose aspect cannot be located
    keep the sentinel ``-1`` in place of a position list, and the number of
    such rows is printed once at the end.

    Args:
        tagged_sents: DataFrame with 'Filename', 'Caption' and 'Aspect'
            columns.
    """

    def __init__(self, tagged_sents):
        ids, sents, aspects = [], [], []  # parallel lists, one entry per row
        tokenizer = BertTokenizer.from_pretrained("emanjavacas/MacBERTh", truncation=True)
        counter = 0  # rows whose aspect could not be located in the sentence
        for _, row in tagged_sents.iterrows():
            sample_id, sentence, aspect = row['Filename'], row['Caption'], row['Aspect']
            sent_tokens = tokenizer.encode(sentence, truncation=True)
            # Drop the first and last ids that encode() puts around the aspect
            # (presumably the [CLS]/[SEP] special tokens) so only the aspect's
            # own ids are searched for.
            aspect_tokens = tokenizer.encode(aspect, truncation=True)[1:-1]
            try:
                pos_aspects = get_pos(sent_tokens, aspect_tokens)
            except (ValueError, IndexError):
                # Only lookup failures are tolerated (aspect missing from the
                # sentence, or empty after tokenization); anything else,
                # including KeyboardInterrupt, propagates.
                counter += 1
                pos_aspects = -1

            sents.append(sent_tokens)
            aspects.append(pos_aspects)
            ids.append(sample_id)

        print("{} BUGGY ASPECTS DETECTED".format(counter))

        self.ids, self.sents, self.aspects = ids, sents, aspects

    def __len__(self):
        """Number of rows that were tokenized."""
        return len(self.sents)

    def __getitem__(self, idx):
        """Return ``(id, token_ids, aspect_positions)`` for row ``idx``.

        ``aspect_positions`` is a list of ints, or ``-1`` when the aspect was
        not found in the sentence.
        """
        return self.ids[idx], self.sents[idx], self.aspects[idx]


#Load neural network (BERT)
class Net(nn.Module):
    """MacBERTh wrapper that pools hidden states around an aspect span.

    The wrapped model is only run under ``torch.no_grad()``, so this module
    acts as a fixed feature extractor.

    Args:
        device: optional device name stored on ``self.device``. When omitted,
            'cuda' is chosen if available, otherwise 'cpu'. The class no
            longer depends on a module-level ``device`` variable existing
            before it is instantiated.
    """

    def __init__(self, device=None):
        super().__init__()
        self.model = BertModel.from_pretrained("emanjavacas/MacBERTh", output_hidden_states=True)
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = device

    def forward(self, sent, aspects, context_window=5):
        """Return the mean hidden state of a window around the aspect.

        Args:
            sent: token ids of one sentence (batch size 1).
            aspects: positions of the aspect tokens inside ``sent``, or ``-1``
                when the aspect could not be located.
            context_window: number of positions of context considered on each
                side of the aspect span (default 5).

        Returns:
            A 1-D CPU tensor of length ``hidden_size``, or ``None`` when
            ``aspects`` is the ``-1`` sentinel.
        """
        if aspects == -1:
            return None

        # Send the input wherever the wrapped model currently lives, so a
        # later ``model.to(...)`` can never leave inputs on another device.
        model_device = next(self.model.parameters()).device
        sent = torch.LongTensor(sent).to(model_device)
        input_ids = sent.unsqueeze(0)  # Batch size 1
        with torch.no_grad():
            outputs = self.model(input_ids)
            last_hidden_states = outputs.last_hidden_state[0]  # (seq_len, hidden_size)

            last_index = len(last_hidden_states) - 1
            first_aspect, final_aspect = int(aspects[0]), int(aspects[-1])

            start = 0
            if first_aspect - context_window > 0:
                start = first_aspect - context_window
            end = last_index
            if final_aspect + context_window < last_index:
                end = final_aspect + context_window

            # NOTE(review): ``end`` is exclusive, so the window holds
            # ``context_window`` positions before the aspect but only
            # ``context_window - 1`` after it, and the final position is never
            # included. Kept as-is so existing feature files stay comparable;
            # confirm whether this asymmetry is intended.
            window = last_hidden_states[start:end]

            # Slicing replaces the per-row copy into a buffer with a
            # hard-coded width of 768, so any hidden size works.
            bert_embeds = window.cpu().mean(dim=0)
        return bert_embeds
    
    
# Note: forward() runs BERT under torch.no_grad(), so the model weights receive no gradient updates here; BERT is used as a frozen feature extractor rather than being fine-tuned.

NameError: name 'data' is not defined

In [18]:
if __name__ == '__main__':
    # Use the GPU when one is visible, otherwise fall back to the CPU.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print("CREATING FEATURE FILES")
    df = pd.read_csv('/home/tess/experiments/gold_aspects.csv')
    df = df.dropna()  # rows with any missing value are discarded before tokenization

    test_dataset = TalkLitDataset(df)
    # batch_size=1 and shuffle=False: one sentence per step, in file order,
    # so X_test[i] lines up with row i of df.
    test_iter = data.DataLoader(dataset=test_dataset,
                                batch_size=1,
                                shuffle=False,
                                num_workers=1)

    model = Net()
    model.to(device)

    X_test = []

    # Iterating the loader directly lets tqdm read its length and show a
    # total/ETA. The loop variable is not called ``id`` so the builtin is not
    # rebound at notebook scope.
    for sample_id, words, aspects in tqdm.tqdm(test_iter):
        bert_embeds = model(words, aspects)
        if bert_embeds is not None:
            bert_embeds = bert_embeds.cpu().numpy()
        else:
            # Placeholder keeps X_test aligned with the dataset when the
            # aspect could not be located.
            bert_embeds = "ERROR"
        X_test.append(bert_embeds)

CREATING FEATURE FILES


(…)njavacas/MacBERTh/resolve/main/vocab.txt: 100%|██████████| 227k/227k [00:00<00:00, 12.2MB/s]
(…)avacas/MacBERTh/resolve/main/config.json: 100%|██████████| 481/481 [00:00<00:00, 192kB/s]
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


0 BUGGY ASPECTS DETECTED


pytorch_model.bin: 100%|██████████| 439M/439M [00:10<00:00, 41.7MB/s] 
76752it [1:23:10, 15.38it/s]


In [19]:
# Serialize the collected X_test list to disk with pickle.
with open('outputs/features_MacBERTh.pkl', 'wb') as feature_file:
    pickle.dump(X_test, feature_file)

In [16]:
# Number of feature entries (not displayed: this is not the cell's last expression).
len(X_test)

# Show the first dataframe row, the matching dataset item (id, token ids,
# aspect positions) and the feature stored for it in X_test.
for preview in (df.iloc[0], test_dataset[0], X_test[0]):
    print(preview)

Aspect                                                     ''
Caption     '' CHAPTER III THE CUMÆAN SIBYL A part of the ...
Filename                                                    0
labels_y                                                    O
Name: 0, dtype: object
(0, [101, 112, 112, 8203, 2684, 7462, 140, 25810, 28184, 14962, 156, 27954, 3663, 2162, 138, 1226, 1104, 1103, 19863, 12355, 23901, 3153, 118, 1413, 1104, 8619, 8559, 1154, 1103, 6553, 9627, 2894, 1103, 1832, 1120, 21208, 119, 102], [1, 2])
[ 3.07906389e-01  2.07098231e-01 -3.95558566e-01  4.45858866e-01
  2.53102928e-01 -1.16396435e-01  2.16603354e-01 -9.65527147e-02
  1.36593282e-01  5.24048992e-02 -2.08192561e-02  6.31862506e-02
  3.85382250e-02  3.15549582e-01 -1.81003407e-01  7.04114418e-03
 -1.48144960e-01 -4.51527275e-02  2.08337769e-01 -3.54403816e-02
 -9.70912725e-02 -7.29035661e-02 -2.18958691e-01 -6.98462650e-02
  1.45829290e-01 -8.49292278e-02  1.06508069e-01  2.29225889e-01
 -2.20616460e-01  2.52017766e-01 