# Preparation

## Set Seed and CUDA

In [33]:
import torch
import torchdata
import torchtext
from nltk.corpus import stopwords

# Reproducibility settings: fix the PyTorch RNG seed and request
# deterministic cuDNN behaviour before anything else touches the RNG.
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Report the library versions and the selected device for the record.
print("PyTorch Version: ", torch.__version__)
print("torchtext Version: ", torchtext.__version__)
print(f"Using {'GPU' if str(DEVICE) == 'cuda' else 'CPU'}.")

PyTorch Version:  1.11.0+cu113
torchtext Version:  0.12.0
Using GPU.


# Load Dataset

In [34]:
from datasets import load_dataset

# Load the `surrey-nlp/PLOD-filtered` dataset through Hugging Face `datasets`.
# The cells below read its "train", "validation" and "test" splits.
dataset = load_dataset("surrey-nlp/PLOD-filtered")

## Data Prep

In [35]:
# Pull the three splits out of the loaded dataset, then report how many
# records each one holds (train, validation, test — in that order).
training_set, validation_set, testing_set = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)
for split in (training_set, validation_set, testing_set):
    print(len(split))


112652
24140
24140


In [36]:
# Tag names indexed by the integer ids stored in each record's `ner_tags`.
# The sample records printed in this notebook carry ids 3 and 4 on long-form
# spans and id 1 on the acronym itself, so the tag set needs five entries:
# the previous four-entry list had no name for id 4 and mislabelled id 3.
# NOTE(review): the order is assumed to match the dataset's own feature names —
# confirm against `dataset["train"].features["ner_tags"]`.
label_list = ["B-O", "B-AC", "I-AC", "B-LF", "I-LF"]

# Tag name -> integer id, derived from `label_list` so the two cannot drift apart.
labels_vocab = {label: idx for idx, label in enumerate(label_list)}

# Integer id -> tag name (inverse of `labels_vocab`).
labels_vocab_reverse = {idx: label for label, idx in labels_vocab.items()}

In [37]:
from transformers import AutoTokenizer

# Pretrained cased BERT tokenizer; `tokenize_and_align_labels` below reads this
# module-level `tokenizer` name when it is called.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


In [38]:
def calculate_word_stats(doc):
    """Return the average character length of the tokens in one record.

    Args:
        doc: Mapping with a ``'tokens'`` entry holding a sequence of strings.

    Returns:
        The mean token length as a float, or ``0.0`` when the record has no
        tokens (previously this case raised ``ZeroDivisionError``).
    """
    token_lengths = [len(word) for word in doc['tokens']]
    if not token_lengths:
        # Nothing to average; avoid dividing by zero.
        return 0.0
    return sum(token_lengths) / len(token_lengths)

# Build vocabulary

In [39]:
# Keep only the first 1000 training records as a small working sample.
# Zipping against `range(1000)` stops the iteration once the quota is reached.
subset = [record for _, record in zip(range(1000), training_set)]

In [11]:
# Collect the token list of every record in the working sample,
# then show the first one as a sanity check.
sentences = [record["tokens"] for record in subset]
print(sentences[0])

['Alternatively', ',', 'fibroblasts', 'were', 'plated', 'sparsely', 'so', 'that', 'they', 'did', 'not', 'touch', 'each', 'other', 'and', 'induced', 'into', 'quiescence', 'by', 'serum', 'starvation', 'and', 'monitored', 'after', '4', 'd', '(', 'serum', '-', 'starved', 'for', '4', 'd', '[', 'SS4', ']', ')', 'or', '7', 'd', '(', 'serum', '-', 'starved', 'for', '7', 'd', '[', 'SS7', ']', ')', '.']


In [49]:
# When False, only the first sub-token of each word keeps the word's tag;
# the remaining sub-tokens are masked with -100.
label_all_tokens = False


def tokenize_and_align_labels(set):
    """Tokenize a batch of pre-split sentences and align tags to sub-tokens.

    Relies on the module-level ``tokenizer`` and ``label_all_tokens`` names.

    Args:
        set: Batch mapping with ``"tokens"`` (one word list per example) and
            ``"ner_tags"`` (one tag-id list per example). The parameter name
            shadows the ``set`` builtin inside this function; it is kept so
            the signature stays unchanged.

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry:
        one list per example, holding a label for every produced token.
        Positions without a source word get -100, as do continuation
        sub-tokens unless ``label_all_tokens`` is true.
        NOTE(review): -100 is presumably the ignore index of the training
        loss — confirm where the loss is defined.
    """
    encoded = tokenizer(set["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for row, word_tags in enumerate(set["ner_tags"]):
        row_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            if word is None:
                # Token does not map back to any input word.
                tag = -100
            elif word != last_word or label_all_tokens:
                # First sub-token of a word, or every sub-token when enabled.
                tag = word_tags[word]
            else:
                # Continuation sub-token of the same word.
                tag = -100
            row_labels.append(tag)
            last_word = word
        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded

In [51]:
# Tokenize and label-align the train and test splits in batches
# (train first, then test).
training_set_tokenized, testing_set_tokenized = (
    split.map(tokenize_and_align_labels, batched=True)
    for split in (training_set, testing_set)
)

Map:   0%|          | 0/24140 [00:00<?, ? examples/s]

Map: 100%|██████████| 24140/24140 [00:08<00:00, 2747.40 examples/s]


In [55]:
# Display the first tokenized training record as this cell's output.
# NOTE(review): the output saved with this cell is a bare list of ints rather
# than a full record, so it looks stale — re-run the cell to refresh it.
training_set_tokenized[0]

[2,
 13,
 8,
 3,
 16,
 2,
 14,
 14,
 11,
 3,
 10,
 16,
 6,
 0,
 5,
 16,
 1,
 8,
 1,
 8,
 8,
 5,
 16,
 1,
 9,
 8,
 13,
 8,
 13,
 16,
 1,
 9,
 8,
 17,
 12,
 13,
 13,
 5,
 9,
 8,
 13,
 8,
 13,
 16,
 1,
 9,
 8,
 17,
 12,
 13,
 13,
 13]

In [35]:
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# tokenizer = Tokenizer(num_words=15000, oov_token="<unk>")
# tokenizer.fit_on_texts(sentences)
# word_index = tokenizer.word_index

# sequences = tokenizer.texts_to_sequences(sentences)

# padded = pad_sequences(sequences, padding="post")

# print(word_index)
# print(sequences)
# print(padded)

[[3816, 2, 1657, 18, 3817, 2330, 1277, 24, 349, 636, 62, 1658, 85, 106, 8, 229, 111, 3818, 19, 146, 1659, 8, 3819, 87, 95, 72, 3, 146, 9, 2331, 17, 95, 72, 15, 3820, 14, 4, 27, 176, 72, 3, 146, 9, 2331, 17, 176, 72, 15, 3821, 14, 4, 6], [42, 9, 166, 74, 497, 3, 2332, 4, 8, 147, 3822, 350, 18, 220, 38, 555, 8, 637, 9, 1660, 290, 177, 3823, 17, 3824, 38, 3825, 1278, 7, 59, 3826, 3, 3827, 4, 3828, 31, 5, 3829, 8, 3830, 1661, 6], [128, 3831, 3832, 1662, 2, 3833, 3, 1012, 4, 2, 3834, 3835, 3, 3836, 4, 8, 3837, 3838, 3, 1279, 4, 2, 1280, 2333, 31, 5, 3839, 318, 7, 3840, 2, 18, 50, 10, 39, 42, 6], [739, 2334, 7, 133, 638, 1013, 2335, 3841, 27, 1281, 2, 3842, 739, 1663, 3, 2336, 4, 2, 556, 8, 2337, 3843, 8, 3844, 229, 863, 1664, 3, 3845, 4, 15, 35, 14, 6], [557, 230, 3, 3846, 4, 7, 85, 3847, 18, 351, 23, 37, 2338, 1014, 2, 352, 351, 23, 2339, 353, 18, 2340, 1015, 318, 2, 107, 2, 96, 114, 75, 3, 157, 4, 2, 97, 7, 2341, 1282, 2, 740, 2, 1283, 7, 1284, 1665, 6], [29, 2342, 3848, 13, 354, 558, 42,

In [6]:
def calculate_tf(token_count, bow):
    """Convert raw token counts into term frequencies for one document.

    Args:
        token_count: Mapping of token -> number of occurrences in the document.
        bow: The document's tokens; only its length is used, as the divisor.

    Returns:
        A new dict mapping every key of ``token_count`` to ``count / len(bow)``.
        For an empty ``bow`` every frequency is ``0.0`` (previously this case
        raised ``ZeroDivisionError``).
    """
    total = len(bow)
    if total == 0:
        # An empty document has no term mass to distribute.
        return {token: 0.0 for token in token_count}
    return {token: count / total for token, count in token_count.items()}

def _get_tf(tokens, vocab):
    """Compute term frequencies of ``tokens`` over every entry of ``vocab``.

    Args:
        tokens: Sequence of tokens making up one document.
        vocab: Iterable of all tokens to report; each token in ``tokens`` must
            be present in it, otherwise the count update raises ``KeyError``.

    Returns:
        The result of ``calculate_tf`` for the per-token counts, i.e. one entry
        per vocabulary token (tokens absent from the document have count 0).
    """
    # Start every vocabulary entry at zero so absent tokens are still reported.
    counts = {term: 0 for term in vocab}
    for tok in tokens:
        counts[tok] = counts[tok] + 1
    return calculate_tf(counts, tokens)

# Two-document demo: take the first two training records, build their shared
# vocabulary, and print each document's term frequencies over it.
tokens1, tokens2 = (training_set[idx]["tokens"] for idx in (0, 1))
vocab = set(tokens1) | set(tokens2)

tf1, tf2 = (_get_tf(doc_tokens, vocab) for doc_tokens in (tokens1, tokens2))

print(tf1, "\n", tf2)

{'of': 0.0, 'for': 0.038461538461538464, 'touch': 0.019230769230769232, '7': 0.038461538461538464, 'using': 0.0, 'treatment': 0.0, ']': 0.038461538461538464, '1st': 0.0, 'starved': 0.038461538461538464, 'Study': 0.0, 'controlling': 0.0, 'that': 0.019230769230769232, 'starvation': 0.019230769230769232, ')': 0.038461538461538464, 'differences': 0.0, 'SS7': 0.019230769230769232, '99th': 0.0, 'after': 0.019230769230769232, '4': 0.038461538461538464, 'mean': 0.0, 'BW': 0.0, 'IPTW': 0.0, ',': 0.019230769230769232, 'calculated': 0.0, 'regression': 0.0, 'at': 0.0, '-': 0.038461538461538464, 'did': 0.019230769230769232, 'so': 0.019230769230769232, 'ratios': 0.0, 'monitored': 0.019230769230769232, 'were': 0.019230769230769232, 'probability': 0.0, 'percentiles': 0.0, '.': 0.019230769230769232, 'they': 0.019230769230769232, 'by': 0.019230769230769232, '(': 0.038461538461538464, 'and': 0.038461538461538464, 'RRs': 0.0, 'truncated': 0.0, 'd': 0.07692307692307693, 'into': 0.019230769230769232, 'spars

In [7]:
# Print the first five training records to inspect their fields.
for _, record in zip(range(5), training_set):
    print(record)

{'id': '0', 'tokens': ['Alternatively', ',', 'fibroblasts', 'were', 'plated', 'sparsely', 'so', 'that', 'they', 'did', 'not', 'touch', 'each', 'other', 'and', 'induced', 'into', 'quiescence', 'by', 'serum', 'starvation', 'and', 'monitored', 'after', '4', 'd', '(', 'serum', '-', 'starved', 'for', '4', 'd', '[', 'SS4', ']', ')', 'or', '7', 'd', '(', 'serum', '-', 'starved', 'for', '7', 'd', '[', 'SS7', ']', ')', '.'], 'pos_tags': [2, 13, 8, 3, 16, 2, 14, 14, 11, 3, 10, 16, 6, 0, 5, 16, 1, 8, 1, 8, 8, 5, 16, 1, 9, 8, 13, 8, 13, 16, 1, 9, 8, 17, 12, 13, 13, 5, 9, 8, 13, 8, 13, 16, 1, 9, 8, 17, 12, 13, 13, 13], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 4, 4, 0, 1, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 4, 4, 0, 1, 0, 0, 0]}
{'id': '1', 'tokens': ['Study', '-', 'specific', 'risk', 'ratios', '(', 'RRs', ')', 'and', 'mean', 'BW', 'differences', 'were', 'calculated', 'using', 'linear', 'and', 'log', '-', 'binomial', 'regression', 'models', 

In [8]:
import math
import threading

def calculate_idf(docs):
    """Compute inverse document frequency for every word seen in ``docs``.

    Args:
        docs: Sequence of per-document mappings ``word -> value`` (for example
            term frequencies); a word counts as present in a document when its
            value is greater than 0.

    Returns:
        Dict mapping each word to ``log(N / df)``, where ``N`` is the number of
        documents and ``df`` the number of documents containing the word.
        Words that never occur with a positive value get ``0.0``, and an empty
        ``docs`` yields ``{}``.
    """
    n_docs = len(docs)
    if n_docs == 0:
        # No documents: nothing to score (previously IndexError on docs[0]).
        return {}

    # Collect words from every document, not just the first one, so a word
    # missing from docs[0] no longer raises KeyError.
    doc_freq = {}
    for doc in docs:
        for word, val in doc.items():
            doc_freq[word] = doc_freq.get(word, 0) + (1 if val > 0 else 0)

    # A document frequency of 0 would divide by zero; report 0.0 instead.
    return {
        word: math.log(n_docs / df) if df > 0 else 0.0
        for word, df in doc_freq.items()
    }



In [13]:
# subset = []
# for i, d in enumerate(training_set):
#     if (i < 1000):
#         subset.append(d)
#     else:
#         break
# Gather the token list of every training record (the full split, not the
# 1000-record sample).
tokens = []
for record in training_set:
    tokens.append(record["tokens"])

print("Created tokens")

Created tokens


In [14]:
# Vocabulary = every distinct token across all documents.
# `update` grows the set in place; the previous `union_tokens.union(...)`
# rebuilt the whole, ever-growing set on each iteration.
union_tokens = set()

for doc_tokens in tokens:
    union_tokens.update(doc_tokens)

print("Created union")


Created union


In [15]:

# One term-frequency dict per document, computed once.
# The previous version also started one thread per document whose results were
# thrown away (and `thread.join` was never actually called), so the same work
# was done twice while all the intermediate dicts were alive at the same time.
# NOTE: `_get_tf` returns a dict with an entry for every vocabulary token, so
# memory still grows with len(tokens) * len(union_tokens); run this on a
# smaller slice of `tokens` if the full corpus does not fit in memory.
tfs = [_get_tf(tokenList, union_tokens) for tokenList in tokens]

print("Created tfs")

MemoryError: 

In [None]:

# Inverse document frequency for each word, computed from the per-document
# term-frequency dicts in `tfs`, then printed in full.
idfs = calculate_idf(tfs)
print(idfs)



# Experiment 1 (Model)
HMM vs BERT

## HMM
The following is the implementation of an HMM model.

## BERT

The following is the implementation of the BERT model.

### Dependencies

In [None]:
# Install dependencies
%pip install torch==1.11.0+cu113 torchdata==0.3.0 torchtext==0.12.0 -f https://download.pytorch.org/whl/cu113/torch_stable.html
%pip install ipywidgets transformers tqdm

### Tokenizer

In [None]:
from transformers import BertTokenizer
# Uncased BERT tokenizer used by the torchtext pipeline built below.
# NOTE(review): this rebinds the module-level `tokenizer`, which earlier held
# the "bert-base-cased" AutoTokenizer read by `tokenize_and_align_labels`;
# calling that function after this cell will use the uncased tokenizer instead.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
from torchtext.vocab import vocab
from torchtext.data.utils import get_tokenizer

class TransformerTokenizer(torch.nn.Module):
    """Module wrapper that exposes a tokenizer's ``tokenize`` as ``forward``.

    Wrapping the tokenizer in a ``torch.nn.Module`` lets it be used as a stage
    of the ``T.Sequential`` text pipeline.
    """

    def __init__(self, tokenizer):
        """Store the tokenizer; it must provide a ``tokenize(text)`` method."""
        super().__init__()
        self.tokenizer = tokenizer

    def forward(self, input):
        """Tokenize a single string or a list of strings.

        Args:
            input: A ``str``, or a ``list`` whose items are each passed to
                ``tokenize``.

        Returns:
            The tokenizer's result for a string, or a list with one such
            result per item for a list.

        Raises:
            ValueError: If ``input`` is neither a ``str`` nor a ``list``.
        """
        if isinstance(input, str):
            return self.tokenizer.tokenize(input)
        if isinstance(input, list):
            return [self.tokenizer.tokenize(text) for text in input]
        raise ValueError(f"Type {type(input)} is not supported.")
        
# Wrap the tokenizer's vocabulary mapping in a torchtext vocab object so the
# pipeline below can look up "[CLS]", "[SEP]" and "[PAD]" and convert tokens.
# NOTE(review): min_freq=0 presumably keeps every entry because the mapping's
# values are token ids (starting at 0) rather than frequencies — TODO confirm.
tokenizer_vocab = vocab(tokenizer.vocab, min_freq=0)

### Text Processing Pipeline

In [None]:
import torchtext.transforms as T
# Maximum input length registered on the tokenizer for 'bert-base-uncased'.
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']

# Text pipeline, applied in order. The truncation length leaves room for the
# two special tokens that the following stages add.
text_transform = T.Sequential(
    TransformerTokenizer(tokenizer),  # Split raw text into tokens
    T.VocabTransform(tokenizer_vocab),  # Map tokens to vocab IDs
    T.Truncate(max_input_length - 2),  # Truncate, reserving 2 slots for [CLS]/[SEP]
    T.AddToken(token=tokenizer_vocab["[CLS]"], begin=True),  # Prepend the [CLS] ID
    T.AddToken(token=tokenizer_vocab["[SEP]"], begin=False),  # Append the [SEP] ID
    T.ToTensor(padding_value=tokenizer_vocab["[PAD]"]),  # Convert to tensor, padding with the [PAD] ID
)

### Label Processing Pipeline

In [None]:
from collections import OrderedDict

# Label vocabulary with the two classes "neg" and "pos".
# NOTE(review): these labels do not match the tag names in `label_list`
# defined earlier in this notebook ("B-O", "B-AC", ...) — confirm which label
# set this experiment is meant to use.
label_vocab = vocab(OrderedDict([("neg", 1), ("pos", 1)]))
# Label pipeline: label name -> integer index -> tensor.
label_transform = T.Sequential(
    T.LabelToIndex(label_vocab.get_itos()),  # Convert to integer
    T.ToTensor(),  # Convert to tensor
)

### Build The BERT Model

In [None]:
from transformers import BertTokenizer, BertModel

# Pretrained uncased BERT encoder; passed to BERTGRUSentiment below, which
# calls it inside `torch.no_grad()`.
bert = BertModel.from_pretrained('bert-base-uncased')
import torch.nn as nn

class BERTGRUSentiment(nn.Module):
    """GRU head on top of a BERT encoder, producing one output per sequence.

    The encoder is evaluated under ``torch.no_grad()``; only the GRU and the
    final linear layer receive gradients from ``forward``.

    Args:
        bert: Encoder module. Must expose ``config.to_dict()['hidden_size']``
            and, when called on ``text``, return a sequence whose first item is
            the per-token embeddings fed to the GRU.
        hidden_dim: Hidden size of the GRU.
        output_dim: Number of output features of the final linear layer.
        n_layers: Number of stacked GRU layers.
        bidirectional: Whether the GRU runs in both directions.
        dropout: Dropout probability applied to the final hidden state and,
            when there is more than one GRU layer, between GRU layers.
    """

    def __init__(self, bert, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        self.bert = bert
        self.embedding_dim = bert.config.to_dict()['hidden_size']

        # nn.GRU applies dropout between stacked layers, so it only makes sense
        # with at least two layers. The earlier `n_layers > 2` check disabled it
        # for a 2-layer GRU and also overwrote the value used for the output
        # dropout below, so no dropout was applied at all in that setup.
        rnn_dropout = dropout if n_layers > 1 else 0
        self.rnn = torch.nn.GRU(
            input_size=self.embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=rnn_dropout,
            batch_first=True,
            bidirectional=bidirectional,
        )

        # A bidirectional GRU contributes one final state per direction.
        self.out = torch.nn.Linear(
            in_features=hidden_dim * 2 if bidirectional else hidden_dim,
            out_features=output_dim,
        )

        # Output dropout always uses the requested probability.
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, text):
        """Encode ``text`` and return the linear layer's output.

        Args:
            text: Input passed straight to the encoder.
                NOTE(review): presumably a (batch, seq_len) tensor of token
                ids — confirm against the data pipeline.

        Returns:
            The output of the final linear layer applied to the (dropped-out)
            last GRU hidden state(s).
        """
        # The encoder is used as a fixed feature extractor.
        with torch.no_grad():
            embedded = self.bert(text)[0]

        _, hidden = self.rnn(embedded)

        if self.rnn.bidirectional:
            # Concatenate the last two hidden-state slices (one per direction).
            hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        else:
            hidden = self.dropout(hidden[-1, :, :])

        return self.out(hidden)

### Define an Instance of the Model

In [None]:
# Hyperparameters of the GRU head placed on top of BERT.
HIDDEN_DIM = 256
OUTPUT_DIM = 1  # We only need one neuron as output
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25

model = BERTGRUSentiment(
    bert,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
)

# Freeze the encoder: every parameter whose name starts with 'bert' stops
# requiring gradients, so only the remaining parameters stay trainable.
bert_params = (
    param for name, param in model.named_parameters() if name.startswith('bert')
)
for param in bert_params:
    param.requires_grad = False


def count_parameters(model):
    """Return the total number of elements across trainable parameters.

    Args:
        model: Object whose ``parameters()`` yields items exposing
            ``requires_grad`` and ``numel()``.

    Returns:
        Sum of ``numel()`` over the parameters with ``requires_grad`` set.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report how many parameters still require gradients after the freeze above,
# formatted with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

## Training

## Evaluation

# Experiment 2 (Hyperparameters)

# Experiment 3 (Loss Functions)

# Experiment 4 (Tokenization)