# Preparation

## Set Seed and CUDA

In [3]:
import torch
import torchtext

# Reproducibility and device selection, done once so later cells share them.
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# Report the library versions this notebook was executed with.
print("PyTorch Version: ", torch.__version__)
print("torchtext Version: ", torchtext.__version__)
device_label = 'GPU' if DEVICE.type == 'cuda' else 'CPU'
print(f"Using {device_label}.")

  from .autonotebook import tqdm as notebook_tqdm


PyTorch Version:  1.11.0+cu113
torchtext Version:  0.12.0
Using GPU.


# Load Dataset

In [4]:
from datasets import load_dataset

# Load the "surrey-nlp/PLOD-filtered" dataset; later cells index the result by
# split name ("train", "validation", "test").
dataset = load_dataset("surrey-nlp/PLOD-filtered")

## Data Prep

In [5]:
# Pull out the three splits, then report how many records each one holds.
training_set = dataset["train"]
validation_set = dataset["validation"]
testing_set = dataset["test"]

for split in (training_set, validation_set, testing_set):
    print(len(split))


112652
24140
24140


In [6]:
def calculate_word_stats(doc):
    """Return the mean character length of the tokens in one record.

    Args:
        doc: Mapping with a ``'tokens'`` entry holding a sequence of strings.

    Returns:
        The average ``len(token)`` as a float, or ``0.0`` when the record
        contains no tokens (avoids dividing by zero).
    """
    token_lengths = [len(word) for word in doc['tokens']]
    if not token_lengths:
        # An empty record has no meaningful average; report 0.0 instead of raising.
        return 0.0
    return sum(token_lengths) / len(token_lengths)

In [7]:
from nltk import FreqDist

# Case-insensitive frequency distribution over the tokens of training record 4.
freqDist = FreqDist(w.lower() for w in training_set[4]['tokens'])


In [8]:
def calculate_tf(token_count, bow):
    """Convert raw token counts into term frequencies for one document.

    Args:
        token_count: Mapping of token -> number of occurrences in the document.
        bow: The document's tokens (bag of words); only its length is used,
            as the normalising denominator.

    Returns:
        A new dict mapping every key of ``token_count`` to ``count / len(bow)``.
        When ``bow`` is empty every frequency is ``0.0`` (avoids dividing by zero).
    """
    num_bow = float(len(bow))
    if num_bow == 0:
        # No tokens in the document: every term frequency is zero by definition.
        return {token: 0.0 for token in token_count}
    return {token: count / num_bow for token, count in token_count.items()}

def _get_tf(tokens, vocab):
    """Compute term frequencies of ``tokens`` over every term in ``vocab``.

    Each vocabulary term starts at a count of zero, each occurrence in
    ``tokens`` increments its term, and the counts are then normalised by
    ``calculate_tf`` using ``tokens`` as the bag of words. A token that is
    not in ``vocab`` raises ``KeyError``.
    """
    occurrences = {term: 0 for term in vocab}
    for term in tokens:
        occurrences[term] = occurrences[term] + 1
    return calculate_tf(occurrences, tokens)

# Worked example: term frequencies of the first two training records,
# computed over the vocabulary the two records share.
tokens1, tokens2 = (training_set[idx]["tokens"] for idx in (0, 1))
vocab = set(tokens1).union(set(tokens2))

tf1, tf2 = (_get_tf(doc_tokens, vocab) for doc_tokens in (tokens1, tokens2))

print(tf1, "\n", tf2)

{'other': 0.019230769230769232, 'models': 0.0, 'log': 0.0, 'that': 0.019230769230769232, 'percentiles': 0.0, 'did': 0.019230769230769232, 'plated': 0.019230769230769232, 'by': 0.019230769230769232, 'or': 0.019230769230769232, '.': 0.019230769230769232, 'into': 0.019230769230769232, 'sparsely': 0.019230769230769232, 'of': 0.0, 'monitored': 0.019230769230769232, 'linear': 0.0, '4': 0.038461538461538464, 'SS4': 0.019230769230769232, '99th': 0.0, 'for': 0.038461538461538464, 'specific': 0.0, '(': 0.038461538461538464, 'treatment': 0.0, '7': 0.038461538461538464, 'ratios': 0.0, 'so': 0.019230769230769232, 'Alternatively': 0.019230769230769232, 'starvation': 0.019230769230769232, 'probability': 0.0, '[': 0.038461538461538464, 'regression': 0.0, 'fibroblasts': 0.019230769230769232, 'after': 0.019230769230769232, 'IPTW': 0.0, 'BW': 0.0, 'induced': 0.019230769230769232, 'mean': 0.0, 'differences': 0.0, 'RRs': 0.0, 'controlling': 0.0, 'serum': 0.057692307692307696, 'risk': 0.0, '-': 0.0384615384

In [9]:
# Peek at the first five training records.
for i, d in enumerate(training_set):
    if i >= 5:
        break
    print(d)

{'id': '0', 'tokens': ['Alternatively', ',', 'fibroblasts', 'were', 'plated', 'sparsely', 'so', 'that', 'they', 'did', 'not', 'touch', 'each', 'other', 'and', 'induced', 'into', 'quiescence', 'by', 'serum', 'starvation', 'and', 'monitored', 'after', '4', 'd', '(', 'serum', '-', 'starved', 'for', '4', 'd', '[', 'SS4', ']', ')', 'or', '7', 'd', '(', 'serum', '-', 'starved', 'for', '7', 'd', '[', 'SS7', ']', ')', '.'], 'pos_tags': [2, 13, 8, 3, 16, 2, 14, 14, 11, 3, 10, 16, 6, 0, 5, 16, 1, 8, 1, 8, 8, 5, 16, 1, 9, 8, 13, 8, 13, 16, 1, 9, 8, 17, 12, 13, 13, 5, 9, 8, 13, 8, 13, 16, 1, 9, 8, 17, 12, 13, 13, 13], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 4, 4, 0, 1, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 4, 4, 0, 1, 0, 0, 0]}
{'id': '1', 'tokens': ['Study', '-', 'specific', 'risk', 'ratios', '(', 'RRs', ')', 'and', 'mean', 'BW', 'differences', 'were', 'calculated', 'using', 'linear', 'and', 'log', '-', 'binomial', 'regression', 'models', 

In [1]:
import math
import threading

def calculate_idf(docs):
    """Compute inverse document frequency for a collection of TF dicts.

    Args:
        docs: Sequence of mappings term -> term frequency, one per document.
            A term counts as present in a document when its value is > 0.

    Returns:
        Dict mapping term -> ``log(N / df)`` where ``N`` is ``len(docs)`` and
        ``df`` is the number of documents containing the term. The dict holds
        every key of the first document plus any term seen with a positive
        value in a later document. Terms that occur in no document get ``0.0``
        (avoids dividing by zero). An empty ``docs`` yields ``{}``.
    """
    N = len(docs)
    if N == 0:
        return {}

    # Seed with the first document's keys so terms with zero occurrences are
    # still reported; terms only present in later documents are added on demand.
    doc_freq = dict.fromkeys(docs[0].keys(), 0)

    for doc in docs:
        for word, val in doc.items():
            if val > 0:
                doc_freq[word] = doc_freq.get(word, 0) + 1

    return {
        word: math.log(N / float(count)) if count else 0.0
        for word, count in doc_freq.items()
    }



In [14]:
# Keep at most the first 1000 training records, then extract their token lists.
subset = []
for i, d in enumerate(training_set):
    if i >= 1000:
        break
    subset.append(d)
tokens = [entry["tokens"] for entry in subset]

print("Created tokens")

Created tokens


In [15]:
# Vocabulary of the subset: every distinct token across all token lists.
# A single multi-argument union avoids rebuilding the whole set once per
# document, which the repeated `a = a.union(b)` pattern does.
union_tokens = set().union(*tokens)

print("Created union")


Created union


In [16]:

# Term-frequency dict for every document in the subset, over the shared vocabulary.
# This is computed in a single sequential pass: worker threads cannot hand back
# `_get_tf`'s return value through `Thread(target=...)`, so spawning one thread
# per document only repeats this work without contributing to `tfs`.
tfs = [_get_tf(tokenList, union_tokens) for tokenList in tokens]

print("Created tfs")

In [18]:

# Inverse document frequency over the per-document TF dicts built above.
# NOTE(review): this prints the full idf dict, one entry per vocabulary term —
# consider showing only a sample.
idfs = calculate_idf(tfs)
print(idfs)

Created tfs


# Experiment 1 (Model)
HMM vs BERT

## HMM
The following is the implementation of a Hidden Markov Model (HMM).

## BERT

The following is the implementation of the BERT model.

### Dependencies

In [None]:
# Install dependencies
# The torch and torchtext pins match the versions reported by the setup cell at
# the top of the notebook (1.11.0+cu113 and 0.12.0).
# NOTE(review): this cell appears after cells that already import torch; on a
# fresh environment it has to be run before them.
%pip install torch==1.11.0+cu113 torchdata==0.3.0 torchtext==0.12.0 -f https://download.pytorch.org/whl/cu113/torch_stable.html
# NOTE(review): ipywidgets, transformers and tqdm are unpinned — consider pinning.
%pip install ipywidgets transformers tqdm

### Tokenizer

In [None]:
from transformers import BertTokenizer
# Pretrained tokenizer for the 'bert-base-uncased' checkpoint; the BERT model
# loaded further down uses the same checkpoint name.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
from torchtext.vocab import vocab
from torchtext.data.utils import get_tokenizer

class TransformerTokenizer(torch.nn.Module):
    """Module wrapper exposing a tokenizer's ``tokenize`` method as a callable step.

    The wrapped object only needs a ``tokenize(text)`` method.
    """

    def __init__(self, tokenizer):
        super().__init__()
        self.tokenizer = tokenizer

    def forward(self, input):
        """Tokenize a single string or each string of a list.

        Returns the tokenizer's output for a ``str``, or a list with one
        tokenizer output per element for a ``list``.

        Raises:
            ValueError: if ``input`` is neither a ``list`` nor a ``str``.
        """
        if isinstance(input, str):
            return self.tokenizer.tokenize(input)
        if isinstance(input, list):
            return [self.tokenizer.tokenize(text) for text in input]
        raise ValueError(f"Type {type(input)} is not supported.")
        
# Wrap the BERT tokenizer's vocabulary mapping in a torchtext vocab object so it
# can drive `VocabTransform` and special-token lookups in the text pipeline.
# min_freq=0 keeps entries whose mapped value is 0.
# NOTE(review): presumably the resulting indices follow the ordering of
# `tokenizer.vocab` and therefore match BERT's token ids — TODO confirm.
tokenizer_vocab = vocab(tokenizer.vocab, min_freq=0)

### Text Processing Pipeline

In [None]:
import torchtext.transforms as T
# Maximum sequence length the 'bert-base-uncased' tokenizer reports for its model.
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']

# Raw text -> padded tensor of vocabulary ids, with [CLS]/[SEP] wrapped around
# each sequence. Truncation reserves two positions for those special tokens.
text_transform = T.Sequential(
    TransformerTokenizer(tokenizer),  # Tokenize
    T.VocabTransform(tokenizer_vocab),  # Convert to vocab IDs
    T.Truncate(max_input_length - 2),  # Cut to max length
    T.AddToken(token=tokenizer_vocab["[CLS]"], begin=True),  # BOS token
    T.AddToken(token=tokenizer_vocab["[SEP]"], begin=False),  # EOS token
    T.ToTensor(padding_value=tokenizer_vocab["[PAD]"]),  # Convert to tensor and pad
)

### Label Processing Pipeline

In [None]:
from collections import OrderedDict

# Two-class label vocabulary ("neg", "pos") and the transform mapping a label
# name to its integer index as a tensor.
# NOTE(review): the dataset records printed earlier carry per-token `ner_tags`,
# not neg/pos labels — confirm this label pipeline matches the intended task.
label_vocab = vocab(OrderedDict([("neg", 1), ("pos", 1)]))
label_transform = T.Sequential(
    T.LabelToIndex(label_vocab.get_itos()),  # Convert to integer
    T.ToTensor(),  # Convert to tensor
)

### Build The BERT Model

In [None]:
from transformers import BertTokenizer, BertModel

# Pretrained BERT encoder, passed to BERTGRUSentiment below as its `bert` argument.
bert = BertModel.from_pretrained('bert-base-uncased')
import torch.nn as nn

class BERTGRUSentiment(nn.Module):
    """BERT encoder feeding a GRU whose final hidden state goes through a linear head.

    Args:
        bert: Encoder module; ``bert.config.to_dict()['hidden_size']`` gives the
            GRU input size, and ``bert(text)[0]`` must be the per-token
            embedding tensor.
        hidden_dim: GRU hidden size.
        output_dim: Number of output features of the linear head.
        n_layers: Number of stacked GRU layers.
        bidirectional: Whether the GRU runs in both directions.
        dropout: Dropout probability, applied between GRU layers (when there is
            more than one layer) and to the final hidden state before the head.
    """

    def __init__(self, bert, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        self.bert = bert
        self.embedding_dim = bert.config.to_dict()['hidden_size']

        # Inter-layer GRU dropout only exists with at least two stacked layers.
        # Use a separate local so the output dropout below keeps the requested
        # probability regardless of the layer count.
        rnn_dropout = dropout if n_layers > 1 else 0
        self.rnn = torch.nn.GRU(
            input_size=self.embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=rnn_dropout,
            batch_first=True,
            bidirectional=bidirectional,
        )

        # A bidirectional GRU yields one final hidden state per direction,
        # which forward() concatenates.
        self.out = torch.nn.Linear(
            in_features=hidden_dim * 2 if bidirectional else hidden_dim,
            out_features=output_dim,
        )

        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, text):
        """Run the encoder, the GRU and the linear head.

        Args:
            text: Input passed straight to ``self.bert`` — assumed to be a
                (batch, seq_len) tensor of token ids; TODO confirm with caller.

        Returns:
            Output of the linear head applied to the (dropout-regularised)
            final GRU hidden state.
        """
        # The encoder is used as a fixed feature extractor: no gradients flow into it.
        with torch.no_grad():
            embedded = self.bert(text)[0]

        _, hidden = self.rnn(embedded)

        if self.rnn.bidirectional:
            # Last two entries are the top layer's forward and backward states.
            hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        else:
            hidden = self.dropout(hidden[-1, :, :])

        return self.out(hidden)

### Define an Instance of the Model

In [None]:
# Hyperparameters for the GRU head placed on top of BERT.
HIDDEN_DIM = 256
OUTPUT_DIM = 1  # We only need one neuron as output
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25

model = BERTGRUSentiment(
    bert,
    HIDDEN_DIM,
    OUTPUT_DIM,
    N_LAYERS,
    BIDIRECTIONAL,
    DROPOUT,
)

# Freeze every parameter belonging to the BERT encoder so that only the
# remaining parameters stay trainable.
for name, param in model.named_parameters():
    if not name.startswith('bert'):
        continue
    param.requires_grad = False


def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters.

    Only parameters with ``requires_grad`` set are counted.
    """
    total = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            total += tensor.numel()
    return total

# Report how many parameters remain trainable after freezing the BERT weights.
print(f'The model has {count_parameters(model):,} trainable parameters')

## Training

## Evaluation

# Experiment 2 (Hyperparameters)

# Experiment 3 (Loss Functions)

# Experiment 4 (Tokenization)