In [4]:
import torch
import numpy as np
import pickle
import os
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from tqdm import tqdm

### Load BERT Model

In [2]:
# Load the tokenizer and the TensorFlow BERT encoder from the pretrained
# 'bert-base-uncased' checkpoint. The checkpoint's NSP/MLM heads are not used by
# TFBertModel (see the log output printed below this cell).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [5]:
# Read the training-context spreadsheet. The preview below shows a `context`
# column (paragraph text), a `contextID` column and an unnamed index column.
train_context = pd.read_excel('./data/train/train_context.xlsx')
# Preview the first rows.
train_context.head()

Unnamed: 0,context,contextID
0,North Carolina consists of three main geograph...,1
1,The coastal plain transitions to the Piedmont ...,2
2,The western section of the state is part of th...,3
3,The climate of the coastal plain is influenced...,4
4,The Atlantic Ocean has less influence on the c...,5


In [6]:
# Keep only the paragraph-text column; later cells iterate over it to build embeddings.
context_para = train_context['context']
# Spot-check one paragraph (index label 1).
context_para[1]

"The coastal plain transitions to the Piedmont region along the Atlantic Seaboard fall line, a line which marks the elevation at which waterfalls first appear on streams and rivers. The Piedmont region of central North Carolina is the state's most urbanized and densely populated section. It consists of gently rolling countryside frequently broken by hills or low mountain ridges. Small, isolated, and deeply eroded mountain ranges and peaks are located in the Piedmont, including the Sauratown Mountains, Pilot Mountain, the Uwharrie Mountains, Crowder's Mountain, King's Pinnacle, the Brushy Mountains, and the South Mountains. The Piedmont ranges from about 300 to 400 feet (91 to 122 m) in elevation in the east to over 1,000 feet (300 m) in the west. Because of the rapid population growth in the Piedmont, a significant part of the rural area in this region is being transformed into suburbs with shopping centers, housing, and corporate offices. Agriculture is steadily declining in importanc

### Compute and save BERT embeddings for the SQuAD "Train context" paragraphs

In [5]:
embedding_size = 100

# Location of the cached {word: vector} dictionary for this embedding size.
bert_embed_path = './context/content_embed_{}.pkl'.format(embedding_size)

if os.path.exists(bert_embed_path):
    # Reuse the cached embeddings. A corrupt cache now raises instead of silently
    # triggering a full (and very slow) rebuild.
    bert_embeddings = pd.read_pickle(bert_embed_path)
else:
    ### Build embedding
    bert_embeddings = {}

    # NOTE(review): '/gpu:1' pins the work to the second GPU; adjust for machines
    # with a different device layout.
    with tf.device('/gpu:1'):
        for para in tqdm(context_para):
            # Whitespace tokenisation: punctuation and casing stay attached to each word.
            for word in para.split():
                if word in bert_embeddings:
                    continue
                input_ids = tf.constant(tokenizer.encode(word))[None, :]  # Batch size 1
                outputs = model(input_ids)
                # outputs[-1] is the LAST element of the model output, not the first.
                # NOTE(review): for TFBertModel this is presumably the pooled
                # output of shape (1, hidden) — confirm. Row 0 is truncated to
                # `embedding_size` values below.
                last_hidden_states = outputs[-1]
                # Store a NumPy array rather than a TF tensor so the pickle does not
                # depend on TensorFlow tensor objects.
                bert_embeddings[word] = last_hidden_states[0][:embedding_size].numpy()

    ## Saving BERT Embeddings
    # Create the target directory first so a long build is not lost at save time.
    os.makedirs(os.path.dirname(bert_embed_path), exist_ok=True)
    with open(bert_embed_path, "wb") as e:
        pickle.dump(bert_embeddings, e)

In [9]:
# Scratch check: run a short multi-word phrase through BERT and keep row 0 of the result.
input_ids = tf.constant(tokenizer.encode("ember is bad"))[None, :]  # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[-1]  # NOTE(review): [-1] selects the LAST output element (presumably the pooled output), not the first-element last hidden state — confirm
hazel = last_hidden_states[0]

In [8]:
# Load the BiDAF embedding table and the word -> row-index mapping used to address it.
# NOTE(review): rows that are all ones are treated by the merge cell as
# "no pretrained vector" sentinels — confirm against the code that wrote this file.
bidaf_embedding= pd.read_pickle('./data/train/no_random_word_embeddings.pkl')
bidaf_word2idx = pd.read_pickle('./data/train/word2idx.pkl')

### Merge Embedding
outOfVocab : Refers to words that are not found in the GloVe corpus and were therefore randomly initialized in the original BiDAF embeddings; these words were found in the BERT embeddings, so their vectors are replaced by the BERT embedding (concatenated with itself to keep the vector size consistent) <br>

not_found : Refers to words found in neither the BiDAF nor the BERT embeddings, so we had to fall back to randomly initialised vectors for these words. On further inspection, these are mostly misspelled words <br>

matched : Refers to words that appear in both the BiDAF and the BERT embeddings, so we simply concatenate the two embeddings<br>

bidaf_ed : Refers to words found only in the BiDAF embeddings; to ensure that every word embedding has the same size, we duplicate the BiDAF vector and concatenate it with itself


In [9]:
# Counters for the four merge outcomes described in the markdown above.
outOfVocab = 0
not_found = 0
matched = 0
bidaf_ed = 0

# Width of a merged vector (both halves are 100-dimensional in this notebook).
combined_dim = 200

# Words with no BERT embedding; saved later instead of being printed one by one.
words_not_found = []

# NOTE: this loop rewrites `bidaf_embedding` in place, so the cell is not idempotent.
# Reload the BiDAF pickles before running it a second time.
for key in bidaf_word2idx:
    index = bidaf_word2idx[key]
    # Explicit membership test instead of a bare `except:` so that genuine errors
    # (e.g. shape mismatches) surface instead of being counted as "not found".
    has_bert = key in bert_embeddings

    # An all-ones row marks a word without a pretrained BiDAF vector.
    if np.all(bidaf_embedding[index] == 1):
        if has_bert:
            # BERT only: duplicate the BERT vector to reach the merged width.
            bidaf_embedding[index] = np.concatenate([bert_embeddings[key], bert_embeddings[key]])
            outOfVocab += 1
        else:
            # Neither source knows the word: fall back to a small random vector.
            # NOTE(review): the draw is unseeded, so results differ between runs.
            bidaf_embedding[index] = np.random.normal(0, 0.1, combined_dim)
            not_found += 1
            words_not_found.append(key)
    else:
        if has_bert:
            # Both sources available: BiDAF half followed by BERT half.
            bidaf_embedding[index] = np.concatenate([bidaf_embedding[index], bert_embeddings[key]])
            matched += 1
        else:
            # BiDAF only: duplicate the BiDAF vector to reach the merged width.
            bidaf_embedding[index] = np.concatenate([bidaf_embedding[index], bidaf_embedding[index]])
            bidaf_ed += 1
            words_not_found.append(key)

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [10]:
# Report the four merge counters, one per line, in the order:
# outOfVocab, not_found, matched, bidaf_ed.
for merge_count in (outOfVocab, not_found, matched, bidaf_ed):
    print(merge_count)

33518
25960
32361
10782


In [11]:
# Persist the merged (BiDAF + BERT) embedding table to disk.
with open("./data/train/combined_word_embeddings.pkl", "wb") as combined_file:
    pickle.dump(bidaf_embedding, combined_file)

In [12]:
# Persist the list of words that had no entry in the BERT embeddings.
with open("./data/train/words_not_found.pkl", "wb") as missing_file:
    pickle.dump(words_not_found, missing_file)

## Experiment with adding ELMo embeddings, since BERT did not help much



In [7]:
import tensorflow_hub as hub

# NOTE: this rebinds the name `tf` to the TF1 compatibility API for every cell run
# after this one; the eager-mode BERT cells above will no longer work in the same kernel.
import tensorflow.compat.v1 as tf
tf.disable_eager_execution() ## Disable eager execution for graph creation

# Load the ELMo v2 module from TF Hub.
# NOTE(review): trainable=True is set, but no visible cell trains the module — confirm it is needed.
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

In [8]:
# Sample input for smoke-testing elmo_vectors: a list containing one single-word string.
x = ["Roasted"]

def elmo_vectors(x):
    """Run the ELMo module on `x` and return the result averaged over axis 0.

    Args:
        x: list of strings passed to the module's "default" signature.

    Returns:
        The evaluated "elmo" output of the module, mean-reduced along axis 0.

    Note:
        Each call opens a fresh session and re-runs the variable and table
        initializers before evaluating.
    """
    elmo_output = elmo(x, signature="default", as_dict=True)["elmo"]
    # Average of the ELMo features along the first axis.
    averaged = tf.reduce_mean(elmo_output, 0)

    with tf.Session() as session:
        for init_op in (tf.global_variables_initializer(), tf.tables_initializer()):
            session.run(init_op)
        return session.run(averaged)

In [9]:
# Smoke test: embed the sample input and show the first 10 values of row 0.
y = elmo_vectors(x)
y[0][:10]

I0407 01:58:24.775568 16516 saver.py:1511] Saver not created because there are no variables in the graph to restore


array([ 0.11338134, -0.22380908,  0.20500736,  0.06183614,  0.41078082,
       -0.2128304 , -0.15933694,  0.40490472,  0.13175496,  0.31573987],
      dtype=float32)

In [None]:
embedding_size = 100

# Location of the cached {word: vector} dictionary for this embedding size.
elmo_embed_path = './context/elmo_content_embed_{}.pkl'.format(embedding_size)

if os.path.exists(elmo_embed_path):
    # Reuse the cached embeddings. A corrupt cache now raises instead of silently
    # triggering a full rebuild.
    elmo_embeddings = pd.read_pickle(elmo_embed_path)
else:
    ### Build embedding
    elmo_embeddings = {}

    for para in tqdm(context_para):
        # Whitespace tokenisation: punctuation and casing stay attached to each word.
        for word in para.split():
            if word in elmo_embeddings:
                continue
            # Embed the current word itself. Passing the global sample `x` here
            # would give every word the same vector.
            # NOTE(review): elmo_vectors opens a new session per call, so this
            # loop is very slow on a large vocabulary.
            temp = elmo_vectors([word])
            elmo_embeddings[word] = temp[0][:embedding_size]

    ## Saving ELMo Embeddings
    # Create the target directory first so a long build is not lost at save time.
    os.makedirs(os.path.dirname(elmo_embed_path), exist_ok=True)
    with open(elmo_embed_path, "wb") as e:
        pickle.dump(elmo_embeddings, e)