In [4]:
import torch
import numpy as np
import pickle
import os
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from tqdm import tqdm

### Load BERT Model

In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [5]:
train_context = pd.read_excel('./data/train/train_context.xlsx')
train_context.head()

Unnamed: 0,context,contextID
0,North Carolina consists of three main geograph...,1
1,The coastal plain transitions to the Piedmont ...,2
2,The western section of the state is part of th...,3
3,The climate of the coastal plain is influenced...,4
4,The Atlantic Ocean has less influence on the c...,5


In [6]:
context_para = train_context['context']
context_para[1]

"The coastal plain transitions to the Piedmont region along the Atlantic Seaboard fall line, a line which marks the elevation at which waterfalls first appear on streams and rivers. The Piedmont region of central North Carolina is the state's most urbanized and densely populated section. It consists of gently rolling countryside frequently broken by hills or low mountain ridges. Small, isolated, and deeply eroded mountain ranges and peaks are located in the Piedmont, including the Sauratown Mountains, Pilot Mountain, the Uwharrie Mountains, Crowder's Mountain, King's Pinnacle, the Brushy Mountains, and the South Mountains. The Piedmont ranges from about 300 to 400 feet (91 to 122 m) in elevation in the east to over 1,000 feet (300 m) in the west. Because of the rapid population growth in the Piedmont, a significant part of the rural area in this region is being transformed into suburbs with shopping centers, housing, and corporate offices. Agriculture is steadily declining in importanc

### Train and save BERT embeddings done on SQuAD "Train context"

In [5]:
embedding_size = 100

try:
    bert_embeddings = pd.read_pickle('./context/content_embed_{}.pkl'.format(embedding_size))

except:
    ### Build embeddding
    bert_embeddings = {}

    with torch.no_grad():
        with tf.device('/gpu:1'):
            for para in tqdm(context_para):
                words = para.split()
                print()
                for word in words:
                    if word in bert_embeddings.keys():
                        continue
                    else:
                        input_ids = tf.constant(tokenizer.encode(word))[None, :]  # Batch size 1
                        outputs = model(input_ids)
                        last_hidden_states = outputs[-1]  # The last hidden-state is the first element of the output tuple
                        bert_embeddings[word] = last_hidden_states[0][:embedding_size]
                        
    ## Saving BERT Embeddings
    with open("./context/content_embed_{}.pkl".format(embedding_size), "wb") as e:
        pickle.dump(bert_embeddings, e)

In [9]:
input_ids = tf.constant(tokenizer.encode("ember is bad"))[None, :]  # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[-1]  # The last hidden-state is the first element of the output tuple
hazel = last_hidden_states[0]

In [8]:
bidaf_embedding= pd.read_pickle('./data/train/no_random_word_embeddings.pkl')
bidaf_word2idx = pd.read_pickle('./data/train/word2idx.pkl')

### Merge Embedding
outOfVocab : Refers to words that are not found in the GloVe corpus, thus were randomly intialized in the original BiDAF embeddings, however this words were found and replaced by the embeddings done in BERT <br>

not_found: This refers to words not found in both the BiDAF and BERT embeddings, thus we had to revert back to random initialised vectors for this words. From further inspections these are words that are misspelled <br>

matched : This refers to words that appeared in both the BiDAF as well as the BERT embeddings, thus we just concatanate the embeddings<br>

bidaf_ed : This refers to words only found in the bidaf model thus to ensure that the word embedding tensor is of the same size, we duplicate the tensor and concatanate it with itself


In [9]:
outOfVocab = 0
not_found = 0
matched = 0
bidaf_ed = 0

words_not_found = []
for key in bidaf_word2idx:
    index = bidaf_word2idx[key]

    if np.all(bidaf_embedding[index] == 1):
        try:
            bidaf_embedding[index] = np.concatenate([bert_embeddings[key],bert_embeddings[key]])
            outOfVocab += 1
        except:
            bidaf_embedding[index] = np.random.normal(0, 0.1, 200)
            print(key)
            not_found += 1
            words_not_found.append(key)
    else:
        try:
            bidaf_embedding[index] = np.concatenate([bidaf_embedding[index],bert_embeddings[key]])
            matched += 1

        except:
            bidaf_embedding[index] = np.concatenate([bidaf_embedding[index],bidaf_embedding[index]])
            bidaf_ed += 1
            print(key)
            words_not_found.append(key)

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [10]:
print(outOfVocab)
print(not_found)
print(matched)
print(bidaf_ed)

33518
25960
32361
10782


In [11]:
#Save the embedding
with open("./data/train/combined_word_embeddings.pkl", "wb") as e:
    pickle.dump(bidaf_embedding, e)

In [12]:
#Save words that cant be found in the BERT Embedding
with open("./data/train/words_not_found.pkl", "wb") as e:
    pickle.dump(words_not_found, e)

## Experiment with adding ELMO embedding since BERT did not have much help



In [7]:
import tensorflow_hub as hub

import tensorflow.compat.v1 as tf
tf.disable_eager_execution() ## Disable eager execution for graph creation

elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

In [8]:
# just a random sentence
x = ["Roasted"]

def elmo_vectors(x):
  embeddings = elmo(x, signature="default", as_dict=True)["elmo"]

  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    # return average of ELMo features
    return sess.run(tf.reduce_mean(embeddings,0))

In [9]:
y = elmo_vectors(x)
y[0][:10]

I0407 01:58:24.775568 16516 saver.py:1511] Saver not created because there are no variables in the graph to restore


array([ 0.11338134, -0.22380908,  0.20500736,  0.06183614,  0.41078082,
       -0.2128304 , -0.15933694,  0.40490472,  0.13175496,  0.31573987],
      dtype=float32)

In [21]:
embedding_size = 100

try:
    elmo_embeddings = pd.read_pickle('./context/elmo_content_embed_{}.pkl'.format(embedding_size))

except:
    ### Build embeddding
    elmo_embeddings = {}
   
    for para in tqdm(context_para):
        words = para.split()
        for word in words:
            if word in elmo_embeddings.keys():
                continue
            else:
                temp = elmo_vectors(x)
                elmo_embeddings[word] = temp[0][:embedding_size]
                  
    ## Saving BERT Embeddings
    with open("./context/elmo_content_embed_{}.pkl".format(embedding_size), "wb") as e:
        pickle.dump(elmo_embeddings, e)

  0%|          | 0/14798 [00:00<?, ?it/s]I0407 01:53:28.033949 23384 saver.py:1511] Saver not created because there are no variables in the graph to restore
  0%|          | 0/14798 [00:05<?, ?it/s]


InvalidArgumentError: Cannot colocate nodes node module_1_apply_default_14/map/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3 (defined at C:\Users\ngtze\anaconda3\lib\site-packages\tensorflow_hub\native_module.py:615) placed on device No device assignments were active during op 'module_1_apply_default_14/map/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3' creation.  and node module_1_apply_default_14/Reshape (defined at C:\Users\ngtze\anaconda3\lib\site-packages\tensorflow_hub\native_module.py:615) placed on device Device assignments active during op 'module_1_apply_default_14/Reshape' creation:
  with tf.device(/device:CPU:0): <C:\Users\ngtze\anaconda3\lib\site-packages\tensorflow\python\framework\importer.py:80>
  with tf.device(/gpu:1): <<ipython-input-19-88410196d8f3>:10> : Cannot merge devices with incompatible types: '/device:GPU:1' and '/device:CPU:0'
	 [[node module_1_apply_default_14/map/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3 (defined at C:\Users\ngtze\anaconda3\lib\site-packages\tensorflow_hub\native_module.py:615) ]]Additional information about colocations:No node-device colocations were active during op 'module_1_apply_default_14/map/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3' creation.
No device assignments were active during op 'module_1_apply_default_14/map/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3' creation.
No node-device colocations were active during op 'module_1_apply_default_14/Reshape' creation.
Device assignments active during op 'module_1_apply_default_14/Reshape' creation:
  with tf.device(/device:CPU:0): <C:\Users\ngtze\anaconda3\lib\site-packages\tensorflow\python\framework\importer.py:80>
  with tf.device(/gpu:1): <<ipython-input-19-88410196d8f3>:10>

Original stack trace for 'module_1_apply_default_14/map/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3':
  File "c:\Users\ngtze\.vscode\extensions\ms-toolsai.jupyter-2021.5.702919634\pythonFiles\vscode_datascience_helpers\..\pyvsc-run-isolated.py", line 30, in <module>
    runpy.run_path(module, run_name="__main__")
  File "C:\Users\ngtze\anaconda3\lib\runpy.py", line 265, in run_path
    return _run_module_code(code, init_globals, run_name,
  File "C:\Users\ngtze\anaconda3\lib\runpy.py", line 97, in _run_module_code
    _run_code(code, mod_globals, init_globals,
  File "C:\Users\ngtze\anaconda3\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "c:\Users\ngtze\.vscode\extensions\ms-toolsai.jupyter-2021.5.702919634\pythonFiles\vscode_datascience_helpers\kernel_prewarm_starter.py", line 31, in <module>
    runpy.run_module(module, run_name="__main__", alter_sys=False)
  File "C:\Users\ngtze\anaconda3\lib\runpy.py", line 210, in run_module
    return _run_code(code, {}, init_globals, run_name, mod_spec)
  File "C:\Users\ngtze\anaconda3\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Users\ngtze\anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\ngtze\anaconda3\lib\site-packages\traitlets\config\application.py", line 845, in launch_instance
    app.start()
  File "C:\Users\ngtze\anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 612, in start
    self.io_loop.start()
  File "C:\Users\ngtze\anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 149, in start
    self.asyncio_loop.run_forever()
  File "C:\Users\ngtze\anaconda3\lib\asyncio\base_events.py", line 570, in run_forever
    self._run_once()
  File "C:\Users\ngtze\anaconda3\lib\asyncio\base_events.py", line 1859, in _run_once
    handle._run()
  File "C:\Users\ngtze\anaconda3\lib\asyncio\events.py", line 81, in _run
    self._context.run(self._callback, *self._args)
  File "C:\Users\ngtze\anaconda3\lib\site-packages\tornado\ioloop.py", line 690, in <lambda>
    lambda f: self._run_callback(functools.partial(callback, future))
  File "C:\Users\ngtze\anaconda3\lib\site-packages\tornado\ioloop.py", line 743, in _run_callback
    ret = callback()
  File "C:\Users\ngtze\anaconda3\lib\site-packages\tornado\gen.py", line 787, in inner
    self.run()
  File "C:\Users\ngtze\anaconda3\lib\site-packages\tornado\gen.py", line 748, in run
    yielded = self.gen.send(value)
  File "C:\Users\ngtze\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 365, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "C:\Users\ngtze\anaconda3\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "C:\Users\ngtze\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 268, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "C:\Users\ngtze\anaconda3\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "C:\Users\ngtze\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 543, in execute_request
    self.do_execute(
  File "C:\Users\ngtze\anaconda3\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "C:\Users\ngtze\anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 306, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Users\ngtze\anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Users\ngtze\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2877, in run_cell
    result = self._run_cell(
  File "C:\Users\ngtze\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2923, in _run_cell
    return runner(coro)
  File "C:\Users\ngtze\anaconda3\lib\site-packages\IPython\core\async_helpers.py", line 68, in _pseudo_sync_runner
    coro.send(None)
  File "C:\Users\ngtze\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3146, in run_cell_async
    has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  File "C:\Users\ngtze\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3338, in run_ast_nodes
    if (await self.run_code(code, result,  async_=asy)):
  File "C:\Users\ngtze\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3418, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-19-88410196d8f3>", line 17, in <module>
    temp = elmo_vectors(x)
  File "<ipython-input-14-35d5d78b2e58>", line 5, in elmo_vectors
    embeddings = elmo(x, signature="default", as_dict=True)["elmo"]
  File "C:\Users\ngtze\anaconda3\lib\site-packages\tensorflow_hub\module.py", line 266, in __call__
    dict_outputs = self._impl.create_apply_graph(
  File "C:\Users\ngtze\anaconda3\lib\site-packages\tensorflow_hub\native_module.py", line 615, in create_apply_graph
    tf.compat.v1.train.import_meta_graph(
  File "C:\Users\ngtze\anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 1459, in import_meta_graph
    return _import_meta_graph_with_return_elements(meta_graph_or_file,
  File "C:\Users\ngtze\anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 1480, in _import_meta_graph_with_return_elements
    meta_graph.import_scoped_meta_graph_with_return_elements(
  File "C:\Users\ngtze\anaconda3\lib\site-packages\tensorflow\python\framework\meta_graph.py", line 799, in import_scoped_meta_graph_with_return_elements
    imported_return_elements = importer.import_graph_def(
  File "C:\Users\ngtze\anaconda3\lib\site-packages\tensorflow\python\util\deprecation.py", line 538, in new_func
    return func(*args, **kwargs)
  File "C:\Users\ngtze\anaconda3\lib\site-packages\tensorflow\python\framework\importer.py", line 400, in import_graph_def
    return _import_graph_def_internal(
  File "C:\Users\ngtze\anaconda3\lib\site-packages\tensorflow\python\framework\importer.py", line 513, in _import_graph_def_internal
    _ProcessNewOps(graph)
  File "C:\Users\ngtze\anaconda3\lib\site-packages\tensorflow\python\framework\importer.py", line 243, in _ProcessNewOps
    for new_op in graph._add_new_tf_operations(compute_devices=False):  # pylint: disable=protected-access
  File "C:\Users\ngtze\anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 3678, in _add_new_tf_operations
    new_ops = [
  File "C:\Users\ngtze\anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 3679, in <listcomp>
    self._create_op_from_tf_operation(c_op, compute_device=compute_devices)
  File "C:\Users\ngtze\anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 3561, in _create_op_from_tf_operation
    ret = Operation(c_op, self)
  File "C:\Users\ngtze\anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 1990, in __init__
    self._traceback = tf_stack.extract_stack()
