# Generate ELMo embeddings for Research Project 32934

Run this notebook only after mounting Google Drive, since all dataset and output paths point into the mounted drive.

The paths to the datasets and save location can be specified in the cell below.

This notebook saves the entire train and test DataFrames, each with an added `embeds` column, as pickle files.

In [0]:
# Input datasets. Each file is opened by read_json_as_df and parsed one JSON
# document per line.
PATH_TO_TRAIN_SET = '/content/drive/My Drive/temp_datasets/clpsych16-train.json'
PATH_TO_TEST_SET = '/content/drive/My Drive/temp_datasets/clpsych16-test.json'

# Output locations for the pickled train/test DataFrames written in the last cell.
PATH_TO_SAVE_TRAIN_EMBEDS = '/content/drive/My Drive/temp_datasets/clpsych16_train_elmo_embeds.pkl'
PATH_TO_SAVE_TEST_EMBEDS = '/content/drive/My Drive/temp_datasets/clpsych16_test_elmo_embeds.pkl'

In [0]:
# Print the GPU status table so it is visible which accelerator (if any) is attached.
!nvidia-smi

Tue Jun  9 04:54:27 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [0]:
!nohup pip install tensorflow-gpu==1.15.0 

nohup: ignoring input and appending output to 'nohup.out'


In [0]:
import tensorflow_hub as hub
import tensorflow as tf
# Raise TensorFlow's logging threshold to ERROR to keep cell output readable.
tf.logging.set_verbosity(tf.logging.ERROR)

In [0]:
# Load the ELMo v3 module from TensorFlow Hub; get_elmo_vectors calls this object.
# NOTE(review): trainable=True is requested although no cell in this notebook
# trains the module -- confirm whether it is actually needed.
model = hub.Module("https://tfhub.dev/google/elmo/3", trainable=True)

from tqdm import tqdm

def chunks(lst, n):
    """Lazily yield consecutive slices of ``lst`` that are ``n`` items long.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``.

    Args:
        lst: Any sliceable sequence that supports ``len()``.
        n: Slice length, used as the step of the underlying ``range``.

    Yields:
        ``lst[start:start + n]`` for ``start`` = 0, n, 2n, ...
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

In [0]:
import json
from bs4 import BeautifulSoup
from pprint import pprint
import pandas as pd
import re


def clean_text(raw_text: str):
    """Remove HTML markup from a post body and flatten it onto one line.

    Args:
        raw_text: Post body, possibly containing HTML, or ``None``.

    Returns:
        ``''`` when ``raw_text`` is ``None``; otherwise the text extracted by
        BeautifulSoup's ``html.parser`` with every newline and non-breaking
        space (``\\xa0``) replaced by a regular space.
    """
    if raw_text is not None:
        plain = BeautifulSoup(raw_text, features="html.parser").get_text()
        # Newlines and non-breaking spaces both become ordinary spaces.
        for unwanted in ('\n', '\xa0'):
            plain = plain.replace(unwanted, ' ')
        return plain

    return ''


def read_json_as_df(path: str) -> pd.DataFrame:
    """Load a file holding one JSON document per line into a DataFrame.

    For each line, the cleaned ``post.body`` (via ``clean_text``; a missing
    body is passed on as ``None``) and the ``priority`` value are kept.

    Args:
        path: Location of the UTF-8 encoded input file.

    Returns:
        A DataFrame with the columns ``text`` and ``priority``, one row per
        input line.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        records = (json.loads(raw_line) for raw_line in handle)
        rows = [
            [clean_text(record['post'].get('body', None)), record['priority']]
            for record in records
        ]

    return pd.DataFrame(data=rows, columns=('text', 'priority'))

# Parse both splits into DataFrames with 'text' and 'priority' columns.
train_df = read_json_as_df(PATH_TO_TRAIN_SET)
test_df = read_json_as_df(PATH_TO_TEST_SET)

In [0]:
def _drop_blank_posts(df):
    """Strip surrounding whitespace from ``df.text`` and drop rows left empty.

    Args:
        df: DataFrame with a string ``text`` column.

    Returns:
        A new DataFrame holding only the rows whose stripped text is
        non-empty. The input frame is not modified and the surviving rows
        keep their original index labels.
    """
    stripped = df.assign(text=df.text.str.strip())
    return stripped[stripped.text != '']


# Both splits get identical cleaning. Previously the strip was disabled for the
# test split, so whitespace-only test posts slipped past the emptiness filter
# and test text was cleaned differently from train text.
train_df = _drop_blank_posts(train_df)
test_df = _drop_blank_posts(test_df)

In [0]:
def get_elmo_vectors(x):
  """Embed one batch of texts with the ELMo module and average along axis 1.

  Args:
    x: Object exposing ``tolist()``; the callers in this notebook pass slices
      of ``train_df.text.values`` / ``test_df.text.values``.

  Returns:
    The evaluated result of ``tf.reduce_mean(embeddings, 1)`` for the batch.
    Presumably one fixed-size vector per input text -- TODO confirm the shape
    against the module documentation.
  """
  # Request the module's "elmo" output for this batch.
  # NOTE(review): this runs on every call, so each batch likely adds new ops to
  # the default graph -- confirm, and consider building the ops once.
  embeddings = model(x.tolist(), signature="default", as_dict=True)["elmo"]

  # A fresh session is opened, and both initializers re-run, for every batch.
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    # return average of ELMo features
    # NOTE(review): no masking is visible here; if shorter texts are padded to
    # the longest text in the batch, padded positions would be averaged in --
    # confirm, or consider the module's own pooled "default" output.
    return sess.run(tf.reduce_mean(embeddings,1))

In [0]:
# Number of texts sent through ELMo per call to get_elmo_vectors.
BATCH_SIZE = 20

# Split each split's texts into consecutive batches with the chunks() helper
# instead of repeating the slicing logic and the batch size inline.
list_train = list(chunks(train_df.text.values, BATCH_SIZE))
list_test = list(chunks(test_df.text.values, BATCH_SIZE))

In [0]:
# Embed the training batches one at a time; tqdm reports progress.
elmo_train = [get_elmo_vectors(x) for x in tqdm(list_train)]

100%|██████████| 48/48 [05:14<00:00,  6.55s/it]


In [0]:
# Embed the test batches one at a time; tqdm reports progress.
elmo_test = [get_elmo_vectors(x) for x in tqdm(list_test)]

100%|██████████| 12/12 [02:23<00:00, 11.97s/it]


In [0]:
import numpy as np
# Join the per-batch results along axis 0 into a single array per split.
elmo_train_new = np.concatenate(elmo_train, axis = 0)
elmo_test_new = np.concatenate(elmo_test, axis = 0)

In [0]:
# Sanity check: each split must have exactly one embedding row per DataFrame row.
assert train_df.shape[0] == elmo_train_new.shape[0]
assert test_df.shape[0] == elmo_test_new.shape[0]

In [0]:
# Attach each row's embedding as the first column, named 'embeds'. The values
# are passed as a plain list, one array per DataFrame row in positional order.
train_df.insert(0, 'embeds', list(elmo_train_new))
test_df.insert(0, 'embeds', list(elmo_test_new))

In [0]:
import pickle

# Persist both DataFrames. The context managers guarantee that each file is
# flushed and closed even if pickling fails, instead of leaving the handle to
# the garbage collector.
with open(PATH_TO_SAVE_TRAIN_EMBEDS, 'wb') as train_file:
    pickle.dump(train_df, train_file)

with open(PATH_TO_SAVE_TEST_EMBEDS, 'wb') as test_file:
    pickle.dump(test_df, test_file)