# BERT embedding creation for Research Project 32934 on Google Colab
* This notebook was used to create BERT embeddings for the datasets.

* It can be used without a mounted Google Drive; however, this is not recommended.

* The library used for getting the BERT embeddings is called bert-as-service. It takes a saved BERT model and starts up an optimized encoding server which can be used to encode any number of sentences.

* The saved weights used for this can be downloaded from https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
It is recommended to store these on Google Drive for persistent storage.

* The datasets can also be stored on Google Drive, and the paths in the cell below can be changed accordingly.

* The code below reads the JSON file and loads it in a pandas dataframe.
At the end, the entire train and test dataframes are pickled and saved at the path specified.

* A max sequence length of 256 was chosen as anything larger than that would not give results (computational limitations?).

**NOTE:** This notebook requires a different TensorFlow version than the rest of the source code. This is part of the reason why it is run on Colab.


In [0]:
# All datasets, the saved BERT checkpoint and the output pickles share one
# folder on the mounted Google Drive.
_DRIVE_DIR = '/content/drive/My Drive/temp_datasets'

# Input datasets.
PATH_TO_TRAIN_SET = _DRIVE_DIR + '/combined-train.json'
PATH_TO_TEST_SET = _DRIVE_DIR + '/combined-test.json'

# Where the pickled dataframes (text, priority, embeddings) are written.
PATH_TO_SAVE_TRAIN_EMBEDS = _DRIVE_DIR + '/clpsych16_train_bert_embeds.pkl'
PATH_TO_SAVE_TEST_EMBEDS = _DRIVE_DIR + '/clpsych16_test_bert_embeds.pkl'

# Directory holding the unpacked BERT checkpoint handed to the encoding server.
PATH_TO_SAVED_BERT = _DRIVE_DIR + '/uncased_L-12_H-768_A-12'
# Alternative checkpoint:
# PATH_TO_SAVED_BERT = _DRIVE_DIR + '/uncased_L-24_H-1024_A-16'

# Maximum sequence length given to the server and used to split long posts.
MAX_LEN = 256

In [0]:
# Report the GPU attached to this runtime (model, driver/CUDA versions, memory use).
!nvidia-smi

Tue Jun  9 20:28:40 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8    27W / 149W |      0MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [0]:
# Install the bert-as-service client, a pinned TensorFlow 1.15 GPU build and
# transformers, then install/upgrade the bert-as-service server with its HTTP extra.
# Because of `nohup`, pip's output is appended to nohup.out instead of the cell output.
!nohup pip install bert-serving-client tensorflow-gpu==1.15.0 transformers
!nohup pip install -U bert-serving-server[http]

nohup: ignoring input and appending output to 'nohup.out'
nohup: ignoring input and appending output to 'nohup.out'


In [0]:
import json
from bs4 import BeautifulSoup
from pprint import pprint
import pandas as pd
import re


def clean_text(raw_text: str):
    """Return the visible text of an HTML fragment on a single line.

    ``None`` is mapped to an empty string. Otherwise the markup is parsed with
    BeautifulSoup's ``html.parser`` backend, the tags are dropped, and every
    newline and non-breaking space (``\\xa0``) is replaced by a plain space.
    """
    if raw_text is None:
        return ''

    plain = BeautifulSoup(raw_text, features="html.parser").get_text()
    for unwanted in ('\n', '\xa0'):
        plain = plain.replace(unwanted, ' ')
    return plain


def read_json_as_df(path: str) -> pd.DataFrame:
    """Load a JSON-lines file of posts into a two-column DataFrame.

    Each non-blank line of the file is parsed as one JSON object. The HTML body
    found under ``post -> body`` is cleaned with ``clean_text`` (a missing body
    becomes an empty string) and paired with the object's ``priority`` value.

    Args:
        path: Location of the UTF-8 encoded JSON-lines file.

    Returns:
        A DataFrame with the columns ``text`` and ``priority``, one row per
        JSON object, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an object has no ``post`` or no ``priority`` key.
    """
    rows = []

    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            # A blank line (typically a trailing newline at the end of the file)
            # is not a JSON document and would make json.loads raise; skip it.
            if not line.strip():
                continue

            data = json.loads(line)
            rows.append([clean_text(data['post'].get('body', None)),
                         data['priority']])

    return pd.DataFrame(data=rows, columns=['text', 'priority'])

# Load both splits once; the following cells add columns to and filter these frames.
train_df = read_json_as_df(PATH_TO_TRAIN_SET)
test_df = read_json_as_df(PATH_TO_TEST_SET)

In [0]:
import transformers
# Tokenizer for the uncased base BERT model (lower-casing enabled). It is used
# below to count tokens per post and to cut long posts into chunks.
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased',
                                                             do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [0]:
import numpy as np
from tqdm import tqdm

# Record, for every post in both splits, how many tokens the tokenizer produces;
# the embedding cell uses this column to separate short posts from long ones.
for _frame in (train_df, test_df):
    _frame['tokenized_len'] = np.array(
        [len(tokenizer.tokenize(_text)) for _text in _frame.text]
    )

In [0]:
# Token count of the longest training post.
max(train_df.tokenized_len)

1793

In [0]:
# Start the bert-as-service encoding server in the background (trailing `&`),
# with stdout and stderr redirected to out.txt. {PATH_TO_SAVED_BERT} and {MAX_LEN}
# are filled in from the Python variables defined in the first cell.
# NOTE(review): `-pooling_layer -4 -3 -2` selects three hidden layers; presumably
# they are concatenated, which would match the 2304 (= 3 x 768) dimensional
# vectors shown further down - confirm against the bert-as-service docs.
!nohup bert-serving-start -pooling_layer -4 -3 -2 -model_dir="{PATH_TO_SAVED_BERT}" -max_seq_len={MAX_LEN} > out.txt 2>&1 &

In [0]:
from bert_serving.client import BertClient
# Client with default connection settings, used by every encode call below.
# NOTE(review): presumably this connects to the server started in the previous
# cell and waits until it is ready - confirm; check out.txt if it hangs.
bc = BertClient()

In [0]:
# Normalise both splits the same way: trim surrounding whitespace, then drop
# posts that have no text left, since there is nothing to embed for them.
train_df.text = train_df.text.str.strip()
train_df = train_df[train_df.text != '']

# The test split is stripped as well; without it, whitespace-only posts would
# pass the filter below and test texts would be preprocessed differently from
# the training texts.
test_df.text = test_df.text.str.strip()
test_df = test_df[test_df.text != '']

In [0]:
def get_chunk_embeds(sents):
    """Encode the chunks of one post and average them into a single vector.

    ``sents`` is the list of text chunks belonging to one post. All chunks are
    sent to the encoding client in one call, and the element-wise mean over the
    chunk axis (axis 0) of the returned embeddings is the result.
    """
    per_chunk = np.asarray(bc.encode(sents))
    return per_chunk.mean(axis=0)

def chunks(lst, n):
    """Yield consecutive slices of ``lst`` holding ``n`` items each.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``;
    an empty ``lst`` yields nothing.
    """
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)

def split_sentences(sents, LEN):
    """Cut one text into consecutive pieces of at most ``LEN`` tokens.

    Args:
        sents: The text of a single post (one string, despite the plural name).
        LEN: Maximum number of tokenizer tokens per piece.

    Returns:
        A list of strings, one per piece, in their original order.
    """
    tokens = tokenizer.tokenize(sents)
    # Rebuild proper text for each piece rather than joining the tokens with
    # spaces: a plain ' '.join leaves continuation pieces such as '##ing' as
    # standalone words, which a BERT tokenizer would split again into extra
    # tokens, corrupting the input and pushing the piece past LEN.
    return [tokenizer.convert_tokens_to_string(piece)
            for piece in chunks(tokens, LEN)]

In [0]:
%%time

# Embed every post of both splits. Posts of at most MAX_LEN tokens are encoded
# directly in one request per split; longer posts are cut into MAX_LEN-token
# chunks and represented by the mean of their chunk embeddings.
# NOTE(review): tokenized_len was computed before the texts were stripped and
# filtered, and presumably does not count the special tokens the server adds, so
# posts right at the MAX_LEN boundary may still be truncated server-side - confirm.

# Training split, short posts: one embedding per row, stored as the first column.
short_train_df = train_df[train_df.tokenized_len <= MAX_LEN]
short_train_embeds = bc.encode(short_train_df.text.values.tolist())
short_train_df.insert(loc=0, value=[embed for embed in short_train_embeds], column='embeds')


# Training split, long posts: one encode request per post (progress shown by tqdm).
long_train_df = train_df[train_df.tokenized_len > MAX_LEN]
long_train_sents = [split_sentences(sent, MAX_LEN) for sent in long_train_df.text.values]
long_train_embeds = [get_chunk_embeds(sents) for sents in tqdm(long_train_sents)]
long_train_df.insert(loc=0, value=long_train_embeds, column='embeds')

# Test split, short posts.
short_test_df = test_df[test_df.tokenized_len <= MAX_LEN]
short_test_embeds = bc.encode(short_test_df.text.values.tolist())
short_test_df.insert(loc=0, value=[embed for embed in short_test_embeds], column='embeds')

# Test split, long posts.
long_test_df = test_df[test_df.tokenized_len > MAX_LEN]
long_test_sents = [split_sentences(sent, MAX_LEN) for sent in long_test_df.text.values]
long_test_embeds = [get_chunk_embeds(sents) for sents in tqdm(long_test_sents)]
long_test_df.insert(loc=0, value=long_test_embeds, column='embeds')


In [0]:
# Recombine the short and long parts of each split. Rows keep their original
# index labels but are now ordered short posts first, long posts second.
new_train_df = pd.concat([short_train_df, long_train_df])
new_test_df = pd.concat([short_test_df, long_test_df])

In [0]:
# Every row must have received an embedding. The null mask is checked directly
# with .any(): comparing a filtered frame's .values.tolist() with None can never
# fail, because that expression is always a list.
assert not new_train_df.embeds.isnull().any()
assert not new_test_df.embeds.isnull().any()

# Splitting into short/long parts and concatenating them again must neither
# lose nor duplicate rows.
assert new_train_df.shape[0] == train_df.shape[0]
assert new_test_df.shape[0] == test_df.shape[0]

In [0]:
# Show the dimensionality of one embedding. Positional access (.iloc) is used
# because the row labelled 0 may have been dropped by the empty-text filter, in
# which case a label lookup with [0] would raise KeyError.
new_train_df.embeds.iloc[0].shape

(2304,)

In [0]:
import pickle

# Persist both frames (text, priority, token length and embeddings). Context
# managers make sure each file is flushed and closed as soon as it is written;
# a handle that is never closed can leave an incomplete pickle on the drive.
with open(PATH_TO_SAVE_TRAIN_EMBEDS, 'wb') as train_file:
    pickle.dump(new_train_df, train_file)

with open(PATH_TO_SAVE_TEST_EMBEDS, 'wb') as test_file:
    pickle.dump(new_test_df, test_file)