The goal of this notebook is to use a pretrained BERT model to create embeddings of the earnings call transcripts that we have parsed and stored in the transcript data structure discussed in the FormTranscripts notebook.

In [163]:
import numpy as np
import pandas as pd
import pickle
from nltk.tokenize import word_tokenize
from bert_serving.client import BertClient
from tqdm import tqdm

In [164]:
# Load the pickled `transcripts` structure and the `id_to_row` mapping from the data/ directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so these files must
# come from a trusted source (presumably the FormTranscripts notebook — confirm).
# NOTE(review): id_to_row is not referenced by any of the cells visible below.
with open('data/transcripts.pickle', 'rb') as f:
    transcripts = pickle.load(f)
with open('data/id_to_row.pickle', 'rb') as f:
    id_to_row = pickle.load(f)

In [165]:
# Instantiate a BertClient object with its default settings; every encode call below goes through it.
# NOTE(review): presumably this needs a bert-serving server to be running and reachable
# before the cell is executed — confirm before a Restart & Run All.
bc = BertClient()

We'll begin with a toy example:

In [166]:
# Encode three short sentences and inspect what comes back.
# The saved output shows a 3-D array of shape (3, 7, 768): one block per sentence,
# seven rows per block, 768 values per row.
# NOTE(review): the last row printed for result[0] is all zeros — presumably padding up to
# the longest sentence in the batch; confirm against the server configuration.
result = bc.encode(["Hello, world.", "This is a test.", "One encoding per sentence."])
print(result.shape)
print(result[0])

(3, 7, 768)
[[-0.6583949   0.00830614  0.03765348 ... -0.45243877 -0.00460901
   0.31208587]
 [ 0.03465232  0.61453384  0.73173016 ... -0.93620336 -0.38065237
  -0.26721236]
 [-1.3928384   0.4704191   1.006837   ... -0.9887997   0.00509461
   0.5722854 ]
 ...
 [-1.1544309  -0.2005329   0.15407975 ...  0.07570094  0.13651484
  -0.60283816]
 [ 0.04261789  0.01326546 -0.02783578 ...  0.00655589 -0.04553343
   0.0079642 ]
 [-0.          0.          0.         ... -0.         -0.
   0.        ]]




In [167]:
# Max-pool over axis 1 (the row axis inside each sentence's block), keeping the
# element-wise maximum for each of the 768 dimensions, so that every sentence is
# represented by a single 768-dim vector.
# The ndim guard keeps this cell safe to re-run: `result` is overwritten with the pooled
# 2-D array, and pooling that array a second time would collapse it to shape (3,).
# NOTE(review): if padded positions are all-zero rows (as the toy output suggests), the
# pooled value can never fall below 0 in any dimension — confirm this is intended.
if result.ndim == 3:
    result = np.max(result, axis=1)
print(result.shape)

(3, 768)


We now need to do some preprocessing. Specifically, we need to chunk our question and answer text just as we did with the statements (see the FormTranscripts notebook).

In [188]:
# Maximum number of word tokens placed in a single chunk by create_chunks.
CHUNK_SZ = 64

In [189]:
def create_chunks(tokens, chunk_sz=None):
    '''
    Form a list of strings with at most chunk_sz words each.

    Consecutive tokens are grouped in their original order and joined with
    single spaces; the final chunk holds whatever is left over and may be
    shorter than chunk_sz.

    Args:
        tokens: sequence of word strings to group.
        chunk_sz: maximum number of words per chunk. When omitted, the
            notebook-level CHUNK_SZ constant is used.

    Returns:
        A list of space-joined chunk strings (an empty list if tokens is empty).

    Raises:
        ValueError: if chunk_sz is zero or negative.
    '''
    if chunk_sz is None:
        chunk_sz = CHUNK_SZ
    if chunk_sz <= 0:
        raise ValueError('chunk_sz must be positive, got %r' % (chunk_sz,))
    # Slicing clamps at the end of the sequence, so the last (possibly short)
    # chunk needs no special-casing.
    return [' '.join(tokens[start:start + chunk_sz])
            for start in range(0, len(tokens), chunk_sz)]

In [190]:
# Replace the raw text of every Q&A entry with its list of chunk strings, leaving the
# entry's label (elem[1]) untouched.
# `transcripts` is rewritten in place, so an entry whose first element is already a list
# (i.e. it was chunked by an earlier run of this cell) is skipped; without that guard a
# re-run would hand a list of chunks to word_tokenize instead of a string.
for i in range(len(transcripts)):
    curr_qna = transcripts[i][3]
    for idx, elem in enumerate(curr_qna):
        if isinstance(elem[0], list):
            continue  # already chunked on a previous run
        curr_tokens = word_tokenize(elem[0])
        curr_qna[idx] = (create_chunks(curr_tokens), elem[1])

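Now we're ready to encode our training data! We will create a list transcript_embeddings with one entry per transcript. Each entry has the form [ statement_embeddings, [(Q1 embedding, 0), (A1 embedding, 1), ...] ], where statement_embeddings holds one max-pooled embedding per statement chunk (statement chunk 1 embedding, statement chunk 2 embedding, ...), and each question or answer is reduced to a single embedding by max-pooling over its chunk embeddings (Q1 chunk 1 embedding, Q1 chunk 2 embedding, ...). The second element of each tuple is 0 for a question and 1 for an answer.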

In [191]:
def embed_statement_chunks(chunks):
    '''
    Encode a list of chunk strings with the BERT client and max-pool the result.

    The array returned by bc.encode is reduced with a maximum over axis 1 and
    the pooled array is returned, with its first axis indexing the chunks.
    '''
    # NOTE(review): if padded positions come back as all-zero rows (as the toy
    # example output suggests), this maximum can never be below 0 — confirm.
    return np.max(bc.encode(chunks), axis=1)

In [192]:
def embed_questions_and_answers(qna):
    '''
    Produce one pooled embedding per question/answer entry.

    Each entry of qna is indexed as (chunks, label). The chunks are encoded
    with the BERT client, max-pooled over axis 1, and then max-pooled again
    over axis 0 so the whole entry collapses to a single vector. The label is
    passed through unchanged.

    Returns a list of (embedding, label) tuples in the same order as qna.
    '''
    pooled_entries = []
    for entry in qna:
        chunk_texts = list(entry[0])
        per_chunk = np.max(bc.encode(chunk_texts), axis=1)
        # Collapse the per-chunk vectors: one embedding per question or answer
        pooled_entries.append((per_chunk.max(axis=0), entry[1]))
    return pooled_entries

In [200]:
# Number of transcripts, counted from index 0 of `transcripts`, that the cells below
# embed and then match questions against.
NUM_SAMPLES = 1

In [201]:
# Build one [statement embeddings, Q&A embeddings] pair for each of the first
# NUM_SAMPLES transcripts, showing progress with tqdm.
transcript_embeddings = []
for sample_idx in tqdm(range(NUM_SAMPLES)):
    sample = transcripts[sample_idx]
    transcript_embeddings.append([
        embed_statement_chunks(sample[2]),       # index 2: statement chunks
        embed_questions_and_answers(sample[3]),  # index 3: (chunks, label) Q&A entries
    ])





  0%|          | 0/1 [00:00<?, ?it/s][A[A[A[A



100%|██████████| 1/1 [00:05<00:00,  5.44s/it][A[A[A[A

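Now let's see how well we can do at contextualizing questions using these pretrained BERT embeddings. For each question, we find the statement chunk whose embedding has the highest cosine similarity to the question's embedding.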

In [202]:
def cosine_sim(u, v):
    '''
    Cosine similarity between two embedding vectors.

    Both inputs are flattened to 1-D first, so a row of shape (1, d) and a
    vector of shape (d,) are interchangeable. Any dimensionality d is
    accepted, not only 768.

    Args:
        u, v: array-likes holding the same number of elements.

    Returns:
        dot(u, v) / (||u|| * ||v||), or 0.0 when either vector has zero norm
        (the ratio is undefined there and would otherwise be NaN, which
        compares False against every score in a best-match search).

    Raises:
        ValueError: if u and v do not contain the same number of elements.
    '''
    u = np.ravel(u)
    v = np.ravel(v)
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    if denom == 0:
        return 0.0
    return np.dot(u, v) / denom

In [203]:
# For every question, record the index of the statement chunk whose embedding has the
# highest cosine similarity to the question's embedding. q_to_chunk[i] maps a question's
# position in transcript i's Q&A list to that chunk index.
q_to_chunk = []
for sample_idx in range(NUM_SAMPLES):
    statement_embeddings, qna_embeddings = transcript_embeddings[sample_idx]
    best_chunk_by_question = {}
    for qna_pos, (embedding, label) in enumerate(qna_embeddings):
        if label == 1:
            continue  # This is an answer; only questions are matched

        best_score = best_chunk = None
        for chunk_pos, chunk_embedding in enumerate(statement_embeddings):
            similarity = cosine_sim(embedding, chunk_embedding)
            # Strict '>' keeps the earliest chunk when scores tie
            if best_score is None or similarity > best_score:
                best_score, best_chunk = similarity, chunk_pos
        best_chunk_by_question[qna_pos] = best_chunk
    q_to_chunk.append(best_chunk_by_question)

In [204]:
# Print each matched question (as its list of chunk strings) followed by the statement
# chunk selected for it.
# NOTE(review): the text printed under "ANSWER TEXT" is the best-matching statement chunk
# (transcripts[i][2]), not the answer that followed the question on the call.
for i, mapping in enumerate(q_to_chunk):
    statement_chunks = transcripts[i][2]
    qna_entries = transcripts[i][3]
    for q_idx, chunk_idx in mapping.items():
        print("QUESTION TEXT:\n")
        print(qna_entries[q_idx][0])
        print("\nANSWER TEXT:\n")
        print(statement_chunks[chunk_idx] + '\n')
        print('#' * 75)

QUESTION TEXT:

['operator instructions first question comes line bruce geller dghm please proceed']

ANSWER TEXT:

thank operator good morning thank joining us conference call fourth fiscal quarter full year ended october 1 2016 call today michael weinstein chairman ceo vinny pascal chief operating officer yet obtained copy press release issued newswire yesterday available website review full text press release along associated financial tables please go homepage begin however like read safe harbor statement need remind everyone part discussion afternoon

###########################################################################
QUESTION TEXT:

['hi good morning guys']

ANSWER TEXT:

greetings welcome ark restaurants fourth quarter full year 2016 results conference time participants mode session follow formal presentation operator instructions reminder conference recorded would like turn conference host bob stewart president chief financial officer thank may begin

##################