In [1]:
# importing required libraries

import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import json
import re
import numpy as np

In [2]:
tokenizer = AutoTokenizer.from_pretrained('facebook/contriever-msmarco')
model = AutoModel.from_pretrained('facebook/contriever-msmarco')

In [3]:
# Mean pooling

def mean_pooling(token_embeddings, mask):
    token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
    sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]
    return sentence_embeddings

Generating embeddings for paragraph level text

In [4]:
# read json file

f = open("paragraphs.json")
paragraph_data_json = json.load(f)

In [5]:
# cleaning the data

def clean(text):
    new_text = re.sub('\n', '', text)
    return new_text

In [None]:
# clean data --> apply tokenizer --> apply model --> mean pooling --> embeddings --> store in list --> convert to numpy array

In [None]:
# converting json data to a list ---> contriever input is a list.

In [8]:
paragraph_data = list(paragraph_data_json.values())

In [12]:
n = int(len(paragraph_data_json)/100)
embeddings_list = []
for k in range(n):
    
    if(k==n):
        start = k*100
        end = (list(data.keys())[-1])
    else:
        start = k*100
        end = k*100+99
        
    for i in range(start, end):
        para = paragraph_data_json[str(i)]
        para = clean(para)
        tokenized_para = tokenizer(para, padding=True, truncation=True, return_tensors='pt')
        output_para = model(**tokenized_para)
        embeddings_para = mean_pooling(output_para[0], tokenized_para['attention_mask'])
        numpy_embeddings = embeddings_para.detach().numpy()
        embeddings_list.append(numpy_embeddings)
        #embeddings_list.append(embeddings_para)

embeddings_numpy = np.array(embeddings_list)

In [24]:
embeddings_numpy.shape

(1980, 768)

In [23]:
embeddings_numpy = embeddings_numpy.reshape((1980, 768))

In [25]:
# save the embeddings in a numpy file

np.save('paragraph_embeddings', embeddings_numpy)

Generating embeddings for sentence level text

In [30]:
fs = open("sentences.json")
sentence_data = json.load(fs)

In [31]:
n = int(len(sentence_data)/100)
embeddings_list_line = []
for k in range(n):
    
    if(k==n):
        start = k*100
        end = (list(sentence_data.keys())[-1])
    else:
        start = k*100
        end = k*100+99
        
    for i in range(start, end):
        line = sentence_data[str(i)]
        line = clean(line)
        tokenized_line = tokenizer(line, padding=True, truncation=True, return_tensors='pt')
        output_line = model(**tokenized_line)
        embeddings_line = mean_pooling(output_line[0], tokenized_line['attention_mask'])
        numpy_embeddings_line = embeddings_line.detach().numpy()
        embeddings_list_line.append(numpy_embeddings_line)

np_embeddings_line = np.array(embeddings_list_line)

In [32]:
np_embeddings_line.shape

(4158, 1, 768)

In [36]:
np_embeddings_line = np_embeddings_line.reshape((4158, 768))

In [37]:
np.save('sentence_embeddings', np_embeddings_line)

In [38]:
np_embeddings_line.shape

(4158, 768)

In [39]:
saved_line_embeddings = np.load('sentence_embeddings.npy')
saved_line_embeddings.shape

(4158, 768)

Generate section level embeddings

In [40]:
fsec = open("sections.json")
section_data = json.load(fsec)

In [41]:
n = int(len(section_data)/100)
embeddings_list_section = []
for k in range(n):
    
    if(k==n):
        start = k*100
        end = (list(section_data.keys())[-1])
    else:
        start = k*100
        end = k*100+99
        
    for i in range(start, end):
        section = section_data[str(i)]
        section = clean(section)
        tokenized_section = tokenizer(section, padding=True, truncation=True, return_tensors='pt')
        output_section = model(**tokenized_section)
        embeddings_section = mean_pooling(output_section[0], tokenized_section['attention_mask'])
        numpy_embeddings_section = embeddings_section.detach().numpy()
        embeddings_list_section.append(numpy_embeddings_section)

np_embeddings_section = np.array(embeddings_list_section)

In [43]:
np_embeddings_section = np_embeddings_section.reshape((99,768))

In [44]:
np_embeddings_section.shape

(99, 768)

In [45]:
np.save('section_embeddings', np_embeddings_section)

Encode questions and retrieve the most relevant sentences/paragraphs/sections
    --> generate and test on your own sample questions for now
    --> ask group for proper questions after testing above embeddings

Retrieving most relevant section

In [83]:
saved_section_embeddings = np.load('section_embeddings.npy')

In [112]:
len(saved_section_embeddings)

99

In [92]:
section_tensors = torch.from_numpy(saved_section_embeddings)

In [140]:
questions = ['What is a procedure?']

In [141]:
tokenized_question = tokenizer(questions[0], padding=True, truncation=True, return_tensors='pt')
output_question = model(**tokenized_question)
embeddings_question = mean_pooling(output_question[0], tokenized_question['attention_mask'])

In [126]:
saved_section_embeddings.shape

(99, 768)

In [127]:
len(section_tensors)

99

In [142]:
dot_product_section = {}
for i in range(len(section_tensors)):
    dot_product_section[embeddings_question[0]@section_tensors[i]] = i

In [143]:
max_prod_tensor = max(dot_product_section.keys())
print(dot_product_section[max_prod_tensor])

50


In [157]:
section_data['50']

"{Procedure and System Calls*}\n\nA { procedure} is a sequence of instructions that executes a\nparticular task.  Procedures are used as building blocks for multiple,\nlarger tasks.  The concept of a procedure is fundamental to\nprogramming, and appears in some form in every high-level language as\nwell as in most ISAs.\n\nFor our purposes, the terms procedure, subroutine,\nfunction, and method are synonymous, although they usually have\nslightly different meanings from the linguistic point of view.\nProcedure calls are supported through { call} and { return}\ncontrol flow instructions.  The first instruction in the code below,\nfor example, transfers control to the procedure ``DoSomeWork,'' which\npresumably does some work, then returns control to the instruction\nfollowing the call.\n\n{-6pt}\n\n=DoSomeWork:WW=WWWW=DoSomeWorkWW= \n>loop:>CALL>DoSomeWork\n>>CMP>R6,#1>; compare return value in R6 to 1\n>>BEQ>loop>; keep doing work until R6 is not 1\n\n>DoSomeWork:>>> ; set R6 to 0 when

In [158]:
saved_para_embeddings = np.load('paragraph_embeddings.npy')

In [159]:
para_tensors = torch.from_numpy(saved_para_embeddings)

In [160]:
dot_product_para = {}
for i in range(len(para_tensors)):
    dot_product_para[embeddings_question[0]@para_tensors[i]] = i

In [161]:
max_para_tensor = max(dot_product_para.keys())
print(dot_product_para[max_para_tensor])

688


In [162]:
paragraph_data_json[str(688)]

'The { zero register} appears in modern architectures of the RISC\nvariety (defined in the next section of these notes).  The register is\nread-only and serves both as a useful constant and as a destination\nfor operations performed only for their side-effects (for example, setting\nstatus bits).  The availability of a zero register also allows certain\nopcodes to serve double duty.  A register-to-register add instruction\nbecomes a register move instruction when one source operand is zero.\nSimilarly, an immediate add instruction becomes an immediate load\ninstruction when one source operand is zero.'

In [164]:
saved_sentence_embeddings = np.load('sentence_embeddings.npy')
sentence_tensors = torch.from_numpy(saved_sentence_embeddings)

In [165]:
dot_product_sentence = {}
for i in range(len(sentence_tensors)):
    dot_product_sentence[embeddings_question[0]@sentence_tensors[i]] = i

In [166]:
max_sentence_tensor = max(dot_product_sentence.keys())
print(dot_product_sentence[max_sentence_tensor])

1500


In [167]:
sentence_data['1500']

"''\n\nThe impact of increasingly dense integrated circuit technology had\nbegun to have its effect, however, and in view of increasing processor\nclock speeds, more and more programmers were using high-level\nlanguages rather than writing assembly code."

Try to form and test better questions. Sentence and para matchings are weird.