In [1]:
# importing required libraries

import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import json
import re
import numpy as np

In [2]:
# Load the tokenizer and the encoder from one checkpoint name so the two
# can never drift apart.
checkpoint = 'facebook/contriever-msmarco'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

In [3]:
# Mean pooling

def mean_pooling(token_embeddings, mask):
    """Average token embeddings over the sequence axis, ignoring masked positions.

    Args:
        token_embeddings: tensor whose dim 1 is the token axis; the callers in
            this notebook pass the encoder's first output, laid out as
            (batch, seq_len, hidden).
        mask: attention mask with one fewer dimension than ``token_embeddings``;
            non-zero entries mark real tokens, zeros mark positions to ignore.

    Returns:
        Tensor with the token axis reduced away: the sum of the unmasked token
        vectors divided by the number of unmasked tokens in each row.
    """
    # Broadcast the mask over the hidden dimension and zero out ignored tokens
    # so they contribute nothing to the sum.
    keep = mask.unsqueeze(-1).bool()
    zeroed = token_embeddings.masked_fill(~keep, 0.)
    # Divide by the real token count per row, not by the padded length.
    token_counts = mask.sum(dim=1).unsqueeze(-1)
    return zeroed.sum(dim=1) / token_counts

Generating embeddings for paragraph-level text

In [4]:
# Read the paragraph texts from the JSON file.

# The context manager closes the file handle as soon as parsing finishes,
# instead of leaving it open for the lifetime of the kernel.
with open("paragraphs.json") as f:
    paragraph_data_json = json.load(f)

In [5]:
# cleaning the data

def clean(text):
    """Return ``text`` with every newline character removed.

    Newlines are deleted outright, not replaced by a space, so the words on
    either side of a line break end up joined ("a\\nb" -> "ab").
    NOTE(review): confirm that joining is intended before embedding the text.
    """
    return text.replace('\n', '')

In [None]:
# clean data --> apply tokenizer --> apply model --> mean pooling --> embeddings --> store in list --> convert to numpy array

In [None]:
# converting json data to a list ---> contriever input is a list.

In [8]:
n = int(len(paragraph_data_json)/100)
embeddings_list = []
for k in range(n):
    
    if(k==n):
        start = k*100
        end = (list(data.keys())[-1])
    else:
        start = k*100
        end = k*100+99
        
    for i in range(start, end):
        para = paragraph_data_json[str(i)]
        para = clean(para)
        tokenized_para = tokenizer(para, padding=True, truncation=True, return_tensors='pt')
        output_para = model(**tokenized_para)
        embeddings_para = mean_pooling(output_para[0], tokenized_para['attention_mask'])
        numpy_embeddings = embeddings_para.detach().numpy()
        embeddings_list.append(numpy_embeddings)

embeddings_numpy = np.array(embeddings_list)

In [9]:
embeddings_numpy.shape

(1980, 1, 768)

In [10]:
embeddings_numpy = embeddings_numpy.reshape((1980, 768))

In [11]:
# Persist the paragraph embedding matrix to disk in NumPy's binary format
# (np.save appends the ".npy" suffix to the given name).

np.save(file='paragraph_embeddings', arr=embeddings_numpy)

Generating embeddings for sentence-level text

In [12]:
# Read the sentence texts from the JSON file. The context manager closes the
# file handle as soon as parsing finishes.
with open("sentences.json") as fs:
    sentence_data = json.load(fs)

In [13]:
n = int(len(sentence_data)/100)
embeddings_list_line = []
for k in range(n):
    
    if(k==n):
        start = k*100
        end = (list(sentence_data.keys())[-1])
    else:
        start = k*100
        end = k*100+99
        
    for i in range(start, end):
        line = sentence_data[str(i)]
        line = clean(line)
        tokenized_line = tokenizer(line, padding=True, truncation=True, return_tensors='pt')
        output_line = model(**tokenized_line)
        embeddings_line = mean_pooling(output_line[0], tokenized_line['attention_mask'])
        numpy_embeddings_line = embeddings_line.detach().numpy()
        embeddings_list_line.append(numpy_embeddings_line)

np_embeddings_line = np.array(embeddings_list_line)

In [32]:
np_embeddings_line.shape

(4158, 1, 768)

In [36]:
np_embeddings_line = np_embeddings_line.reshape((4158, 768))

In [37]:
# Persist the sentence embedding matrix; the file lands at
# 'sentence_embeddings.npy' because np.save appends the suffix.
np.save(file='sentence_embeddings', arr=np_embeddings_line)

In [38]:
# Display the shape of the sentence embedding matrix that was just saved.
np_embeddings_line.shape

(4158, 768)

In [39]:
# Round-trip check: reload the file written by np.save and display its shape.
saved_line_embeddings = np.load(file='sentence_embeddings.npy')
saved_line_embeddings.shape

(4158, 768)

Generating embeddings for section-level text

In [40]:
# Read the section texts from the JSON file. The context manager closes the
# file handle as soon as parsing finishes.
with open("sections.json") as fsec:
    section_data = json.load(fsec)

In [41]:
n = int(len(section_data)/100)
embeddings_list_section = []
for k in range(n):
    
    if(k==n):
        start = k*100
        end = (list(section_data.keys())[-1])
    else:
        start = k*100
        end = k*100+99
        
    for i in range(start, end):
        section = section_data[str(i)]
        section = clean(section)
        tokenized_section = tokenizer(section, padding=True, truncation=True, return_tensors='pt')
        output_section = model(**tokenized_section)
        embeddings_section = mean_pooling(output_section[0], tokenized_section['attention_mask'])
        numpy_embeddings_section = embeddings_section.detach().numpy()
        embeddings_list_section.append(numpy_embeddings_section)

np_embeddings_section = np.array(embeddings_list_section)

In [43]:
np_embeddings_section = np_embeddings_section.reshape((99,768))

In [44]:
# Display the shape of the section embedding matrix after the reshape.
np_embeddings_section.shape

(99, 768)

In [45]:
# Persist the section embedding matrix to disk in NumPy's binary format
# (np.save appends the ".npy" suffix to the given name).
np.save(file='section_embeddings', arr=np_embeddings_section)

Encode questions and retrieve the most relevant sentences/paragraphs/sections
    --> generate and test on your own sample questions for now
    --> ask group for proper questions after testing above embeddings

Retrieving most relevant section

TODO: form and test better questions; the sentence- and paragraph-level matches look odd.