In [1]:
# importing required libraries

import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import json
import re
import numpy as np

In [2]:
tokenizer = AutoTokenizer.from_pretrained('facebook/contriever-msmarco')
model = AutoModel.from_pretrained('facebook/contriever-msmarco')

In [3]:
# Mean pooling

def mean_pooling(token_embeddings, mask):
    token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
    sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]
    return sentence_embeddings

Generating embeddings for paragraph level text

In [4]:
# read json file

f = open("paragraphs.json")
data = json.load(f)

In [5]:
# cleaning the data

def clean(text):
    new_text = re.sub('\n', '', text)
    return new_text

In [6]:
# clean data --> apply tokenizer --> apply model --> mean pooling --> embeddings --> store in list --> convert to numpy array

In [7]:
n = int(len(data)/100)
embeddings_list = []
for k in range(n):
    
    if(k==n):
        start = k*100
        end = (list(data.keys())[-1])
    else:
        start = k*100
        end = k*100+99
        
    for i in range(start, end):
        para = data[str(i)]
        para = clean(para)
        tokenized_para = tokenizer(para, padding=True, truncation=True, return_tensors='pt')
        output_para = model(**tokenized_para)
        embeddings_para = mean_pooling(output_para[0], tokenized_para['attention_mask'])
        numpy_embeddings = embeddings_para.detach().numpy()
        embeddings_list.append(numpy_embeddings)

embeddings_numpy = np.array(embeddings_list)

In [11]:
# save the embeddings in a numpy file

np.save('paragraph_embeddings', embeddings_numpy)

In [12]:
embeddings_numpy.shape

(1980, 1, 768)

In [13]:
# load saved embeddings from file

saved_embeddings = np.load('paragraph_embeddings.npy')

In [14]:
saved_embeddings.shape

(1980, 1, 768)

Generating embeddings for sentence level text

In [17]:
fs = open("sentences.json")
sentence_data = json.load(fs)

In [19]:
len(sentence_data)

4243

In [21]:
n = int(len(sentence_data)/100)
embeddings_list_line = []
for k in range(n):
    
    if(k==n):
        start = k*100
        end = (list(sentence_data.keys())[-1])
    else:
        start = k*100
        end = k*100+99
        
    for i in range(start, end):
        line = sentence_data[str(i)]
        line = clean(line)
        tokenized_line = tokenizer(line, padding=True, truncation=True, return_tensors='pt')
        output_line = model(**tokenized_line)
        embeddings_line = mean_pooling(output_line[0], tokenized_line['attention_mask'])
        numpy_embeddings_line = embeddings_line.detach().numpy()
        embeddings_list_line.append(numpy_embeddings_line)

np_embeddings_line = np.array(embeddings_list_line)

In [22]:
np.save('sentence_embeddings', np_embeddings_line)

In [23]:
np_embeddings_line.shape

(4158, 1, 768)

In [24]:
saved_line_embeddings = np.load('sentence_embeddings.npy')
saved_line_embeddings.shape

(4158, 1, 768)

Generate section level embeddings

In [25]:
fsec = open("sections.json")
section_data = json.load(fsec)

In [28]:
n = int(len(section_data)/100)
embeddings_list_section = []
for k in range(n):
    
    if(k==n):
        start = k*100
        end = (list(section_data.keys())[-1])
    else:
        start = k*100
        end = k*100+99
        
    for i in range(start, end):
        section = section_data[str(i)]
        section = clean(section)
        tokenized_section = tokenizer(section, padding=True, truncation=True, return_tensors='pt')
        output_section = model(**tokenized_section)
        embeddings_section = mean_pooling(output_section[0], tokenized_section['attention_mask'])
        numpy_embeddings_section = embeddings_section.detach().numpy()
        embeddings_list_section.append(numpy_embeddings_section)

np_embeddings_section = np.array(embeddings_list_section)

In [32]:
np.save('section_embeddings', np_embeddings_section)

In [33]:
np_embeddings_line.shape

(4158, 1, 768)