In [1]:
# importing required libraries

import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import json
import re
import numpy as np

In [2]:
tokenizer = AutoTokenizer.from_pretrained('facebook/contriever-msmarco')
model = AutoModel.from_pretrained('facebook/contriever-msmarco')

In [3]:
# sample data
sentences = [
    "Where was Marie Curie born?",
    "Maria Sklodowska, later known as Marie Curie, was born on November 7, 1867.",
    "Born in Paris on 15 May 1859, Pierre Curie was the son of Eugène Curie, a doctor of French Catholic origin from Alsace."
]

In [4]:
# Apply tokenizer

inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

In [5]:
# Compute token embeddings

outputs = model(**inputs)

In [6]:
# Mean pooling

def mean_pooling(token_embeddings, mask):
    token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
    sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]
    return sentence_embeddings

Work on generated data

In [7]:
# read json file

f = open("paragraphs.json")
data = json.load(f)

In [8]:
# cleaning the data

def clean(text):
    new_text = re.sub('\n', '', text)
    return new_text

In [9]:
# clean data --> apply tokenizer --> apply model --> mean pooling --> embeddings --> store in list --> convert to numpy array

In [67]:
n = int(len(data)/100)
embeddings_list = []
for k in range(n):
    
    if(k==n):
        start = k*100
        end = (list(data.keys())[-1])
    else:
        start = k*100
        end = k*100+99
        
    for i in range(start, end):
        para = data[str(i)]
        para = clean(para)
        tokenized_para = tokenizer(para, padding=True, truncation=True, return_tensors='pt')
        output_para = model(**tokenized_para)
        embeddings_para = mean_pooling(output_para[0], tokenized_para['attention_mask'])
        numpy_embeddings = embeddings_para.detach().numpy()
        embeddings_list.append(numpy_embeddings)

embeddings_numpy = np.array(embeddings_list)

In [68]:
# save the embeddings in a numpy file

np.save('test_embeddings', embeddings_numpy)

In [69]:
embeddings_numpy.shape

(1980, 1, 768)

In [70]:
# load saved embeddings from file

saved_embeddings = np.load('test_embeddings.npy')

In [71]:
saved_embeddings.shape

(1980, 1, 768)