In [1]:
# importing required libraries

import pandas as pd
import numpy as np
import json
import re
import os
import torch
from transformers import AutoTokenizer, AutoModel

In [2]:
# creating the class

class ContrieverCB:
    """
    Generates Contriever sentence embeddings for the paragraphs stored in a
    JSON file and saves them as a NumPy ``.npy`` file.
    """

    # HuggingFace checkpoint loaded when the caller supplies no tokenizer/model.
    MODEL_NAME = 'facebook/contriever-msmarco'

    def __init__(self):
        pass

    def clean(self, text: str) -> str:
        """
        Function to remove newline characters from text.
        Note that lines are joined without inserting a space.
        :param text: input string
        :return: string without newline
        """
        return text.replace('\n', '')

    def mean_pooling(self, token_embeddings, mask):
        """
        Function to be used after model is applied to tokenized text to generate embeddings.
        Averages the token embeddings along dim 1, ignoring positions where the mask is 0.
        Used in the HuggingFace version.
        :param token_embeddings: output of model; expected layout is
            (batch, tokens, hidden) -- the function sums over dim 1
        :param mask: attention mask of the tokens; expected layout is (batch, tokens)
        :return: tensor with one pooled embedding per batch row
        """
        # Zero out masked (padding) positions so they do not contribute to the sum.
        token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
        # Divide by the number of unmasked tokens in each row to obtain the mean.
        sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]
        return sentence_embeddings

    def generate_embeddings(self, path_to_json: str, path_to_output: str,
                            tokenizer=None, model=None) -> None:
        """
        Function takes input json filepath, generates numpy embeddings of the file and
        saves them at the given output filepath.

        The JSON file must contain an object whose keys are the consecutive
        integers ``"0"`` .. ``"N-1"`` (as strings) mapping to the text to embed.
        Row ``i`` of the saved array is the embedding of entry ``str(i)``, so the
        saved array has shape (N, embedding_dim). The output file is named after
        the input file (extension replaced by ``.npy``) and is written inside
        ``path_to_output``, which is created if it does not exist.

        :param path_to_json: input filepath
        :param path_to_output: output directory
        :param tokenizer: optional already-loaded tokenizer; when None the
            ``MODEL_NAME`` tokenizer is loaded with ``AutoTokenizer``
        :param model: optional already-loaded model; when None the
            ``MODEL_NAME`` model is loaded with ``AutoModel``
        :return: None
        :raises ValueError: if the JSON file contains no entries
        :raises KeyError: if the keys are not the consecutive integers "0".."N-1"
        """
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME)
        if model is None:
            model = AutoModel.from_pretrained(self.MODEL_NAME)

        # open and read the input json file; the context manager guarantees
        # the handle is closed, and JSON is UTF-8 regardless of the OS default
        with open(path_to_json, encoding='utf-8') as file:
            json_data = json.load(file)

        if not json_data:
            raise ValueError('No entries found in ' + repr(path_to_json))

        embeddings_list = []

        # Inference only: disabling autograd avoids building a graph per text.
        with torch.no_grad():
            # Visit every entry exactly once, in index order, so that no
            # trailing entries or chunk-boundary entries are skipped.
            for i in range(len(json_data)):
                text = self.clean(json_data[str(i)])
                tokenized_text = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
                output = model(**tokenized_text)
                embeddings = self.mean_pooling(output[0], tokenized_text['attention_mask'])
                embeddings_list.append(embeddings.detach().cpu().numpy())

        # Each item holds a single row (one text per forward pass); stacking
        # along axis 0 yields the 2-D (N, embedding_dim) array directly.
        embeddings_array = np.concatenate(embeddings_list, axis=0)

        # Derive the output name from the input file name. Backslashes are
        # normalised first so Windows-style paths work on every platform and
        # directory components never leak into the output path.
        base_name = os.path.basename(path_to_json.replace('\\', '/'))
        filename = os.path.splitext(base_name)[0]

        # save the embeddings in a numpy file (np.save appends ".npy")
        os.makedirs(path_to_output, exist_ok=True)
        filepath = os.path.join(path_to_output, filename)

        np.save(filepath, embeddings_array)

In [3]:
# Windows paths are written as raw strings so the backslashes are taken
# literally instead of being parsed as (invalid) escape sequences.
# NOTE(review): these are machine-specific absolute paths -- adjust them
# before running the notebook on another machine.
input_path = r'F:\MSIM\Sem 3\Independent Study\Projects\data-generator\split_textbook\paragraphs.json'
output_path = r'F:\MSIM\Sem 3\Independent Study\Projects\embeddings'

c1 = ContrieverCB()

# Embeds every paragraph in the input JSON and writes the .npy file into output_path.
c1.generate_embeddings(input_path, output_path)

0
99
100
199
200
299
300
399
400
499
500
599
600
699
700
799
800
899
900
999
1000
1099
1100
1199
1200
1299
1300
1399
1400
1499
1500
1599
1600
1699
1700
1799
1800
1899
1900
1999
paragraphs
F:\MSIM\Sem 3\Independent Study\Projects\embeddings\paragraphs
