Run `huggingface-cli delete-cache` in a terminal to interactively select which downloaded models to remove from the local Hugging Face cache.

In [1]:
import pickle

from sentence_transformers import SentenceTransformer

import helpers
from helpers import Paper

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load the sentence-embedding model.
# `model` is reused by the later cells to encode both the papers and the
# category texts, and to compute the similarity between them.

model_name = 'sentence-transformers/all-MiniLM-L6-v2'

model = SentenceTransformer(model_name)



In [3]:
# Read the pickled papers from disk and embed one text per paper.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only run this cell on a 'papers.pkl' that comes from a trusted source.
with open('papers.pkl', 'rb') as f:
    papers: list[Paper] = pickle.load(f)

# Each paper is represented by its title and abstract joined into one string.
papers_text = [
    f'Title: {paper.title} \n Abstract: {paper.abstract}'
    for paper in papers
]

# One embedding per entry of `papers_text`, in the same order as `papers`.
papers_emb = model.encode(papers_text)

## Categories

In [4]:
# Assign every paper the category whose embedding is most similar to it.

# Embed the category names provided by the helpers module.
categories_emb = model.encode(helpers.categories)

# Pairwise similarity scores: one row per paper, one column per category.
similarities = model.similarity(papers_emb, categories_emb)

# For each row, take the index of the highest score and look up its category name.
predictions = [
    helpers.categories[index]
    for index in similarities.argmax(dim=1)
]

In [5]:
# Save the predictions obtained from the plain category names.
# The same (papers, predictions) pair is written twice through the project
# helpers: once to a CSV file and once to a JSON file.

helpers.save_to_csv(papers, predictions, 'similarity-preds.csv')

helpers.save_to_json(papers, predictions, 'similarity-preds.json')

## Extended Categories

In [6]:
# Extend the categories and predict the category of each paper.
#
# Every category is described by a full sentence instead of a bare name, and
# the papers are matched against these descriptions. The winning index is then
# mapped back to the short name in `helpers.categories`, so the descriptions
# below MUST stay in the same order (and number) as `helpers.categories`.

extended_categories = [
    'Tables are structured representations of data organized in rows and columns, often used to present numerical information, comparisons, and relationships clearly and efficiently.',
    'Classification is the task of assigning predefined categories to text documents based on their content, enabling systematic organization and retrieval of information.',
    'Key Information Extraction is the automatic identification and extraction of significant entities and relevant data from unstructured texts, facilitating efficient access to critical information and enhancing data organization.',
    'Optical Character Recognition is the technology used to convert different types of documents, such as scanned paper documents and images, into editable and searchable data by recognizing and extracting printed or handwritten text.',
    # Fixed typo ("ollections" -> "collections"): this text is fed to the encoder,
    # so a misspelled word distorts the embedding of the category.
    'Datasets are collections of structured or unstructured data organized for analysis and research purposes, often used in machine learning and statistical modeling to train and evaluate algorithms.',
    'Document Layout Understanding is the process of analyzing and interpreting the structural layout of documents to extract meaningful information about the arrangement and organization of content, including text, images, tables, and other elements.',
    'Others are any additional tasks or methodologies related to document processing and information extraction that do not fit into the predefined categories, encompassing a variety of techniques and applications.'
]

# Guard the index-based mapping below: a length mismatch would either raise an
# IndexError late or silently attach the wrong label to a paper.
assert len(extended_categories) == len(helpers.categories), (
    'extended_categories must contain exactly one description per entry of helpers.categories'
)

extended_categories_emb = model.encode(extended_categories)

# Pairwise similarity scores: one row per paper, one column per description.
similarities_extended = model.similarity(papers_emb, extended_categories_emb)

# Index of the best-matching description per paper, translated to the short category name.
predictions_extended = [
    helpers.categories[index]
    for index in similarities_extended.argmax(dim=1)
]

In [7]:
# Save the predictions obtained from the extended category descriptions.
# Written to separate '-ext' files so the results of the plain-name run
# ('similarity-preds.*') are not overwritten.

helpers.save_to_csv(papers, predictions_extended, 'similarity-preds-ext.csv')

helpers.save_to_json(papers, predictions_extended, 'similarity-preds-ext.json')