# Sentence Embeddings

In [3]:
import pandas as pd
from pathlib import Path

In [None]:
# Read the filtered paper table from disk and preview the first rows.
csv_path = '../data/filtered_data.csv'
dataset = pd.read_csv(csv_path)
dataset.head()

Unnamed: 0,titles,abstracts,terms,urls
0,Benchmarking Human and Automated Prompting in ...,The remarkable capabilities of the Segment Any...,['cs.CV'],http://arxiv.org/abs/2410.22048v1
1,CAMS: Convolution and Attention-Free Mamba-bas...,Convolutional Neural Networks (CNNs) and Trans...,['cs.CV'],http://arxiv.org/abs/2406.05786v3
2,Global-Local Medical SAM Adaptor Based on Full...,"Emerging of visual language models, such as th...","['cs.AI', 'cs.CV']",http://arxiv.org/abs/2409.17486v2
3,Image Segmentation in Foundation Model Era: A ...,Image segmentation is a long-standing challeng...,['cs.CV'],http://arxiv.org/abs/2408.12957v2
4,Empirical curvelet based Fully Convolutional N...,"In this paper, we propose a new approach to pe...",['cs.CV'],http://arxiv.org/abs/2410.21562v1


## Embeddings

In [6]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.2.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.1-py3-none-any.whl (255 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 255.8/255.8 kB 4.0 MB/s eta 0:00:00
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-3.2.1


In [7]:
# Pull out the two text columns used for embedding: paper titles and abstracts.
sentences, abstracts = dataset['titles'], dataset['abstracts']

In [8]:
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [9]:
# Build the sentence encoder from its pretrained model identifier.
MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
# Encode the paper titles. The result gets its own name because the next
# encoding cell assigns `embeddings` from the abstracts, which would otherwise
# throw these title vectors away right after computing them.
title_embeddings = model.encode(sentences, batch_size=32, show_progress_bar=True)

Batches:   0%|          | 0/1838 [00:00<?, ?it/s]

In [10]:
# Encode the abstracts; `embeddings` is the array the later cells preview,
# pickle and search against.
embeddings = model.encode(
    abstracts,
    show_progress_bar=True,
    batch_size=32,
)

Batches:   0%|          | 0/1838 [00:00<?, ?it/s]

In [None]:
# Sanity check: show the first six (title, embedding) pairs.
# `embeddings` was last assigned from the abstracts, so each vector shown here
# belongs to the abstract of the paper whose title is printed.
for c, (sentence, embedding) in enumerate(zip(sentences, embeddings)):
    print("Sentence:", sentence)
    print("Embedding dimension:", len(embedding))
    print("Title length:", len(sentence))
    print("")

    # Stop once the record at position 5 has been printed (six records total).
    if c >= 5:
        break

Sentence: Benchmarking Human and Automated Prompting in the Segment Anything Model
Embedding dimension: 384
Title length: 72

Sentence: CAMS: Convolution and Attention-Free Mamba-based Cardiac Image Segmentation
Embedding dimension: 384
Title length: 75

Sentence: Global-Local Medical SAM Adaptor Based on Full Adaption
Embedding dimension: 384
Title length: 55

Sentence: Image Segmentation in Foundation Model Era: A Survey
Embedding dimension: 384
Title length: 52

Sentence: Empirical curvelet based Fully Convolutional Network for supervised texture image segmentation
Embedding dimension: 384
Title length: 94

Sentence: SAM 2: Segment Anything in Images and Videos
Embedding dimension: 384
Title length: 44



In [None]:
import pickle

# Persist the embedding array together with the titles and abstracts it was
# built from, one pickle file per object.
artifacts = {
    '../models/embeddings/embeddings.pkl': embeddings,
    '../models/sentences/sentences.pkl': sentences,
    # abstracts.pkl stores the abstract texts, not a second copy of the embeddings.
    '../models/abstracts/abstracts.pkl': abstracts,
}

for target, payload in artifacts.items():
    # Create the destination folder if needed so open() does not fail on a
    # fresh checkout.
    Path(target).parent.mkdir(parents=True, exist_ok=True)
    with open(target, 'wb') as f:
        pickle.dump(payload, f)

In [None]:
# Free-text description of the topic to search for; it is embedded and compared
# against the stored embeddings in the following cell.
paper_you_like: str = 'Language Models and using graphRAG'

In [50]:
from sentence_transformers import util

# Embed the query text with the same model, then compute its cosine similarity
# against the stored embeddings.
query_embedding = model.encode(paper_you_like)
cosine_scores = util.cos_sim(embeddings, query_embedding)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [51]:
import torch

# Take the five highest similarity scores along dim 0, returned largest first,
# together with the row positions they came from.
top_similar_papers = cosine_scores.topk(k=5, dim=0, sorted=True)
top_similar_papers

torch.return_types.topk(
values=tensor([[0.5828],
        [0.5248],
        [0.5180],
        [0.5027],
        [0.4988]]),
indices=tensor([[26528],
        [25691],
        [25857],
        [26269],
        [26298]]))

In [None]:
# `topk` reports row positions, so titles are looked up positionally with
# `.iloc`. Plain `sentences[...]` is a label lookup on the Series and only gives
# the same answer while the Series keeps its default 0..n-1 index.
for position in top_similar_papers.indices.flatten().tolist():
    print(sentences.iloc[position])

Graph Retrieval-Augmented Generation: A Survey
ChatGraph: Chat with Your Graphs
GLBench: A Comprehensive Benchmark for Graph with Large Language Models
How Do Large Language Models Understand Graph Patterns? A Benchmark for Graph Pattern Comprehension
Lost-in-Distance: Impact of Contextual Proximity on LLM Performance in Graph Tasks
