## TODO 
- Maybe integrate the arXiv API with the similarity computation (the user query could be passed directly to the arXiv API as the search term)
- somehow combine a chatbot with the retrieved papers


In [1]:
import sqlite3
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Semantic search over the stored arXiv papers: embed a query and every paper
# summary with a sentence-transformer model, rank the papers by cosine
# similarity to the query, and print the best matches.

# Number of top-ranked papers to print.
TOP_K = 10

# Connect to the SQLite database (or create it if it doesn't exist)
conn = sqlite3.connect("arxiv_papers.db")
try:
    cur = conn.cursor()

    # Fetch papers from the database as (id, title, summary) rows
    cur.execute("SELECT id, title, summary FROM papers")
    papers = cur.fetchall()
finally:
    # The connection is only needed for the SELECT above; closing it here
    # guarantees it is released even if the query (or anything later) fails.
    conn.close()

# Load the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# NOTE: sanity check, the title of a paper
query = "Personalized Emphasis Framing for Persuasive Message"

# Get the vector for the query (a single-row batch)
query_embedding = model.encode([query])

if papers:
    # Encode all summaries in one batched call instead of one call per paper.
    # A NULL summary comes back from SQLite as None, so fall back to "".
    summaries = [paper[2] or "" for paper in papers]
    paper_embeddings = model.encode(summaries)  # one row per paper

    # One vectorised call: row 0 holds the query's similarity to every paper.
    scores = cosine_similarity(query_embedding, paper_embeddings)[0]
else:
    # cosine_similarity rejects empty input, so skip scoring entirely.
    paper_embeddings = []
    scores = []

# Pair each paper with its score and sort by similarity, best first
# (sorted() is stable, so ties keep their database order).
similarities = sorted(
    ((paper, float(score)) for paper, score in zip(papers, scores)),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print the most similar papers
print("Most similar papers to your query:")
for paper, similarity in similarities[:TOP_K]:
    print(f"ID: {paper[0]}")
    print(f"Title: {paper[1]}")
    print(f"Similarity: {similarity:.4f}")
    print(f"Summary: {paper[2]}")
    print('-' * 80)

Most similar papers to your query:
ID: 324
Title: Personalized Emphasis Framing for Persuasive Message Generation
Similarity: 0.6874
Summary: In this paper, we present a study on personalized emphasis framing which can
be used to tailor the content of a message to enhance its appeal to different
individuals. With this framework, we directly model content selection decisions
based on a set of psychologically-motivated domain-independent personal traits
including personality (e.g., extraversion and conscientiousness) and basic
human values (e.g., self-transcendence and hedonism). We also demonstrate how
the analysis results can be used in automated personalized content selection
for persuasive message generation.
--------------------------------------------------------------------------------
ID: 490
Title: A Survey of Personalized Large Language Models: Progress and Future Directions
Similarity: 0.4751
Summary: Large Language Models (LLMs) excel in handling general knowledge tasks, yet
