## TODO 
- Maybe integrate the arXiv API with the similarity computation (the user's query could also be sent to the arXiv API as the search query)
- somehow combine a chatbot with the retrieved papers


In [1]:
import sqlite3
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Semantic search over stored arXiv papers: embed a free-text query and every
# paper summary with the same sentence-transformer model, then rank the papers
# by cosine similarity to the query and print the ten best matches.

# Load the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# NOTE: sanity check, a sentence from a paper's abstract
query = "The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer,"

# Get the vector for the query
query_embedding = model.encode([query])

# Connect to the SQLite database (or create it if it doesn't exist) and fetch
# the papers. The connection is released in `finally`, so it is closed even if
# the query raises, and it is not held open during the slow embedding step.
conn = sqlite3.connect("arxiv_papers.db")
try:
    cur = conn.cursor()
    cur.execute("SELECT id, title, summary FROM papers")
    papers = cur.fetchall()  # rows of (id, title, summary)
finally:
    conn.close()

if papers:
    # Encode every summary in a single batched call instead of one
    # `encode` call per paper; row i corresponds to papers[i].
    summary_matrix = model.encode([paper[2] for paper in papers])

    # Keep `paper_embeddings` as a list of per-paper (1, dim) arrays so any
    # later cell that indexes it per paper still sees the same layout.
    paper_embeddings = [summary_matrix[i:i + 1] for i in range(len(papers))]

    # One vectorised similarity computation: result has one row (the query)
    # and one column per paper.
    scores = cosine_similarity(query_embedding, summary_matrix)[0]
    similarities = list(zip(papers, scores))  # (paper, similarity score)
else:
    # Empty table: nothing to encode or rank; only the header is printed below.
    paper_embeddings = []
    similarities = []

# Sort papers by similarity, best match first (stable sort keeps DB order on ties)
similarities.sort(key=lambda x: x[1], reverse=True)

# Print the most similar papers
print("Most similar papers to your query:")
for paper, similarity in similarities[:10]:
    print(f"ID: {paper[0]}")
    print(f"Title: {paper[1]}")
    print(f"Similarity: {similarity:.4f}")
    print(f"Summary: {paper[2]}")
    print('-' * 80)

Most similar papers to your query:
ID: 77
Title: Fine Grained Knowledge Transfer for Personalized Task-oriented Dialogue Systems
Similarity: 0.4581
Summary: Training a personalized dialogue system requires a lot of data, and the data
collected for a single user is usually insufficient. One common practice for
this problem is to share training dialogues between different users and train
multiple sequence-to-sequence dialogue models together with transfer learning.
However, current sequence-to-sequence transfer learning models operate on the
entire sentence, which might cause negative transfer if different personal
information from different users is mixed up. We propose a personalized decoder
model to transfer finer granularity phrase-level knowledge between different
users while keeping personal preferences of each user intact. A novel personal
control gate is introduced, enabling the personalized decoder to switch between
generating personalized phrases and shared phrases. The propose