## TODO 
- maybe integrate the arxiv api and the similarity computation together (you can use the user query to use in the arxiv api)
- somehow combine a chatbot with the retrieved papers


In [39]:
import sqlite3
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import arxiv

In [42]:
# Load the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2') 

# Define your query
user_query = "Best ML models for making embeddings"

# Get the embedding for the query
query_embedding = model.encode([user_query])

# Use arxiv to search for papers (limit to 100 results)
search = arxiv.Search(
    query=user_query,
    max_results=100,
    sort_by=arxiv.SortCriterion.Relevance,
    sort_order=arxiv.SortOrder.Descending
      )
results = list(search.results())  # Convert generator to list

# Extract summaries and titles
papers = []
summaries = []
for result in results:
    title = result.title
    authors = ', '.join([author.name for author in result.authors])
    summary = result.summary
    url = f"https://arxiv.org/abs/{result.entry_id.split('/')[-1]}"
    papers.append({
        "title": title,
        "authors": authors,
        "summary": summary,
        "url": url
    })
    summaries.append(summary)

# Encode all summaries
summary_embeddings = model.encode(summaries)

# Compute cosine similarities
similarities = cosine_similarity(query_embedding, summary_embeddings)[0]

# Attach similarity scores to papers and sort
for i, paper in enumerate(papers):
    paper["similarity"] = similarities[i]

top_papers = sorted(papers, key=lambda x: x["similarity"], reverse=True)[:5]

# Print top 5 similar papers
for i, paper in enumerate(top_papers, 1):
    print(f"Rank #{i}")
    print(f"Title: {paper['title']}")
    print(f"Authors: {paper['authors']}")
    print(f"Summary: {paper['summary']}")
    print(f"Similarity: {paper['similarity']:.4f}")
    print(f"URL: {paper['url']}")
    print("-" * 80)


  results = list(search.results())  # Convert generator to list


Rank #1
Title: Pretrained Embeddings for E-commerce Machine Learning: When it Fails and Why?
Authors: Da Xu, Bo Yang
Summary: The use of pretrained embeddings has become widespread in modern e-commerce
machine learning (ML) systems. In practice, however, we have encountered
several key issues when using pretrained embedding in a real-world production
system, many of which cannot be fully explained by current knowledge.
Unfortunately, we find that there is a lack of a thorough understanding of how
pre-trained embeddings work, especially their intrinsic properties and
interactions with downstream tasks. Consequently, it becomes challenging to
make interactive and scalable decisions regarding the use of pre-trained
embeddings in practice.
  Our investigation leads to two significant discoveries about using pretrained
embeddings in e-commerce applications. Firstly, we find that the design of the
pretraining and downstream models, particularly how they encode and decode
information via embe

In [43]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

model_id = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

rag = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

# Combine summaries into a context string, but make sure it's within the token limit
context = "\n\n".join(
    f"Title: {paper['title']}\nSummary: {paper['summary']}" for paper in top_papers
)

# Encode the context and check its length
input_ids = tokenizer.encode(context, return_tensors="pt")
max_length = 512  # Adjust this based on your model's max token length

# Truncate if necessary to fit within the max token limit
if input_ids.shape[1] > max_length:
    input_ids = input_ids[:, :max_length]


# Prepare the prompt, ensuring it stays within the token limit
prompt = f"""Here are some research papers:

{context[:max_length]}  # Only include a truncated context if necessary

Use the above research paper summaries to answer the following question:

Question: {user_query}
Answer:"""

# Generate the answer using the same prompt
output = rag(prompt, max_new_tokens=300)

# Provide the generated answer along with the papers
print("Research Papers and Generated Answer:")
print(f"Research Papers:\n{context[:max_length]}")  # Display truncated context
print(f"Generated Answer:\n{output[0]['generated_text']}")


Device set to use cpu
Token indices sequence length is longer than the specified maximum sequence length for this model (1486 > 512). Running this sequence through the model will result in indexing errors


Research Papers and Generated Answer:
Research Papers:
Title: Pretrained Embeddings for E-commerce Machine Learning: When it Fails and Why?
Summary: The use of pretrained embeddings has become widespread in modern e-commerce
machine learning (ML) systems. In practice, however, we have encountered
several key issues when using pretrained embedding in a real-world production
system, many of which cannot be fully explained by current knowledge.
Unfortunately, we find that there is a lack of a thorough understanding of how
pre-trained embeddings work, especially th
Generated Answer:
Pretrained Embeddings for E-commerce Machine Learning: When it Fails and Why?
