# Retrieval-Augmented Generation

In [5]:
import json
from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm

In [6]:
# Load the arXiv astro-ph records: the file is parsed as one JSON document per line.
with open("/data2/arxiv/arxiv-astro_ph.json", encoding="utf-8") as fin:
    # Skip blank lines (e.g. a trailing newline at the end of the file), which
    # json.loads would otherwise reject with a JSONDecodeError.
    docs = [json.loads(line) for line in fin if line.strip()]

In [3]:
# Sentence-embedding model that turns text into vectors. The same model is used
# to embed the abstracts when indexing and to embed the user's text when querying.
model = SentenceTransformer("all-MiniLM-L6-v2")



In [13]:
# Create (or reopen) the vector database stored on disk under "db".
# For a throwaway in-memory index use QdrantClient(":memory:") instead.
client = QdrantClient(path="db")

# The on-disk store outlives the kernel, so on a re-run the collection may
# already be there; only create it when it is missing instead of failing.
if not client.collection_exists("abstracts"):
    client.create_collection(
        collection_name="abstracts",
        vectors_config=models.VectorParams(
            # Vector size must match what the embedding model produces.
            size=model.get_sentence_embedding_dimension(),
            distance=models.Distance.COSINE,
        ),
    )

True

In [14]:
# Add documents (slow): every abstract has to be embedded before it is stored.

def _embedded_points(documents, batch_size=64):
    """Yield one PointStruct per document, embedding the abstracts batch by batch.

    Encoding a list of texts per call is much cheaper than calling the model once
    per abstract, and yielding points lazily avoids building a list of every
    embedded point before the upload starts. Point ids are the documents'
    positions in ``documents``; the whole record is stored as the payload.
    """
    for start in range(0, len(documents), batch_size):
        batch = documents[start:start + batch_size]
        vectors = model.encode(
            [doc["abstract"] for doc in batch],
            show_progress_bar=False,  # the outer tqdm bar already reports progress
        )
        for idx, (doc, vector) in enumerate(zip(batch, vectors), start=start):
            yield models.PointStruct(id=idx, vector=vector.tolist(), payload=doc)


client.upload_points(
    collection_name="abstracts",
    points=tqdm(_embedded_points(docs), total=len(docs)),
)


  0%|          | 0/349073 [00:00<?, ?it/s]

In [16]:
def query(text):
    """Print the ten stored records closest to ``text``, each with its score.

    Uses the notebook-level ``model`` to embed the text and the notebook-level
    ``client`` to search the "abstracts" collection. Returns nothing.
    """
    embedding = model.encode(text).tolist()
    response = client.query_points(
        collection_name="abstracts",
        query=embedding,
        limit=10,
    )

    for match in response.points:
        print(match.payload, "score:", match.score)

In [17]:
# Ask for a free-text query and pass it to query() to run a similarity search.
query(input("Enter a query:"))

{'id': 'astro-ph/9412071', 'submitter': 'Alain Lecavelier', 'authors': 'A. Lecavelier des Etangs, M. Deleuil, A. Vidal-Madjar et al', 'title': 'BETA PICTORIS : Evidence of light variations', 'comments': '7 pages, l-aa LaTeX file + 4 appended figures in uuencoded .ps; To\n  appear in Astronomy & Astrophysics. Postscript version of paper and figures\n  also available at ftp://corton.iap.fr/pub/from_users/lecaveli/', 'journal-ref': 'Astron.Astrophys.299:557,1995', 'doi': None, 'report-no': None, 'categories': 'astro-ph', 'license': None, 'abstract': '  We have analyzed Beta Pictoris photometric measurements obtained from La\nSilla by the Geneva Observatory from 1975 to 1992. These data show evidence of\nvariations in the brightness of the star, with no color dependency. Here, we\ndemonstrate that the light variations are present on long as well as on short\ntime scales. On a long time scale, we show that the apparent magnitude of Beta\nPic decreased by 0.011 +/- 0.004 mag from 1979 to 198

In [21]:
# Write a chatbot with RAG!

import ollama

def query(client, model, text, limit=10):
    """Return the payloads of the stored records most similar to ``text``.

    Note: this redefines the earlier single-argument ``query(text)`` from this
    notebook; unlike that version it takes its dependencies as arguments and
    returns the results instead of printing them.

    Args:
        client: Qdrant client that holds the "abstracts" collection.
        model: embedding model; ``model.encode(text).tolist()`` is used as the
            query vector.
        text: the query string to embed and search for.
        limit: maximum number of hits to return (defaults to 10, the value that
            used to be hard-coded).

    Returns:
        A list with one payload per hit, in the order the client returned them.
    """
    response = client.query_points(
        collection_name="abstracts",
        query=model.encode(text).tolist(),
        limit=limit,
    )
    return [hit.payload for hit in response.points]
    

def chat(client, model):
    """Run an interactive retrieval-augmented chat loop on stdin/stdout.

    Each turn retrieves documents for the user's text with ``query``, prepends
    their abstracts to the question, sends the whole conversation to the
    "llama3.2" model through ollama, prints the reply and adds it to the history.

    The loop ends when the user types ``exit`` or ``quit`` (case-insensitive), or
    when the prompt is interrupted / stdin is closed. Blank input is ignored.

    Args:
        client: Qdrant client passed through to ``query``.
        model: embedding model passed through to ``query``.

    Returns:
        None.
    """
    messages = [
        {"role": "system", "content": "You are an expert astronomer who has read all of arXiv astro-ph."},
    ]

    while True:
        try:
            user_input = input("What do you want to say: ")
        except (EOFError, KeyboardInterrupt):
            # No more input (closed stdin or interrupted prompt): end the chat
            # instead of surfacing a traceback.
            break

        command = user_input.strip().lower()
        if command in ("exit", "quit"):
            break
        if not command:
            # Nothing to retrieve or answer for an empty line; ask again.
            continue

        # Retrieval
        docs = query(client, model, user_input)

        # Augment
        prompt = "\n".join(doc["abstract"] for doc in docs)
        prompt += "\nBased on the above documents, answer this query: " + user_input
        # The augmented prompt (abstracts included) is what is kept in the
        # history, so the conversation grows by the retrieved text every turn.
        messages.append({"role": "user", "content": prompt})

        # Generation
        response = ollama.chat("llama3.2", messages=messages)

        print(response["message"]["content"])  # {"role": "assistant", "content": "..."}

        messages.append(response["message"])


def main():
    """Load the embedding model, open the on-disk vector store and start the chat.

    The client is always closed on the way out, including when the chat loop
    is left through an exception such as KeyboardInterrupt.
    """
    model = SentenceTransformer("all-MiniLM-L6-v2")
    # NOTE(review): a local on-disk store is presumably usable by only one open
    # client at a time — confirm no other client on "db" is still open when
    # this runs inside the notebook kernel.
    client = QdrantClient(path="db")

    try:
        chat(client, model)
    finally:
        # Release the local store so it can be reopened later.
        client.close()
