# Retrieval-Augmented Generation

In [None]:
import json
from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm

In [None]:
with open(r"C:\Users\peyto\Desktop\school24\497\hw3\data\cv_pairs.tsv", encoding='UTF-8') as fin:
    docs = []
    for line in fin:
        line = line.strip() 
        data = json.loads(line)
        claim = data[0]['content']
        label = data[1]['content']
        new_dict = {'claim': claim, 'label': label}
        docs.append(new_dict)


In [None]:
# need a model to generate sentence embeddings for your documents
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# create a vector database

# client = QdrantClient(":memory:")
client = QdrantClient(path="generated_db")
client.create_collection(
    collection_name="claims",
    vectors_config=models.VectorParams(
        size=model.get_sentence_embedding_dimension(),
        distance=models.Distance.COSINE
    )
)

In [None]:


# Generate embedding for the claim
# Insert into the vector database
client.upload_points(
    collection_name="claims",
    points=[
        models.PointStruct(
            id = idx,  # Unique ID for the point
            vector = model.encode(f"Claim: {doc['claim']} Label: {doc['label']}"),
            payload=doc

        )
        for idx, doc in tqdm(enumerate(docs), total=len(docs))
    ]
)


print("Claims inserted successfully!")


In [None]:
def query(text):
    hits = client.query_points(
        collection_name="claims",
        query=model.encode(text).tolist(),
        limit=10,
    ).points

    for hit in hits:
        print(hit.payload, "score:", hit.score)

In [None]:
query(input("Enter a query:"))

In [1]:
# Write a chatbot with RAG!

import ollama
import json
from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm

def query(client, model, text):
    hits = client.query_points(
        collection_name="claims",
        query=model.encode(text).tolist(),
        limit=10,
    ).points

    for hit in hits:
        print(hit.payload, "score:", hit.score)
        

    return [hit.payload for hit in hits]
    

def chat(client, model):
    messages = [
        {"role": "system", "content": "You are an expert fact checker who has read all of cv_pairs.tsv"},
    ]

    while True:
        user_input = input("Enter a fact to be checked: ")
        
        # Retrieval
        docs = query(client, model, user_input)
        # Augment
        prompt = "\n".join(f"Claim: {doc['claim']}\nLabel: {doc['label']}" for doc in docs)
        # prompt = "\n".join(doc['label'] for doc in docs)

        prompt += "\nBased on the above documents, label this query: " + user_input        
        
        messages.append({"role": "user", "content": prompt})

        # Generation
        response = ollama.chat("llama3.2:1b", messages=messages)
        
        print(response["message"]["content"])  # {"role": "assistant", "content": "..."}
        
        messages.append(response["message"])


def main():
    model = SentenceTransformer("all-MiniLM-L6-v2")
    client = QdrantClient(path="generated_db")

    chat(client, model)


In [2]:
main()



{'claim': 'Ellyse Tait Perry is a male person.', 'label': 'REFUTES'} score: 0.47602566745891617
{'claim': 'Mariah Carey is an exclusively male artist.', 'label': 'REFUTES'} score: 0.4680137115734803
{'claim': "Eric Trump, Donald Trump's eldest son, is reportedly not the President of the United States but he is one of his children.", 'label': 'SUPPORTS'} score: 0.4638487745926932
{'claim': 'Chris Hemsworth is a woman.', 'label': 'NOT ENOUGH INFO'} score: 0.4589462011100838
{'claim': "Eric Trump is Donald Trump's third son.", 'label': 'REFUTES'} score: 0.4517393666325805
{'claim': 'Jordan Spence is a woman.', 'label': 'NOT ENOUGH INFO'} score: 0.43720496912519935
{'claim': 'Donald Trump Jr. is the son of Frederick Christ Trump II, not Fred Trump.', 'label': 'REFUTES'} score: 0.42890498788595804
{'claim': 'Jennifer Aniston is a man.', 'label': 'NOT ENOUGH Info'} score: 0.42725551128422495
{'claim': "Ileana D'Cruz is a man.", 'label': 'REFUTES'} score: 0.4247643473189578
{'claim': "Eric Tr

KeyboardInterrupt: 