# Retrieval-Augmented Generation

In [None]:
import json
from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm

In [None]:
# Load the FEVER training claims: the file holds one JSON object per line.
# NOTE(review): the path below is machine-specific; adjust it when running elsewhere.
docs = []
with open(r"C:\Users\peyto\Desktop\school24\497\hw3\data\fever_train.jsonl", encoding='UTF-8') as fin:
    for line in fin:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. trailing empty lines at end of file),
            # which would otherwise make json.loads raise JSONDecodeError.
            continue
        docs.append(json.loads(line))



In [None]:
# Sentence-embedding model used to turn each document (and each query)
# into a vector for similarity search.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [None]:
# Create (or reuse) the on-disk vector database that stores the claim embeddings.
# For a throwaway in-memory database use QdrantClient(":memory:") instead.
client = QdrantClient(path="fever_db")

# The database persists under "fever_db", so re-running this cell would hit an
# "already exists" error from create_collection; only create it when missing.
if not client.collection_exists(collection_name="claims"):
    client.create_collection(
        collection_name="claims",
        vectors_config=models.VectorParams(
            # Vector size must match the embedding model's output dimension.
            size=model.get_sentence_embedding_dimension(),
            distance=models.Distance.COSINE,
        ),
    )

In [None]:
# Embed every claim document and insert the vectors into the "claims" collection.

# Text representation that gets embedded for each document.
texts = [
    f"Claim: {doc['claim']} Label: {doc['label']} Verifiable: {doc['verifiable']} Evidence: {doc['evidence']}"
    for doc in docs
]

# Encode all texts in one batched call instead of one model call per document.
embeddings = model.encode(texts, show_progress_bar=True)

client.upload_points(
    collection_name="claims",
    # A generator lets upload_points consume the points lazily instead of
    # building the complete list of PointStructs before the upload starts.
    points=(
        models.PointStruct(
            id=idx,  # position in `docs` serves as the unique point ID
            vector=embedding.tolist(),  # plain list of floats, as on the query side
            payload=doc,  # full record is stored and returned with search hits
        )
        for idx, (doc, embedding) in tqdm(enumerate(zip(docs, embeddings)), total=len(docs))
    ),
)

print("Claims inserted successfully!")

In [None]:
def query(text):
    """Print up to ten stored claims most similar to ``text``.

    Uses the notebook-level ``client`` and ``model``: the text is embedded,
    the "claims" collection is searched, and each hit's payload is printed
    together with its similarity score.
    """
    query_vector = model.encode(text).tolist()
    response = client.query_points(
        collection_name="claims",
        query=query_vector,
        limit=10,
    )
    for point in response.points:
        print(point.payload, "score:", point.score)

In [None]:
# Ask for a free-text query and print its nearest stored claims.
user_query = input("Enter a query:")
query(user_query)

In [1]:
# Write a chatbot with RAG!

import ollama
import json
from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm

def query(client, model, text, *, limit=10, collection_name="claims"):
    """Retrieve the stored documents most similar to ``text``.

    Args:
        client: Vector-database client exposing ``query_points``.
        model: Embedding model exposing ``encode``.
        text: Query text to embed and search for.
        limit: Maximum number of hits to request (default 10).
        collection_name: Collection to search (default "claims").

    Returns:
        A list with the payload of each hit, in the order the search returned them.
    """
    hits = client.query_points(
        collection_name=collection_name,
        query=model.encode(text).tolist(),
        limit=limit,
    ).points

    # Echo each hit with its score so the retrieval step can be inspected.
    for hit in hits:
        print(hit.payload, "score:", hit.score)

    return [hit.payload for hit in hits]
    

def chat(client, model):
    """Run an interactive fact-checking loop using retrieval-augmented generation.

    Each turn reads a claim from the user, retrieves similar stored claims via
    ``query``, builds a prompt from them, and asks the ``llama3.2:1b`` model
    through ``ollama.chat`` to label the claim. The loop ends when the user
    types ``exit`` or ``quit``, or when input is interrupted (Ctrl-C / EOF).

    Args:
        client: Vector-database client passed through to ``query``.
        model: Embedding model passed through to ``query``.
    """
    messages = [
        {"role": "system", "content": "You are an expert fact checker who has read all of cv_pairs_fever.tsv"},
    ]

    while True:
        try:
            user_input = input("Enter a fact to be checked: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Leave the loop cleanly instead of surfacing a traceback.
            print()
            break

        if not user_input:
            # Nothing to check; do not spend a retrieval and an LLM call on it.
            continue
        if user_input.lower() in {"exit", "quit"}:
            break

        # Retrieval
        docs = query(client, model, user_input)

        # Augment: one line per retrieved document, followed by the question.
        prompt = "\n".join(f"Claim: {doc['claim']} Label: {doc['label']} Verifiable: {doc['verifiable']} Evidence: {doc['evidence']}" for doc in docs)
        prompt += "\nBased on the above documents, label this query: " + user_input

        # NOTE: the full augmented prompt is kept in the history, so the
        # conversation sent to the model grows with every turn.
        messages.append({"role": "user", "content": prompt})

        # Generation
        response = ollama.chat("llama3.2:1b", messages=messages)

        print(response["message"]["content"])  # {"role": "assistant", "content": "..."}

        # Keep the assistant reply so later turns see the whole conversation.
        messages.append(response["message"])


def main():
    """Load the embedding model, open the on-disk claim database, and start the chat."""
    encoder = SentenceTransformer("all-MiniLM-L6-v2")
    vector_db = QdrantClient(path="fever_db")
    chat(vector_db, encoder)


In [None]:
# Start the interactive chatbot only when executed directly (notebook cell or
# script), not when this code is imported as a module.
if __name__ == "__main__":
    main()



{'id': 30612, 'verifiable': 'VERIFIABLE', 'label': 'REFUTES', 'claim': 'Saif Ali Khan is not male.', 'evidence': [[[46775, 55796, 'Saif_Ali_Khan', 0]], [[46775, 55797, 'Saif_Ali_Khan', 1]]]} score: 0.5886017212242541
{'id': 30611, 'verifiable': 'VERIFIABLE', 'label': 'SUPPORTS', 'claim': 'Saif Ali Khan is male.', 'evidence': [[[46776, 55798, 'Saif_Ali_Khan', 1]], [[46776, 55799, 'Saif_Ali_Khan', 6]]]} score: 0.5614247081758342
{'id': 65904, 'verifiable': 'VERIFIABLE', 'label': 'SUPPORTS', 'claim': 'Saif Ali Khan is of male gender.', 'evidence': [[[82479, 94259, 'Saif_Ali_Khan', 1]], [[82479, 94260, 'Saif_Ali_Khan', 2]], [[82479, 94261, 'Saif_Ali_Khan', 3]], [[82479, 94262, 'Saif_Ali_Khan', 6]], [[82479, 94263, 'Saif_Ali_Khan', 7]]]} score: 0.5587335883809785
{'id': 12299, 'verifiable': 'VERIFIABLE', 'label': 'SUPPORTS', 'claim': 'Leonardo DiCaprio is male.', 'evidence': [[[27591, 33791, 'Leonardo_DiCaprio', 1]], [[27591, 33792, 'Leonardo_DiCaprio', 2]], [[27591, 33793, 'Leonardo_DiCapr