In [None]:
import glob

import torch
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_parse import LlamaParse
from transformers import AutoTokenizer, LlamaForCausalLM, LlamaTokenizer

# Load and parse the document(s).
# load_data() takes a file path or a list of file paths, so the wildcard is
# expanded here instead of being handed over as a literal filename.
pdf_paths = sorted(glob.glob("./data/sample_document*.pdf"))
if not pdf_paths:
    raise FileNotFoundError("No PDF matches ./data/sample_document*.pdf")
documents = LlamaParse(result_type="markdown").load_data(pdf_paths)

# Load the embedding model
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Generate embeddings for the documents.
# HuggingFaceEmbedding follows the llama-index embedding interface
# (get_text_embedding_batch / get_query_embedding) and embeds plain strings,
# so the text is pulled out of each parsed Document first. The result is
# stored as a (num_documents, embedding_dim) float tensor.
document_texts = [doc.text for doc in documents]
document_embeddings = torch.tensor(
    embed_model.get_text_embedding_batch(document_texts),
    dtype=torch.float32,
)

# Load the LLaMA model and tokenizer.
# AutoTokenizer resolves the tokenizer class from the checkpoint itself;
# Llama 3.x checkpoints ship a `tokenizer.json` rather than the SentencePiece
# model file that the slow LlamaTokenizer class expects.
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = LlamaForCausalLM.from_pretrained(model_name)

def rag_pipeline(query, documents, document_embeddings, model, tokenizer):
    """Answer *query* with the language model, grounded in retrieved documents.

    The query is embedded with the module-level ``embed_model``, scored
    against every row of ``document_embeddings`` by dot product, and the
    texts of the five best-scoring documents (fewer if fewer exist) are
    placed in the prompt ahead of the question.

    Args:
        query: The user question as a string.
        documents: Sequence of documents aligned row-for-row with
            ``document_embeddings``. Items may be plain strings or objects
            exposing a ``text`` attribute.
        document_embeddings: One embedding per document; anything
            ``torch.as_tensor`` accepts as a 2-D float array.
        model: Causal language model providing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        The generated answer text only; the prompt (question and retrieved
        context) is not echoed back.
    """
    top_k = 5

    # Retrieve relevant documents based on the query
    context = ""
    if len(documents) > 0:
        doc_matrix = torch.as_tensor(document_embeddings, dtype=torch.float32)
        query_vector = torch.as_tensor(
            embed_model.get_query_embedding(query),
            dtype=torch.float32,
            device=doc_matrix.device,
        )
        scores = doc_matrix @ query_vector
        # Most similar first; slicing copes with fewer than top_k documents.
        best_first = scores.argsort(descending=True)[:top_k].tolist()

        # Concatenate the top documents into a single context. Documents may
        # be objects rather than strings, so take their text before joining.
        context = "\n\n".join(
            str(getattr(documents[i], "text", documents[i])) for i in best_first
        )

    # Keep question and context clearly separated instead of gluing the
    # query directly onto the first retrieved document.
    if context:
        prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
    else:
        prompt = f"Question: {query}\nAnswer:"

    # Generate a response using the LLaMA model
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]
    outputs = model.generate(
        **inputs,
        # Budget applies to new tokens only; `max_length` would also count the
        # prompt and leave no room to generate once the context is long.
        max_new_tokens=512,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Drop the prompt tokens so only the model's continuation is returned.
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response.strip()

# Demo: run one question through the pipeline and show the result
query = "How do I bake a chocolate cake?"
response = rag_pipeline(
    query=query,
    documents=documents,
    document_embeddings=document_embeddings,
    model=model,
    tokenizer=tokenizer,
)
print(response)