In [12]:
import os
import pymupdf  # PyMuPDF for PDF extraction
import ollama
import faiss
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
# Show the kernel's working directory; the PDF input folder used later in this
# notebook is resolved relative to it.
print(os.getcwd())

c:\Users\rucke\OneDrive\Documents\RAGwithCitations


In [None]:


# Step 1: Function to Extract Text from PDFs
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pymupdf.open``.

    Returns:
        str: The text of all pages joined with no separator; an empty string
        when the document yields no pages.
    """
    # The context manager closes the document even if extraction raises, so the
    # underlying file handle is always released.
    with pymupdf.open(pdf_path) as doc:
        # Iterating the document yields its pages in order; join once instead of
        # growing a string page by page.
        return "".join(page.get_text("text") for page in doc)

# Example of extracting text from multiple PDFs
def extract_text_from_folder(folder_path):
    """Extract the text of every PDF file located directly inside a folder.

    Args:
        folder_path: Directory to scan. Sub-directories are not searched.

    Returns:
        dict: Maps each PDF file name to the text returned by
        ``extract_text_from_pdf``, inserted in sorted file-name order.
    """
    # os.listdir returns names in arbitrary order; sorting keeps the document
    # order (and anything indexed by position from it) stable between runs.
    # The extension is matched case-insensitively (e.g. "Manual.PDF"), and
    # directories whose name merely ends in ".pdf" are skipped.
    pdf_files = sorted(
        name
        for name in os.listdir(folder_path)
        if name.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(folder_path, name))
    )

    # Store each file's text using the file name as the key
    return {
        pdf_file: extract_text_from_pdf(os.path.join(folder_path, pdf_file))
        for pdf_file in pdf_files
    }
# Resolve the input folder relative to the working directory with os.path.join
# rather than a hand-built separator.
pdf_folder = os.path.join(os.getcwd(), "raginputtest")
documents = extract_text_from_folder(pdf_folder)
if not documents:
    # Fail here with a clear message rather than later inside the vectorizer.
    raise ValueError(f"No PDF documents found in {pdf_folder!r}")
doc_texts = list(documents.values())
# Print a short per-document summary instead of dumping the full extracted text
# into the cell output.
print("\n".join(f"{name}: {len(text)} characters" for name, text in documents.items()))

# Step 2: Index the documents using FAISS (same as before)
# One TF-IDF row per document, in the same order as `documents`.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(doc_texts).toarray()

# Create a FAISS index
# Row i of the index corresponds to the i-th entry of `documents`; the vectors
# are converted to float32 before being added.
dimension = X.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(X, dtype=np.float32))

# Function to retrieve the top-k relevant documents
def retrieve_documents(query, k=2):
    """Return the text of up to ``k`` indexed documents nearest to the query.

    Args:
        query: Query string, vectorized with the module-level ``vectorizer``.
        k: Maximum number of documents to return.

    Returns:
        list: Document texts in the order returned by ``index.search``. Holds
        fewer than ``k`` items when fewer documents are indexed, and is empty
        when ``k`` is not positive or the index is empty.
    """
    # Never ask for more neighbours than the index actually holds.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    query_vector = vectorizer.transform([query]).toarray()
    _, neighbour_ids = index.search(np.array(query_vector, dtype=np.float32), k)

    # Build the name list once; result ids are positions in this list.
    doc_names = list(documents.keys())
    # FAISS marks missing results with -1; without the filter a negative id
    # would silently select the last document through Python's negative indexing.
    return [documents[doc_names[i]] for i in neighbour_ids[0] if i >= 0]

# Step 3: Define the RAG function
def rag_query(query, k=2, model="llama3.2:1b"):
    """Answer a question with an Ollama chat model, using retrieved documents as context.

    Args:
        query: The user's question.
        k: Number of documents to request from ``retrieve_documents``.
        model: Name of the Ollama model passed to ``ollama.chat``.

    Returns:
        The ``['message']['content']`` field of the chat response.
    """
    # Retrieve top-k documents based on the query
    retrieved_docs = retrieve_documents(query, k=k)

    # Combine the retrieved documents into a single context for the model.
    # The texts are joined as-is, with no file names or document numbers.
    context = "\n".join(retrieved_docs)

    # Step 4: Use Ollama model to generate a response based on the context
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": f"Context: {context}\n\nQuestion: {query}"},
    ]
    model_response = ollama.chat(model=model, messages=messages)

    return model_response['message']['content']

# Example Query
# NOTE(review): rag_query joins the retrieved texts into one context string
# without file names or numbering, so the model has nothing that identifies
# which text is "document 1" — consider labelling sources if citations matter.
query = "What is discussed in document 1 about the Toyota Grand Highlander?"
response = rag_query(query)

print(response)


There is no mention of a "Toyota Grand Highlander" in document 1. It appears to be an advertisement or technical specification document for a vehicle, but it does not specifically discuss that model.

The document seems to outline general information and settings for various features on a computer, including:

* Information displayed on the map screen
* Language settings
* Smartphones (including connecting to Wi-Fi, updating data, and using voice commands)
* Toyota account management

If you are looking for information about a specific Toyota model, such as the Highlander, I would recommend searching online or contacting a dealership for more detailed information.


: 