In [None]:
!pip install pymupdf gradio transformers pinecone groq sentence_transformers


In [None]:
import pymupdf
# Read the PDF and concatenate the plain text of every page into `doc_text`.
# The context manager closes the document even if text extraction raises, and
# str.join avoids rebuilding the accumulated string once per page.
with pymupdf.open('pizza_description.pdf') as doc:
    doc_text = "".join(page.get_text() for page in doc)

print(doc_text)

In [None]:
# Split the document text into overlapping word windows.
chunk_size = 50  # number of words in each chunk
overlap = 10     # number of words shared by consecutive chunks

# A non-positive step would never advance the window (the loop would not
# terminate), so reject such settings up front.
if chunk_size <= 0 or not (0 <= overlap < chunk_size):
    raise ValueError(
        f"Need chunk_size > 0 and 0 <= overlap < chunk_size "
        f"(got chunk_size={chunk_size}, overlap={overlap})"
    )

words = doc_text.split()
chunks = []

# Each window starts 'chunk_size - overlap' words after the previous one.
for begin in range(0, len(words), chunk_size - overlap):
    end = begin + chunk_size
    # Extract the window and join it back into a string.
    chunks.append(' '.join(words[begin:end]))
    # This window already reaches the last word; a further window would only
    # repeat words that are part of this chunk, so stop here.
    if end >= len(words):
        break

# Show how many chunks were produced and their contents.
print(len(chunks))
print(chunks)

In [None]:
from sentence_transformers import SentenceTransformer

# Embedding model used for both the document chunks and, later, the queries.
# Alternatives noted by the author: 'all-MiniLM-L6-v2' (supposed to be fast) and
# 'deberta-v3-base' (supposed to be pretty good).
# NOTE(review): a different model may produce vectors of a different size; the
# Pinecone index `dimension` (768 here, 384 in the commented-out alternative)
# has to match whatever model is chosen.
model = SentenceTransformer('roberta-base')

# Encode all chunks in a single call so the library can batch them internally,
# then split the resulting 2-D array into one vector per chunk.
embeddings = list(model.encode(chunks))
print(embeddings)


In [None]:
import os
import time
from pinecone import Pinecone, ServerlessSpec

# Read the Pinecone API key from the environment so the secret never has to be
# written into (or committed with) the notebook.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_api_key:
    raise RuntimeError(
        "Set the PINECONE_API_KEY environment variable before running this cell."
    )
pc = Pinecone(api_key=pinecone_api_key)

# Name of the Pinecone index that stores the chunk embeddings.
index_name = "my-index"


In [None]:
# Drop any previous copy of the index so the notebook starts from a clean slate.
# Guarded with has_index so a first run (no index yet) does not fail.
if pc.has_index(index_name):
    pc.delete_index(index_name)

In [None]:
# Create the serverless index on first use; an existing index is left untouched.
# `dimension` must equal the embedding size (768 is used for roberta; the
# earlier 384 setting belonged to a different model).
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )

# Poll once per second until the index reports itself as ready.
while True:
    if pc.describe_index(index_name).status['ready']:
        break
    time.sleep(1)

In [None]:
index = pc.Index(index_name)

# One record per chunk: a unique id, the embedding as a plain list, and the
# original text stored as metadata so it can be returned with query matches.
records = [
    {
        "id": f"chunk_{i}",
        "values": embedding.tolist(),
        "metadata": {"text": chunk},
    }
    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings))
]

# Upload in fixed-size batches instead of one request: Pinecone limits the size
# of a single upsert request, and this also skips the call entirely when there
# are no records to send.
upsert_batch_size = 100
for batch_start in range(0, len(records), upsert_batch_size):
    index.upsert(
        vectors=records[batch_start:batch_start + upsert_batch_size],
        namespace="example-namespace",
    )
print("Embeddings uploaded successfully.")

In [None]:
from groq import Groq
import numpy as np

# Read the Groq API key from the environment so the secret never has to be
# written into (or committed with) the notebook.
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    raise RuntimeError(
        "Set the GROQ_API_KEY environment variable before running this cell."
    )
groq_client = Groq(api_key=groq_api_key)

In [None]:
def get_response_from_groq(context, query, model_name="llama3-70b-8192"):
    """
    Ask the Groq chat model a question, grounded on the supplied context.

    Args:
        context (str): Text placed in the system prompt for the model to answer from.
        query (str): The user's question, sent as the user message.
        model_name (str): Groq model ID to use. Defaults to the model this
            notebook was written against.
            NOTE(review): confirm this ID is still served by Groq.

    Returns:
        The message content of the first completion choice.
    """
    messages = [
        {
            "role": "system",
            "content": f"You are a helpful assistant and will answer questions based on the following context:\n\n{context}"
        },
        {
            "role": "user",
            "content": query
        }
    ]

    # Uses the module-level `groq_client` created in an earlier cell.
    response = groq_client.chat.completions.create(
        messages=messages,
        model=model_name
    )

    return response.choices[0].message.content

In [None]:
def retrieve_and_answer(query, top_k=3):
    """
    Answer a question from the indexed document (retrieve, then generate).

    The query is embedded with the module-level `model`, the closest chunks are
    fetched from the Pinecone `index`, and their text is passed as context to
    `get_response_from_groq`.

    Args:
        query (str): User's question
        top_k (int): Number of relevant chunks to retrieve

    Returns:
        dict with keys "answer" (the generated answer), "relevant_chunks"
        (list of retrieved chunk texts) and "context" (those texts joined
        with newlines).
    """
    # Embed the question with the same model used for the chunks.
    query_vector = model.encode(query).tolist()

    # Nearest-neighbour lookup; metadata carries the original chunk text.
    hits = index.query(
        vector=query_vector,
        top_k=top_k,
        namespace="example-namespace",
        include_metadata=True,
    )

    relevant_chunks = []
    for hit in hits.matches:
        relevant_chunks.append(hit.metadata["text"])
    context = "\n".join(relevant_chunks)

    return {
        "answer": get_response_from_groq(context, query),
        "relevant_chunks": relevant_chunks,
        "context": context,
    }

In [None]:
# Smoke test: send one question through the retrieve-then-answer pipeline and
# print the answer followed by each chunk that was retrieved for it.
query = "What is the document about?"
result = retrieve_and_answer(query)

print("Answer:", result["answer"])
print("\nRelevant chunks used:")
for i, chunk in enumerate(result["relevant_chunks"], start=1):
    # Header line and chunk text on separate lines, preceded by a blank line.
    print(f"\nChunk {i}:", chunk, sep="\n")

In [None]:
import gradio as gr

def gradio_interface(query):
    """
    Adapter between the Gradio UI and `retrieve_and_answer`.

    Args:
        query (str): Question typed into the UI.

    Returns:
        tuple: (answer text, retrieved chunks joined by blank lines), matching
        the two output boxes of the interface.
    """
    outcome = retrieve_and_answer(query)
    answer_text = outcome["answer"]
    chunks_text = "\n\n".join(outcome["relevant_chunks"])
    return answer_text, chunks_text

# Build the web UI: one question box in, two text boxes out (the answer and the
# retrieved chunks, in the order gradio_interface returns them).
interface = gr.Interface(
    title="Document Q&A",
    description="Ask questions about the uploaded document and get AI-powered answers.",
    fn=gradio_interface,
    inputs=gr.Textbox(label="Ask a question about the document"),
    outputs=[gr.Textbox(label="Answer"), gr.Textbox(label="Retrieved Chunks")],
)

# Launch the app.
interface.launch()