In [None]:
# ================= CONFIG =================
import os

DATA_DIR = "data"                  # Folder with PDFs

# Secrets are read from the environment so they are never stored in the
# notebook (or leaked through version control / shared copies).
# Each value is None when the corresponding variable is not set.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

INDEX_NAME = "rag-index"
CHUNK_SIZE = 800
CHUNK_OVERLAP = 150
EMBEDDING_DIM = 384   # all-MiniLM-L6-v2

# ========================================


In [23]:
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader

# Step 1: load every file in DATA_DIR that matches *.pdf, using PyMuPDFLoader
# as the per-file loader.
print("STEP 1: Loading PDFs...")

loader = DirectoryLoader(DATA_DIR, glob="*.pdf", loader_cls=PyMuPDFLoader)
raw_documents = loader.load()

# Report how many entries the loader returned.
print(f"Loaded {len(raw_documents)} raw PDF pages")


STEP 1: Loading PDFs...
Loaded 144 raw PDF pages


In [24]:
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader

# Step 1: load every file in DATA_DIR that matches *.pdf.
print("STEP 1: Loading PDFs...")

loader = DirectoryLoader(
    DATA_DIR,
    glob="*.pdf",
    loader_cls=PyMuPDFLoader
)

raw_documents = loader.load()
print(f"Loaded {len(raw_documents)} raw PDF pages")

# Fail fast with an actionable message instead of a bare IndexError on
# raw_documents[0] below when the folder yielded nothing.
if not raw_documents:
    raise FileNotFoundError(
        f"No PDF pages were loaded from {DATA_DIR!r}; "
        "check that the folder contains *.pdf files."
    )


# Step 2: show the first loaded entry so the raw text and metadata can be
# inspected before any cleaning is applied.
print("\nSTEP 2: Inspecting RAW document sample\n")

raw_doc = raw_documents[0]

print("RAW PAGE CONTENT (first 300 chars):\n")
print(raw_doc.page_content[:300])

print("\nRAW METADATA:\n")
print(raw_doc.metadata)


STEP 1: Loading PDFs...
Loaded 144 raw PDF pages

STEP 2: Inspecting RAW document sample

RAW PAGE CONTENT (first 300 chars):

Graduate visa
1. Overview
A Graduate visa gives you permission to stay in the UK for at least 18
months after successfully completing an eligible course in the UK.
You must be in the UK when you apply.
Eligibility
You can apply for a Graduate visa if all of the following are true:
you’re in the UK
y

RAW METADATA:

{'producer': 'Skia/PDF m143', 'creator': 'Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Mobile Safari/537.36', 'creationdate': '2025-12-16T12:23:56+00:00', 'source': 'data\\Print Graduate visa - GOV.UK.pdf', 'file_path': 'data\\Print Graduate visa - GOV.UK.pdf', 'total_pages': 13, 'format': 'PDF 1.4', 'title': 'Print Graduate visa - GOV.UK', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-12-16T12:23:56+00:00', 'trapped': '', 'modDate': "D:20251216122356+00'00'", 'creationDate': "D:2025121612

In [25]:
# Preview of the first loaded entry: the first 300 characters of its text,
# followed by its full metadata dict. Each print emits heading and value on
# separate lines via sep="\n".
print("\nRAW SAMPLE TEXT:", raw_documents[0].page_content[:300], sep="\n")
print("\nRAW METADATA:", raw_documents[0].metadata, sep="\n")



RAW SAMPLE TEXT:
Graduate visa
1. Overview
A Graduate visa gives you permission to stay in the UK for at least 18
months after successfully completing an eligible course in the UK.
You must be in the UK when you apply.
Eligibility
You can apply for a Graduate visa if all of the following are true:
you’re in the UK
y

RAW METADATA:
{'producer': 'Skia/PDF m143', 'creator': 'Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Mobile Safari/537.36', 'creationdate': '2025-12-16T12:23:56+00:00', 'source': 'data\\Print Graduate visa - GOV.UK.pdf', 'file_path': 'data\\Print Graduate visa - GOV.UK.pdf', 'total_pages': 13, 'format': 'PDF 1.4', 'title': 'Print Graduate visa - GOV.UK', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-12-16T12:23:56+00:00', 'trapped': '', 'modDate': "D:20251216122356+00'00'", 'creationDate': "D:20251216122356+00'00'", 'page': 0}


In [26]:
from langchain_core.documents import Document
from typing import List

# Progress banner for the metadata-cleaning step. Note that the raw-sample
# inspection cell earlier in the notebook also labels itself "STEP 2".
print("\nSTEP 2: Cleaning metadata...")

def clean_documents(docs: List[Document]) -> List[Document]:
    """Return copies of ``docs`` that keep only the text and the ``source`` metadata.

    Entries whose ``page_content`` is falsy or whitespace-only are dropped.
    Every other metadata key is discarded; ``source`` is None when the input
    metadata has no such key.

    Args:
        docs: Documents to clean.

    Returns:
        A new list of ``Document`` objects, in the same order as the input.
    """
    cleaned: List[Document] = []
    for doc in docs:
        # Skip pages with no usable text.
        if not doc.page_content or not doc.page_content.strip():
            continue
        source_only = {"source": doc.metadata.get("source")}
        cleaned.append(Document(page_content=doc.page_content, metadata=source_only))
    return cleaned

# Apply the cleaning to everything loaded in step 1; `documents` is the input
# to the chunking step that follows.
documents = clean_documents(raw_documents)
print(f"Cleaned {len(documents)} documents")



STEP 2: Cleaning metadata...
Cleaned 144 documents


In [27]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Step 3: split the cleaned documents into overlapping chunks using the
# CHUNK_SIZE / CHUNK_OVERLAP values from the config cell.
print("\nSTEP 3: Chunking documents...")

splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
chunks = splitter.split_documents(documents)

# Summary of the settings used and the number of chunks produced.
print(
    f"Chunk size     : {CHUNK_SIZE}",
    f"Chunk overlap : {CHUNK_OVERLAP}",
    f"Total chunks  : {len(chunks)}",
    sep="\n",
)



STEP 3: Chunking documents...
Chunk size     : 800
Chunk overlap : 150
Total chunks  : 368


In [28]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Step 3: split the cleaned documents into overlapping chunks using the
# CHUNK_SIZE / CHUNK_OVERLAP values from the config cell.
print("\nSTEP 3: Chunking documents...")

splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
chunks = splitter.split_documents(documents)

# Summary of the settings used and the number of chunks produced.
print(
    f"Chunk size     : {CHUNK_SIZE}",
    f"Chunk overlap : {CHUNK_OVERLAP}",
    f"Total chunks  : {len(chunks)}",
    sep="\n",
)

# -------- VIEW CHUNKS --------
N = 3  # number of chunks to preview

# Print the first N chunks, each framed by 500-character separator rules.
for i, chunk in enumerate(chunks[:N], start=1):
    print(
        "\n" + "=" * 500,
        f"Chunk {i}",
        "-" * 500,
        chunk.page_content,
        "=" * 500,
        sep="\n",
    )



STEP 3: Chunking documents...
Chunk size     : 800
Chunk overlap : 150
Total chunks  : 368

Chunk 1
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Graduate visa
1. Overview
A Graduate visa gives you permission to stay in the UK for at least 18
months after successfully completing an eligible course in the UK.
You must be in the UK when you apply.
Eligibility
You can apply for a Graduate visa if all of the following are true:
you’re in the UK
your current visa is a Student visa or Tier 4 (General) student visa
(/student-visa)
you studied a 

In [29]:
import torch
from langchain_community.embeddings import HuggingFaceEmbeddings

# Step 4: load the sentence-transformers/all-MiniLM-L6-v2 embedding model,
# placing it on CUDA when torch reports a GPU and on the CPU otherwise.
print("\nSTEP 4: Loading embedding model...")

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs=dict(device=("cuda" if torch.cuda.is_available() else "cpu")),
)

print("Embedding model loaded")



STEP 4: Loading embedding model...
Embedding model loaded


In [30]:
from pinecone import Pinecone

# Step 5: connect to Pinecone and make sure the target index exists.
print("\nSTEP 5: Initializing Pinecone...")

# Client authenticated with the key defined in the config cell.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the index only when it does not exist yet.
# NOTE: an existing index is reused as-is -- it is NOT deleted or recreated --
# so vectors uploaded by earlier runs remain in it.
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=EMBEDDING_DIM,
        metric="cosine",
        spec={"serverless": {"cloud": "aws", "region": "us-east-1"}}
    )

# Handle used by the upload and query cells.
index = pc.Index(INDEX_NAME)
print("Pinecone index ready")



STEP 5: Initializing Pinecone...
Pinecone index ready


In [31]:
from uuid import uuid4

from uuid import NAMESPACE_URL, uuid5

# Step 6: embed every chunk and upload it to the Pinecone index.
print("\nSTEP 6: Uploading vectors to Pinecone...")

# Number of records sent per upsert request; bounded batches keep each request
# small no matter how many chunks the corpus produces.
UPSERT_BATCH_SIZE = 100

# Embed all chunks in one batched call instead of one model call per chunk.
chunk_vectors = embeddings.embed_documents([doc.page_content for doc in chunks])

vectors = []
for position, (doc, vector) in enumerate(zip(chunks, chunk_vectors)):
    source = doc.metadata.get("source")
    vectors.append((
        # Deterministic id derived from source + chunk position: re-running this
        # cell overwrites the same records instead of adding duplicates, which
        # random uuid4 ids did on every run against the reused index.
        str(uuid5(NAMESPACE_URL, f"{source}#{position}")),
        vector,
        {
            "source": source,
            "text": doc.page_content,  # stored so retrieval can return the passage text
        }
    ))

# Upload in bounded batches; with no chunks the loop body never runs, so no
# empty upsert request is sent.
for start in range(0, len(vectors), UPSERT_BATCH_SIZE):
    index.upsert(vectors=vectors[start:start + UPSERT_BATCH_SIZE])

print(f"Uploaded {len(vectors)} vectors")



STEP 6: Uploading vectors to Pinecone...
Uploaded 368 vectors


In [32]:
def retrieve_context(question: str, top_k: int = 5) -> str:
    """Fetch the stored passages closest to ``question`` from the Pinecone index.

    The question is embedded with the notebook-level ``embeddings`` model and
    used to query the notebook-level ``index``.

    Args:
        question: Text to search for.
        top_k: Number of matches to request from the index.

    Returns:
        The stripped ``text`` metadata of each match, joined by blank lines.
        Matches without a non-empty ``text`` value are skipped, so the result
        is an empty string when nothing usable comes back.
    """
    response = index.query(
        vector=embeddings.embed_query(question),
        top_k=top_k,
        include_metadata=True,
    )

    # Pull the stored passage text out of every match, dropping missing/empty ones.
    stored_texts = (hit["metadata"].get("text") for hit in response.get("matches", []))
    return "\n\n".join(passage.strip() for passage in stored_texts if passage)


In [1]:
import os
from groq import Groq


# -------------------------------------------------------------------
# Configuration
# -------------------------------------------------------------------

# The Groq key must come from the environment; stop immediately when it is
# missing or empty rather than failing later on the first API call.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

if GROQ_API_KEY is None or GROQ_API_KEY == "":
    raise EnvironmentError("GROQ_API_KEY not found. Please set it as an environment variable.")

# Shared client used by generate_answer.
client = Groq(api_key=GROQ_API_KEY)


# -------------------------------------------------------------------
# Answer Generation Function (Visa RAG)
# -------------------------------------------------------------------

def generate_answer(
    question: str,
    context: str,
    model: str = "llama-3.1-8b-instant",
    temperature: float = 0.2,
) -> str:
    """
    Generates a visa-related answer strictly from retrieved document context.

    Args:
        question (str): User query
        context (str): Retrieved text from visa documents
        model (str): Chat model name passed to the Groq client
            (defaults to the previously hard-coded model)
        temperature (float): Sampling temperature passed to the Groq client
            (defaults to the previously hard-coded value)

    Returns:
        str: The model's answer with surrounding whitespace removed; a fixed
        fallback message when ``context`` is empty or blank (no API call is
        made in that case); or a string starting with
        "Error generating response:" when the API call fails or yields no
        content. Errors are returned as text, never raised.
    """

    # Without any retrieved context there is nothing to ground an answer on.
    if not context or not context.strip():
        return "No relevant information found in the provided visa documents."

    prompt = f"""
You are a Visa Eligibility Assistant powered by document retrieval.

INSTRUCTIONS (follow strictly):
1. Answer ONLY using the information provided in the Context.
2. Do NOT use external knowledge, assumptions, or general visa rules.
3. If the answer is not present in the Context, respond exactly with:
   "The requested information is not available in the provided visa documents."
4. Keep the answer factual, concise, and professional.
5. Do NOT provide legal advice or personal recommendations.
6. Reference visa type, eligibility criteria, or required documents only if explicitly mentioned.

Context (Authoritative Source):
--------------------------------
{context}
--------------------------------

User Question:
{question}

Answer:
"""

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt}
            ],
            temperature=temperature
        )

        content = response.choices[0].message.content
        # A completion can carry no text; report that explicitly instead of
        # surfacing an AttributeError message from calling .strip() on None.
        if content is None:
            return "Error generating response: the model returned an empty completion."

        return content.strip()

    except Exception as e:
        # Deliberate best-effort: the caller always receives a string.
        return f"Error generating response: {e}"


In [34]:
def rag_query(question: str) -> str:
    """Answer ``question`` end to end: retrieve context, then generate the answer.

    Prints a progress line before each of the two stages.

    Args:
        question: The user's question.

    Returns:
        Whatever ``generate_answer`` returns for the question and the
        retrieved context.
    """
    print("\n Retrieving context from Pinecone...")
    retrieved = retrieve_context(question)

    print("Generating answer with LLM...")
    answer_text = generate_answer(question, retrieved)
    return answer_text


In [37]:
# Demo: run one question through the full retrieve-then-generate pipeline
# and show the resulting answer text.
question = "what is student visa"

answer = rag_query(question)

print(answer)



 Retrieving context from Pinecone...
Generating answer with LLM...
A Student visa is a type of visa that allows you to study in the UK if you meet certain eligibility criteria. 

To be eligible for a Student visa, you must:
- have been offered a place on a course by a licensed student sponsor
- have enough money to support yourself and pay for your course
- be able to speak, read, write and understand English
- have consent from your parents if you’re 16 or 17

This visa has replaced the Tier 4 (General) student visa.
