PDF_JSON

In [1]:
import re
import unicodedata

def normalize_slug(s: str, repl="-"):
    """Turn arbitrary text into a lowercase ASCII slug.

    The text is NFKC-normalized and lowercased, every run of characters
    outside ``[a-z0-9]`` is replaced by ``repl``, repeated separators are
    collapsed and leading/trailing separators are removed.

    Args:
        s: Text to slugify. ``None`` is accepted.
        repl: Separator that replaces non-alphanumeric runs. It is treated as
            literal text, so regex metacharacters such as ``"."`` or ``"+"``
            are safe to use.

    Returns:
        ``""`` when ``s`` is ``None``; ``"na"`` when nothing is left after
        cleaning; otherwise the slug.
    """
    if s is None:
        return ""
    s = unicodedata.normalize("NFKC", s).lower()
    # A callable replacement stops re.sub from interpreting backslashes in
    # `repl` as escapes or group references.
    s = re.sub(r"[^a-z0-9]+", lambda _m: repl, s)
    if repl:
        # Escape AND group the separator: without escaping, "." matches any
        # character and "+" is an invalid pattern; without grouping, the
        # quantifier would only apply to the separator's last character.
        sep = re.escape(repl)
        s = re.sub(rf"(?:{sep}){{2,}}", lambda _m: repl, s)
        # Trim whole separators from both ends.
        while s.startswith(repl):
            s = s[len(repl):]
        while s.endswith(repl):
            s = s[:-len(repl)]
    return s or "na"


In [2]:
from transformers import AutoTokenizer

# MiniLM compatible tokenizer
# Module-level tokenizer that chunk_by_tokens uses to encode text into token
# ids and decode each window back to text. It is loaded from the same
# checkpoint name that the notebook later uses for the embedding model.
tok = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

def chunk_by_tokens(text: str, max_tokens=384, overlap=64):
    """Yield overlapping windows of ``text`` measured in tokenizer tokens.

    The text is encoded once with the module-level ``tok`` (no special
    tokens). Windows of at most ``max_tokens`` ids are emitted, each one
    starting ``max_tokens - overlap`` ids after the previous one, until the
    end of the text is reached. Empty text yields nothing.

    Args:
        text: Text to split.
        max_tokens: Maximum number of tokens per window; must be >= 1.
        overlap: Number of tokens shared by consecutive windows; must satisfy
            ``0 <= overlap < max_tokens``.

    Yields:
        ``(start, end, chunk_text)`` where ``start``/``end`` are token offsets
        into the encoded text (``end`` exclusive) and ``chunk_text`` is the
        window decoded by ``tok.decode``. The decoded text is the tokenizer's
        reconstruction and may differ from the input in casing and spacing.

    Raises:
        ValueError: If ``max_tokens`` or ``overlap`` is out of range. Because
            this is a generator, the error surfaces on first iteration.
    """
    # Without these checks the window start would never advance (infinite
    # generator) or, for a negative overlap, tokens would be skipped silently.
    if max_tokens < 1:
        raise ValueError(f"max_tokens must be >= 1, got {max_tokens}")
    if not 0 <= overlap < max_tokens:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_tokens, got overlap={overlap}, max_tokens={max_tokens}"
        )

    ids = tok.encode(text, add_special_tokens=False)
    n = len(ids)
    step = max_tokens - overlap
    for start in range(0, n, step):
        end = min(start + max_tokens, n)
        yield start, end, tok.decode(ids[start:end], skip_special_tokens=True)
        if end == n:
            # The last window already reaches the end of the text.
            break

In [3]:
def pdf_to_chunks_tokenwise(pdf_path, meta, out_path, max_tokens=256, overlap=32):
    """Split a PDF into per-page token chunks and write them as JSON Lines.

    Each page's text is NFKC-normalized, whitespace-collapsed and split with
    ``chunk_by_tokens``; chunks never span pages. One JSON object per chunk is
    written to ``out_path`` (parent directories are created as needed) and a
    summary line is printed.

    Args:
        pdf_path: Path to the source PDF.
        meta: Document metadata. Required keys: ``country``, ``visa_type``,
            ``year``, ``doc_slug`` (used to build the document id). Optional
            keys copied into every chunk: ``source``, ``url``, ``version`` and
            the effective date (``effectivedate`` or ``effective_date``).
            Side effect: ``meta["docsha256"]`` is set to the file's SHA-256.
        out_path: Destination ``.jsonl`` file; overwritten if it exists.
        max_tokens: Maximum tokens per chunk, forwarded to ``chunk_by_tokens``.
        overlap: Token overlap between consecutive chunks of a page.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
        KeyError: If a required ``meta`` key is missing.
    """
    import fitz, time, json, hashlib
    from pathlib import Path

    pdf_path = Path(pdf_path)
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    # SHA256 of the file, streamed in 1 MiB blocks to keep memory flat.
    h = hashlib.sha256()
    with open(pdf_path, "rb") as f:
        for b in iter(lambda: f.read(1 << 20), b""):
            h.update(b)
    docsha = h.hexdigest()
    # Kept for backward compatibility: callers can read the digest from meta.
    meta["docsha256"] = docsha

    country_slug = normalize_slug(meta["country"])
    visa_slug = normalize_slug(meta["visa_type"])
    year_slug = normalize_slug(str(meta["year"]))
    doc_slug = normalize_slug(meta["doc_slug"])

    # Values that are identical for every chunk are computed once.
    docid = f"{country_slug}-{visa_slug}-{year_slug}-{doc_slug}"
    retrieved_at = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    # The record must use the same key the slug is built from; the old
    # "visatype" spelling is still honoured as a fallback.
    visa_type = meta.get("visa_type", meta.get("visatype"))
    # Accept both spellings so a caller passing "effective_date" is not
    # silently recorded as null.
    effective_date = meta.get("effectivedate", meta.get("effective_date"))

    all_chunks = []
    seq = 1  # running chunk number across the whole document

    doc = fitz.open(str(pdf_path))
    try:
        for pagenum in range(len(doc)):
            page = doc[pagenum]
            text = page.get_text("text")
            # Normalize whitespace/control chars to spaces first
            text = unicodedata.normalize("NFKC", text)
            text = re.sub(r"\s+", " ", text).strip()

            for tstart, tend, ctext in chunk_by_tokens(text, max_tokens, overlap):
                chunkid = f"{docid}-Pg{pagenum+1}-seq{seq:03d}"
                chunkmeta = {
                    "chunkid": chunkid,
                    "docid": docid,
                    "source": meta.get("source"),
                    "url": meta.get("url"),
                    "country": meta.get("country"),
                    "visa_type": visa_type,
                    "effectivedate": effective_date,
                    "version": meta.get("version"),
                    "docsha256": docsha,
                    "retrievedat": retrieved_at,
                    "page": pagenum + 1,
                    "pages": pagenum + 1,
                    "sectiontitle": None,
                    "language": "en",
                    "token_start": int(tstart),
                    "token_end": int(tend),
                    "text": ctext,
                }
                all_chunks.append(chunkmeta)
                seq += 1
    finally:
        # Release the PDF handle even if extraction or chunking fails.
        doc.close()

    out_path = Path(out_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        for ch in all_chunks:
            f.write(json.dumps(ch, ensure_ascii=False) + "\n")
    print(f"Saved {len(all_chunks)} chunks to {out_path}")

In [5]:
# Driver cell: chunk one PDF into a JSONL file using the helpers defined above.
if __name__ == "__main__":
    # NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
    pdf_file = r"C:\Users\HP\Downloads\Student+and+Child+Student(UK).pdf"
    output_file = r"C:\Users\HP\Desktop\Info_Spring\swiftvisa\data\processed\UK_StudentVisa_chunks.jsonl"

    # Metadata for the document. country, visa_type, year and doc_slug are
    # slugified into the document id; the other values are passed along as
    # descriptive metadata.
    meta_info = {
        "country": "UK",
        "visa_type": "Student and Child Student",
        "year": "2025",
        # NOTE(review): the slug says 2024 while year/effective_date say 2025; confirm which is intended.
        "doc_slug": "UK Student Visa Guide 2024",
        "source": "Student and Child Student",
        "url": "https://gov.uk/student-visa",
        # NOTE(review): confirm this key spelling matches the key the chunker reads.
        "effective_date": "2025-07-16",
        "version": "11.0"
    }

    pdf_to_chunks_tokenwise(pdf_file, meta_info, output_file)

Saved 253 chunks to C:\Users\HP\Desktop\Info_Spring\swiftvisa\data\processed\UK_StudentVisa_chunks.jsonl


JSON_EMBEDDINGS

In [6]:
from sentence_transformers import SentenceTransformer
import json
from pathlib import Path
import numpy as np

In [8]:
jsonl_path = r"C:\Users\HP\Desktop\Info_Spring\swiftvisa\data\processed\UK_StudentVisa_chunks.jsonl"

# Parse the JSONL file: every line is one chunk record.
chunks = []
with open(jsonl_path, "r", encoding="utf-8") as f:
    chunks.extend(map(json.loads, f))

print(f"Loaded {len(chunks)} chunks.")


Loaded 253 chunks.


In [9]:
# Initialize the embedding model
# Notebook-level sentence encoder used by the next cell to embed each chunk's
# text; the checkpoint name is the same one the chunking tokenizer was loaded from.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [11]:
# Generate embeddings
# Encode all texts in one call so the model can batch them instead of running
# one forward pass per chunk; the vectors should match per-chunk encoding up
# to floating-point rounding.
texts = [chunk['text'] for chunk in chunks]
vectors = model.encode(texts)
for chunk, vector in zip(chunks, vectors):
    chunk['embedding'] = vector.tolist()  # convert to list for JSON

# Save the chunks with embeddings
embeddings_path = Path(r"C:\Users\HP\Desktop\Info_Spring\swiftvisa\index\chunks_with_embeddings.jsonl")
# Create the output directory first, as the chunk writer does, so a fresh
# checkout does not fail with FileNotFoundError.
embeddings_path.parent.mkdir(parents=True, exist_ok=True)
with open(embeddings_path, "w", encoding="utf-8") as f:
    for chunk in chunks:
        f.write(json.dumps(chunk) + "\n")

print("Embeddings created and saved successfully!")

Embeddings created and saved successfully!


RAG PIPELINE

In [12]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import json
from transformers import pipeline

In [13]:
# 1) Build FAISS index with cosine behavior
def build_faiss_index(chunks_path):
    """Load embedded chunks from a JSONL file and index them for cosine search.

    Args:
        chunks_path: Path to a JSONL file in which every record carries an
            ``"embedding"`` list of floats.

    Returns:
        ``(model, index, chunks)``: the sentence encoder to embed queries
        with, a ``faiss.IndexFlatIP`` over the L2-normalized embeddings, and
        the list of chunk records in index order (row ``i`` of the index
        corresponds to ``chunks[i]``).

    Raises:
        ValueError: If the file contains no chunk records.
    """
    # Read and validate the data before paying for the model load.
    with open(chunks_path, "r", encoding="utf-8") as f:
        # Blank lines (e.g. a trailing newline) are not records.
        chunks = [json.loads(line) for line in f if line.strip()]
    if not chunks:
        raise ValueError(f"No chunks found in {chunks_path}")

    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    # Collect embeddings and normalize to unit length
    embeddings = np.array([chunk["embedding"] for chunk in chunks], dtype="float32")
    faiss.normalize_L2(embeddings)  # in-place normalization to unit norm

    # Use inner product index (dot product == cosine on unit vectors)
    d = embeddings.shape[1]
    index = faiss.IndexFlatIP(d)
    index.add(embeddings)
    return model, index, chunks



In [14]:
# 2) Retrieval with normalized query
def retrieve_chunks(query, model, index, chunks, top_k=5):
    """Return the chunks most similar to ``query``, best match first.

    Args:
        query: Question text to embed.
        model: Encoder exposing ``encode(query)`` that returns a 1-D vector.
        index: Index exposing ``search(vectors, k)`` that returns
            ``(scores, labels)``, as a FAISS index does.
        chunks: Chunk records in the same order as the index rows.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunk records; fewer when the index holds fewer rows.
    """
    q = model.encode(query).astype("float32").reshape(1, -1)
    # Normalize explicitly instead of relying on an in-place update through a
    # temporary reshape view; a zero vector is left untouched.
    norm = np.linalg.norm(q)
    if norm > 0:
        q = q / norm
    q = np.ascontiguousarray(q, dtype="float32")

    _scores, labels = index.search(q, top_k)
    # FAISS pads missing results with label -1; indexing chunks[-1] would
    # silently return the last chunk, so drop those slots.
    return [chunks[i] for i in labels[0] if i >= 0]


In [15]:
# -----------------------------
# 3. Load generator model
# -----------------------------
# Notebook-level text2text generation pipeline (model and tokenizer both
# loaded from "google/flan-t5-base"); it is called with the assembled prompt
# to produce the answer text.
generator = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    tokenizer="google/flan-t5-base",
    device=-1  # CPU; use 0 for GPU
)


Device set to use cpu


In [16]:
# -----------------------------
# 4. Answer generation
# -----------------------------
def generate_answer(query, model, index, chunks, top_k=5):
    """Answer ``query`` with the notebook-level ``generator`` over retrieved chunks.

    Args:
        query: Question to answer.
        model: Encoder used to embed the query.
        index: Vector index searched for similar chunks.
        chunks: Chunk records aligned with the index rows.
        top_k: Number of chunks to retrieve for the context.

    Returns:
        ``(answer, citations)`` where ``answer`` is the generated text and
        ``citations`` holds one ``"<source> — <text>"`` string per retrieved
        chunk. When nothing is retrieved the answer is
        ``"I cannot find relevant information."`` with no citations.
    """
    # Step 1: Retrieve
    retrieved = retrieve_chunks(query, model, index, chunks, top_k=top_k)
    if not retrieved:
        # Don't ask the generator to answer from an empty context.
        return "I cannot find relevant information.", []

    # Step 2: Prepare context
    context = "\n\n".join([f"Source: {r.get('source', 'Unknown')}\n{r['text']}" for r in retrieved])

    # Step 3: Build prompt
    # NOTE(review): with top_k=5 the recorded run reports a 1561-token prompt
    # against the model's 512-token maximum; consider a smaller top_k or a
    # context budget.
    prompt = f"""
    You are an expert eligibility officer.
    Using only the context below, answer the question truthfully.
    If the answer is not in the context, say "I cannot find relevant information."

    Context:
    {context}

    Question: {query}
    Answer:
    """

    # Step 4: Generate
    output = generator(prompt, max_new_tokens=200)

    # Step 5: Collect citations
    citations = [f"{r.get('source', 'Unknown')} — {r['text']}" for r in retrieved]

    return output[0]["generated_text"], citations



In [17]:
# -----------------------------
# 5. Run everything
# -----------------------------
# Build the index once; build_faiss_index returns the encoder, the FAISS
# index and the chunk records, which stay bound at notebook level.
chunks_path = r"C:\Users\HP\Desktop\Info_Spring\swiftvisa\index\chunks_with_embeddings.jsonl"
model, index, chunks = build_faiss_index(chunks_path)

# Ask one question and show the answer together with the supporting chunks.
query = "Is the student eligible for UK Student Visa?"
answer, citations = generate_answer(query, model, index, chunks, top_k=5)

print("\nFinal Answer:\n", answer)

print("\nCitations:\n", citations)

Token indices sequence length is longer than the specified maximum sequence length for this model (1561 > 512). Running this sequence through the model will result in indexing errors



Final Answer:
 Yes

Citations:
 ['Student and Child Student — page 94 of 107 published for home office staff on 16 july 2025 work conditions this page tells caseworkers what employment a student or child student can undertake in the uk. employment conditions are dependent on the type of sponsor the applicant will be studying at and the level of course they are studying, information on the amount of hours a student can work and the type of employment that is permitted can be found in appendix student st 26. periods of permission after a student has completed their course are considered to be outside of term - time for the purposes of any work conditions. type of sponsor course type level work permitted if study is at : • a higher education provider ( hep ) with a track record of compliance • overseas higher education institution and the student is on a short - term study - abroad programme in the uk full - time course at degree level or above then the following work is permitted : • pa

In [33]:
# `model`, `index` and `chunks` are reused from the "Run everything" cell
# above; rebuilding them here reloaded the encoder and re-indexed the same
# file for every query.
query = "I am from China and applying for a UK Student Visa. Do I need to prove my English language ability?"
answer, citations = generate_answer(query, model, index, chunks, top_k=5)

print("\nFinal Answer:\n", answer)
print("\nCitations:\n", citations)


Final Answer:
 Here is what I found based on the documents:

Source: Student and Child Student
• an interview • the sponsor ’ s own test or entrance exam the sponsor must, however, provide details of how they assessed the applicant on the cas. evidence of english language ability all sponsors must assess their prospective students ’ english language ability. the level of english required and the documents to be submitted depends on : • the level of course the applicant is studying • the type of institution at which the applicant is studying verifying documents the caseworker must carry out a verification check if : • they have reasonable doubts that a specified document is not genuine

Source: Student and Child Student
page 48 of 107 published for home office staff on 16 july 2025 student : english language requirement this page tells caseworkers about the english language requirement for applicants on the student route. applicants on the student route must meet a required level of en

In [22]:
# -----------------------------
# 4. Answer generation
# -----------------------------
def generate_answer(query, model, index, chunks, top_k=5, generator=None):
    """
    Answer `query` from the top retrieved chunks.

    When `generator` is given it is called with a prompt built from the
    retrieved context and its stripped output is the answer. Without a
    generator the retrieved context itself is returned as the answer.
    Returns an `(answer, citations)` tuple; if nothing is retrieved the answer
    is "I cannot find relevant information." and the citation list is empty.
    """
    hits = retrieve_chunks(query, model, index, chunks, top_k=top_k)
    if not hits:
        return "I cannot find relevant information.", []

    # Build the context blocks first, then the citation strings.
    context_blocks = []
    for hit in hits:
        context_blocks.append(f"Source: {hit.get('source', 'Unknown')}\n{hit['text']}")
    context = "\n\n".join(context_blocks)

    citations = []
    for hit in hits:
        citations.append(f"{hit.get('source', 'Unknown')} — {hit['text']}")

    # No LLM supplied: hand back the retrieved text verbatim.
    if generator is None:
        return f"Here is what I found based on the documents:\n\n{context}", citations

    # The indentation inside the literal is part of the prompt text.
    prompt = f"""
        You are an expert eligibility officer.
        Using only the context below, answer the question truthfully.
        If the answer is not in the context, say "I cannot find relevant information."

        Context:
        {context}

        Question: {query}
        Answer:
        """
    generated = generator(prompt, max_new_tokens=200)
    return generated[0]["generated_text"].strip(), citations


In [23]:
# `model`, `index` and `chunks` are reused from the "Run everything" cell
# earlier in the notebook; rebuilding them here reloaded the encoder and
# re-indexed the same file for every query.
query = "What are the eligibility requirements for a UK Student Visa?"
answer, citations = generate_answer(query, model, index, chunks, top_k=5)

print("\nFinal Answer:\n", answer)
print("\nCitations:\n", citations)


Final Answer:
 Here is what I found based on the documents:

Source: Student and Child Student
page 9 of 107 published for home office staff on 16 july 2025 validity for entry clearance and permission to stay applications this page tells caseworkers where to find the validity requirements that an applicant must meet when they apply for entry clearance or permission to stay as a student, a child student or a dependant of a student. before considering any application, the caseworker must check the application is valid by referring to : • the validity requirements for the student route, contained in appendix student st 1. 1 to 1. 5 • the validity requirements for the child student route, contained in appendix child student cs 1. 1 to 1. 5 • the validity requirements for dependants of a student, contained in appendix student st 28. 1 to 28. 4 detailed guidance on how to assess the validity requirements can be found in the validation, variation, voiding and withdrawing of applications guid

In [24]:
import numpy as np

# -----------------------------
# 2) Retrieval with threshold
# -----------------------------
def retrieve_chunks(query, model, index, chunks, top_k=5, min_score=0.3):
    """
    Retrieve top chunks for a query from FAISS index.
    Only return chunks with similarity >= min_score.

    Args:
        query: Question text to embed.
        model: Encoder exposing ``encode(query, convert_to_numpy=...,
            normalize_embeddings=...)``.
        index: Index exposing ``search(vectors, k)`` that returns
            ``(scores, labels)``.
        chunks: Chunk records in the same order as the index rows.
        top_k: Maximum number of candidates to fetch from the index.
        min_score: Minimum similarity score a candidate needs to be kept.

    Returns:
        The matching chunk records, best match first; possibly empty.
    """
    # Encode query
    q_emb = model.encode(query, convert_to_numpy=True, normalize_embeddings=True)
    q_emb = np.asarray(q_emb, dtype="float32").reshape(1, -1)
    # Re-normalize explicitly rather than through an in-place update on a
    # temporary reshape view; a zero vector is left untouched.
    norm = np.linalg.norm(q_emb)
    if norm > 0:
        q_emb = q_emb / norm
    q_emb = np.ascontiguousarray(q_emb, dtype="float32")

    # Search
    scores, labels = index.search(q_emb, top_k)

    # FAISS pads missing results with label -1. With a low enough min_score
    # those slots would pass the threshold and chunks[-1] would return the
    # last chunk, so they are filtered out by label as well.
    return [
        chunks[idx]
        for score, idx in zip(scores[0], labels[0])
        if idx >= 0 and score >= min_score
    ]

In [25]:

# -----------------------------
# 4) Answer generation
# -----------------------------
def generate_answer(query, model, index, chunks, top_k=5, generator=None, min_score=0.3):
    """
    Answer `query` from chunks whose similarity passes `min_score`.

    - Retrieval is delegated to `retrieve_chunks` with `top_k` and `min_score`.
    - If nothing is retrieved, returns ("No relevant information found.", []).
    - If `generator` is given, it is prompted with the retrieved context and
      its stripped output is the answer.
    - Otherwise the retrieved context itself is returned as the answer.

    The second element of the returned tuple lists one
    "<source> — <text>" citation per retrieved chunk.
    """
    matches = retrieve_chunks(query, model, index, chunks, top_k=top_k, min_score=min_score)
    if len(matches) == 0:
        return "No relevant information found.", []

    # Context for the prompt / fallback answer, then the citation strings.
    context = "\n\n".join(
        f"Source: {match.get('source', 'Unknown')}\n{match['text']}" for match in matches
    )
    citations = [
        f"{match.get('source', 'Unknown')} — {match['text']}" for match in matches
    ]

    # Fallback path: no LLM supplied, return the retrieved text as-is.
    if generator is None:
        fallback = f"Here is what I found based on the documents:\n\n{context}"
        return fallback, citations

    # The indentation inside the literal is part of the prompt text.
    prompt = f"""
        You are an expert eligibility officer.
        Using only the context below, answer the question truthfully.
        If the answer is not in the context, say "I cannot find relevant information."

        Context:
        {context}

        Question: {query}
        Answer:
        """
    result = generator(prompt, max_new_tokens=200)
    answer = result[0]["generated_text"].strip()
    return answer, citations


In [28]:
# `model`, `index` and `chunks` are reused from the "Run everything" cell
# earlier in the notebook; rebuilding them here reloaded the encoder and
# re-indexed the same file for every query.
query = "How to clean my system?"
answer, citations = generate_answer(query, model, index, chunks, top_k=5)

print("\nFinal Answer:\n", answer)
print("\nCitations:\n", citations)


Final Answer:
 No relevant information found.

Citations:
 []


In [29]:
# `model`, `index` and `chunks` are reused from the "Run everything" cell
# earlier in the notebook; rebuilding them here reloaded the encoder and
# re-indexed the same file for every query.
query = "What documents are generally required for a visa application?"
answer, citations = generate_answer(query, model, index, chunks, top_k=5)

print("\nFinal Answer:\n", answer)
print("\nCitations:\n", citations)


Final Answer:
 Here is what I found based on the documents:

Source: Student and Child Student
................... 82 mandatory documents.......................................................................................... 82

Source: Student and Child Student
arrangements, all student and child student applicants must still submit the following with their application : • their passport or other travel document proving identity and nationality • biometric residence permit ( if applicable ) • cas reference number student and child student applicants must provide an academic technology approval scheme ( atas ) clearance certificate, if required. student and child student applicants must also provide a valid tuberculosis screening certificate, if required.

Source: Student and Child Student
• an interview • the sponsor ’ s own test or entrance exam the sponsor must, however, provide details of how they assessed the applicant on the cas. evidence of english language ability all spons