In [15]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import json
from transformers import pipeline

In [16]:
# 1) Build FAISS index with cosine behavior
def build_faiss_index(chunks_path):
    """
    Load pre-embedded chunks from a JSONL file and index them for cosine search.

    Every non-blank line of ``chunks_path`` is parsed as one JSON object whose
    "embedding" entry holds that chunk's vector. The vectors are L2-normalised
    and stored in a flat inner-product index, so the scores returned by a
    search are cosine similarities.

    Args:
        chunks_path: Path to the UTF-8 JSONL file of chunks.

    Returns:
        A ``(model, index, chunks)`` tuple: the SentenceTransformer used to
        encode queries, the populated ``faiss.IndexFlatIP``, and the parsed
        chunks in the same order as their rows in the index.

    Raises:
        ValueError: If the file contains no chunks, or the embeddings do not
            form a 2-D matrix.
    """
    # Read and validate the data before loading the encoder, so a bad or empty
    # file fails fast instead of after an expensive model load.
    chunks = []
    with open(chunks_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():  # tolerate blank lines in the JSONL file
                chunks.append(json.loads(line))
    if not chunks:
        raise ValueError(f"No chunks found in {chunks_path!r}")

    # Collect embeddings and normalize to unit length
    embeddings = np.array([chunk["embedding"] for chunk in chunks], dtype="float32")
    if embeddings.ndim != 2:
        raise ValueError(
            f"Expected a 2-D embedding matrix, got shape {embeddings.shape}"
        )
    faiss.normalize_L2(embeddings)  # in-place normalization to unit norm

    # Use inner product index (dot product == cosine on unit vectors)
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)

    # NOTE(review): assumes the stored embeddings were produced by this same
    # model, otherwise query and chunk vectors are not comparable; confirm.
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    return model, index, chunks

In [17]:
# 2) Retrieval with normalized query
def retrieve_chunks(query, model, index, chunks, top_k=5):
    """
    Return the chunks whose embeddings are closest to ``query``.

    Args:
        query: Text to search for.
        model: Encoder exposing ``encode(query)`` and returning a vector.
        index: Search index exposing ``search(matrix, k)`` and returning
            ``(scores, ids)``.
        chunks: Sequence of chunks, positionally aligned with the index rows.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of at most ``top_k`` chunks, in the order the index ranked them.
    """
    q = np.asarray(model.encode(query), dtype="float32").reshape(1, -1)

    # Normalise explicitly rather than mutating the vector in place through a
    # reshape view; a zero vector is left untouched.
    norm = np.linalg.norm(q)
    if norm > 0:
        q = q / norm
    q = np.ascontiguousarray(q, dtype="float32")

    _, I = index.search(q, top_k)

    # FAISS pads the id row with -1 when fewer than top_k vectors are
    # available; indexing chunks[-1] would silently return the last chunk.
    return [chunks[i] for i in I[0] if i >= 0]

In [18]:
# -----------------------------
# 3. Load generator model
# -----------------------------
# Text-to-text pipeline used to synthesise answers; the model and the
# tokenizer are loaded from the same checkpoint name.
generator = pipeline(
    task="text2text-generation",
    model="google/flan-t5-base",
    tokenizer="google/flan-t5-base",
    device=-1,  # CPU; use 0 for GPU
)

Device set to use cpu


In [19]:
# -----------------------------
# 4. Answer generation
# -----------------------------
def generate_answer(query, model, index, chunks, top_k=5):
    """
    Answer ``query`` from the retrieved chunks using the module-level
    ``generator`` pipeline.

    Args:
        query: The user's question.
        model: Encoder passed through to ``retrieve_chunks``.
        index: Search index passed through to ``retrieve_chunks``.
        chunks: Chunk records; each retrieved one must have a "text" entry and
            may have a "source" entry.
        top_k: Number of chunks to request from retrieval.

    Returns:
        ``(answer, citations)``: the generated text and one
        "source — text" string per retrieved chunk. When retrieval returns
        nothing, the answer is "I cannot find relevant information." and the
        citation list is empty.
    """
    # Step 1: Retrieve
    retrieved = retrieve_chunks(query, model, index, chunks, top_k=top_k)
    if not retrieved:
        # Nothing to ground the answer on: do not ask the model to answer
        # from an empty context.
        return "I cannot find relevant information.", []

    # Step 2: Prepare context
    context = "\n\n".join(
        [f"Source: {r.get('source', 'Unknown')}\n{r['text']}" for r in retrieved]
    )

    # Step 3: Build prompt
    # NOTE(review): the recorded run of this notebook shows the tokenizer
    # warning that the prompt exceeds the model's maximum sequence length
    # (2034 > 512); consider a smaller top_k or shorter chunks.
    prompt = f"""
    You are an expert eligibility officer.
    Using only the context below, answer the question truthfully.
    If the answer is not in the context, say "I cannot find relevant information."

    Context:
    {context}

    Question: {query}
    Answer:
    """

    # Step 4: Generate
    output = generator(prompt, max_new_tokens=200)

    # Step 5: Collect citations (source plus the full chunk text)
    citations = [f"{r.get('source', 'Unknown')} — {r['text']}" for r in retrieved]

    return output[0]["generated_text"].strip(), citations


In [20]:
# -----------------------------
# 5. Run everything
# -----------------------------
# Build the encoder, index and chunk list once from the embedded chunks file.
chunks_path = "data/chunks_with_embeddings.jsonl"
model, index, chunks = build_faiss_index(chunks_path)

# Ask a single question and show the answer with its supporting chunks.
query = "Is the student eligible for UK Student Visa?"
answer, citations = generate_answer(query, model, index, chunks, top_k=5)

print(f"\nFinal Answer:\n {answer}")

print(f"\nCitations:\n {citations}")

Token indices sequence length is longer than the specified maximum sequence length for this model (2034 > 512). Running this sequence through the model will result in indexing errors



Final Answer:
 Yes

Citations:
 ['Student and Child Student — Page 47 of 75  Published for Home Office staff on 22 July 2025 \n \nOption New entrant criteria \nInformation required from sponsor \nThe sponsor must confirm which \nrecognised professional qualification \nthe applicant is working towards – \nthis must be a UK qualification \naccepted by the regulatory body for \nthe profession. \n4. \nApplicant is working towards full \nregistration or chartered status \nwith the relevant professional \nbody for the job they are \nsponsored for. \n \nThe sponsor must confirm who the \nrelevant professional body is and the \napplicant is working towards full \nregistration or chartered status.  If \nnecessary, you can check the list of \nchartered bodies from the Privy \nCouncil. \n5. \nApplicant’s most recent \npermission, other than as a \nvisitor, was under Tier 4 \n(General) or the Student route. \n \nThe permission must either be \ncurrent or have expired less than \n2 years before th

In [21]:
# Reuses `model`, `index` and `chunks` from the earlier run cell that calls
# build_faiss_index; rebuilding them here would reload the encoder and
# re-read the same JSONL file without changing the result.
query = "What are the eligibility requirements for a UK Student Visa?"
answer, citations = generate_answer(query, model, index, chunks, top_k=5)

print("\nFinal Answer:\n", answer)
print("\nCitations:\n", citations)


Final Answer:
 a UK bachelor's degree • a UK master's degree • a UK PhD or other doctoral qualification • a UK PhD or other doctoral qualification • a Postgraduate Certificate in Education • a Professional Graduate Diploma of Education The applicant must have completed (or be applying no more than 3 months before they are expected to complete) the course. An exception applies to PhDs and other doctoral qualifications, where the applicant must have completed at least 12 months’ study in the UK towards a PhD or other doctoral qualification. Confirmation (from the applicant or their sponsor) the applicant has completed (or be applying no more than 3 months before they are expected to complete) the course. An exception applies to PhDs and other doctoral qualifications, where the applicant must have completed at least 12 months’ study in the UK towards a PhD or other doctoral qualification. Confirmation (from the applicant or their sponsor) the applicant has completed (or

Citations:
 ['St

In [22]:
# Reuses `model`, `index` and `chunks` from the earlier run cell that calls
# build_faiss_index; rebuilding them here would reload the encoder and
# re-read the same JSONL file without changing the result.
query = "I am from Canada and applying for a UK Student Visa. Do I need to prove my English language ability?"
answer, citations = generate_answer(query, model, index, chunks, top_k=5)

print("\nFinal Answer:\n", answer)
print("\nCitations:\n", citations)


Final Answer:
 Yes

Citations:
 ['Student and Child Student — Page 28 of 75  Published for Home Office staff on 22 July 2025 \n \nEnglish language \nThis section explains how to assess the English language requirement for Skilled \nWorkers. \n \nThe applicant must score 10 points for English language skills equivalent to level B1 \nof the Common European Framework of References for English language in all 4 \ncomponents (reading, writing, speaking and listening). To award these points, you \nmust be satisfied the application meets the requirements in paragraphs SW 7.1. to \nSW 7.4. \n \nTo assess whether the requirement is met, you should refer to the English language \nguidance. Non-POISE see: English language guidance. \n \nRelated content \nContents \nMandatory points requirement', 'Student and Child Student — Page 10 of 75  Published for Home Office staff on 22 July 2025 \n \nVerifying documents \nYou must conduct verification checks if you have any doubts about whether the \nsupp

In [23]:
# -----------------------------
# 4. Answer generation
# -----------------------------
def generate_answer(query, model, index, chunks, top_k=5, generator=None):
    """
    Produce an answer and its citations for ``query``.

    The top chunks are fetched with ``retrieve_chunks``. When ``generator`` is
    given it is called with a prompt built from those chunks and its stripped
    "generated_text" is returned; when it is None the retrieved context itself
    is returned behind a short preamble. If retrieval yields nothing, the
    result is ("I cannot find relevant information.", []).
    """
    hits = retrieve_chunks(query, model, index, chunks, top_k=top_k)
    if not hits:
        return "I cannot find relevant information.", []

    # Build the context blocks and the citation strings in a single pass.
    context_parts = []
    citations = []
    for hit in hits:
        source = hit.get('source', 'Unknown')
        body = hit['text']
        context_parts.append(f"Source: {source}\n{body}")
        citations.append(f"{source} — {body}")
    context = "\n\n".join(context_parts)

    # No LLM supplied: hand back the raw context.
    if generator is None:
        return f"Here is what I found based on the documents:\n\n{context}", citations

    # LLM supplied: let it synthesise the answer from the context.
    prompt = f"""
        You are an expert eligibility officer.
        Using only the context below, answer the question truthfully.
        If the answer is not in the context, say "I cannot find relevant information."

        Context:
        {context}

        Question: {query}
        Answer:
        """
    generated = generator(prompt, max_new_tokens=200)
    return generated[0]["generated_text"].strip(), citations


In [24]:
# Reuses `model`, `index` and `chunks` from the earlier run cell that calls
# build_faiss_index; rebuilding them here would reload the encoder and
# re-read the same JSONL file without changing the result.
# No generator is passed, so this exercises the concatenated-chunks fallback.
query = "What are the eligibility requirements for a UK Student Visa?"
answer, citations = generate_answer(query, model, index, chunks, top_k=5)

print("\nFinal Answer:\n", answer)
print("\nCitations:\n", citations)


Final Answer:
 Here is what I found based on the documents:

Source: Student and Child Student
Page 47 of 75  Published for Home Office staff on 22 July 2025 
 
Option New entrant criteria 
Information required from sponsor 
The sponsor must confirm which 
recognised professional qualification 
the applicant is working towards – 
this must be a UK qualification 
accepted by the regulatory body for 
the profession. 
4. 
Applicant is working towards full 
registration or chartered status 
with the relevant professional 
body for the job they are 
sponsored for. 
 
The sponsor must confirm who the 
relevant professional body is and the 
applicant is working towards full 
registration or chartered status.  If 
necessary, you can check the list of 
chartered bodies from the Privy 
Council. 
5. 
Applicant’s most recent 
permission, other than as a 
visitor, was under Tier 4 
(General) or the Student route. 
 
The permission must either be 
current or have expired less than 
2 years before t

In [25]:
# -----------------------------
# 2) Retrieval with threshold
# -----------------------------
def retrieve_chunks(query, model, index, chunks, top_k=5, min_score=0.3):
    """
    Retrieve top chunks for a query from the FAISS index.

    Args:
        query: Text to search for.
        model: Encoder exposing ``encode(query, convert_to_numpy=...,
            normalize_embeddings=...)``.
        index: Search index exposing ``search(matrix, k)`` and returning
            ``(scores, ids)``.
        chunks: Sequence of chunks, positionally aligned with the index rows.
        top_k: Maximum number of candidates requested from the index.
        min_score: Minimum score a candidate needs to be kept. ``None``
            disables the threshold and keeps every valid candidate.

    Returns:
        The kept chunks, in the order the index ranked them (possibly empty).
    """
    # The encoder is asked for a unit-length vector, so no second
    # normalisation pass is needed before the inner-product search.
    q_emb = np.asarray(
        model.encode(query, convert_to_numpy=True, normalize_embeddings=True),
        dtype="float32",
    )
    q_emb = np.ascontiguousarray(q_emb.reshape(1, -1))

    # Search
    D, I = index.search(q_emb, top_k)

    results = []
    for score, idx in zip(D[0], I[0]):
        if idx < 0:
            # FAISS pads with id -1 when fewer than top_k vectors exist;
            # chunks[-1] would silently be the last chunk.
            continue
        if min_score is None or score >= min_score:
            results.append(chunks[idx])
    return results

In [26]:
# -----------------------------
# 4) Answer generation
# -----------------------------
def generate_answer(
    query, model, index, chunks, top_k=5, generator=None, min_score=0.3
):
    """
    Answer ``query`` from chunks that pass the similarity threshold.

    - Chunks come from ``retrieve_chunks`` called with ``top_k`` and
      ``min_score``.
    - With a ``generator``, its stripped "generated_text" for the built prompt
      is the answer.
    - Without one, the answer is the retrieved context behind a short preamble.
    - When no chunk is retrieved, returns ("No relevant information found.", []).
    """
    hits = retrieve_chunks(
        query, model, index, chunks, top_k=top_k, min_score=min_score
    )
    if not hits:
        return "No relevant information found.", []

    # Build the context blocks and the citation strings in a single pass.
    context_parts = []
    citations = []
    for hit in hits:
        source = hit.get('source', 'Unknown')
        body = hit['text']
        context_parts.append(f"Source: {source}\n{body}")
        citations.append(f"{source} — {body}")
    context = "\n\n".join(context_parts)

    # Fallback when no generator is available: concatenated chunks.
    if generator is None:
        return f"Here is what I found based on the documents:\n\n{context}", citations

    # Generator available: build the prompt and let it write the answer.
    prompt = f"""
        You are an expert eligibility officer.
        Using only the context below, answer the question truthfully.
        If the answer is not in the context, say "I cannot find relevant information."

        Context:
        {context}

        Question: {query}
        Answer:
        """
    generated = generator(prompt, max_new_tokens=200)
    return generated[0]["generated_text"].strip(), citations


In [27]:
# Reuses `model`, `index` and `chunks` from the earlier run cell that calls
# build_faiss_index; rebuilding them here would reload the encoder and
# re-read the same JSONL file without changing the result.
# Off-topic question: checks that the similarity threshold rejects it.
query = "How to clean my room?"
answer, citations = generate_answer(query, model, index, chunks, top_k=5)

print("\nFinal Answer:\n", answer)
print("\nCitations:\n", citations)


Final Answer:
 No relevant information found.

Citations:
 []


In [28]:
# Reuses `model`, `index` and `chunks` from the earlier run cell that calls
# build_faiss_index; rebuilding them here would reload the encoder and
# re-read the same JSONL file without changing the result.
query = "What are the eligibility requirements for a UK Student Visa?"
answer, citations = generate_answer(query, model, index, chunks, top_k=5)

print("\nFinal Answer:\n", answer)
print("\nCitations:\n", citations)


Final Answer:
 Here is what I found based on the documents:

Source: Student and Child Student
Page 47 of 75  Published for Home Office staff on 22 July 2025 
 
Option New entrant criteria 
Information required from sponsor 
The sponsor must confirm which 
recognised professional qualification 
the applicant is working towards – 
this must be a UK qualification 
accepted by the regulatory body for 
the profession. 
4. 
Applicant is working towards full 
registration or chartered status 
with the relevant professional 
body for the job they are 
sponsored for. 
 
The sponsor must confirm who the 
relevant professional body is and the 
applicant is working towards full 
registration or chartered status.  If 
necessary, you can check the list of 
chartered bodies from the Privy 
Council. 
5. 
Applicant’s most recent 
permission, other than as a 
visitor, was under Tier 4 
(General) or the Student route. 
 
The permission must either be 
current or have expired less than 
2 years before t