# Visa Eligibility Screening Agent

### Importing Libraries

In [2]:
import re
import unicodedata
from sentence_transformers import SentenceTransformer
import json
from pathlib import Path
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from transformers import pipeline
from transformers import AutoTokenizer

### PDF to JSON format

In [3]:
def normalize_slug(s: str, repl="-"):
    """Convert arbitrary text into a lowercase ASCII slug.

    The text is NFKC-normalized and lowercased, every run of characters
    outside ``[a-z0-9]`` is replaced by ``repl``, repeated separators are
    collapsed, and leading/trailing separators are removed.

    Args:
        s: Text to slugify. ``None`` yields an empty string.
        repl: Separator inserted between alphanumeric runs. It is always
            treated as literal text, so regex metacharacters such as ``"."``
            or ``"+"`` are safe to use.

    Returns:
        The slug, ``"na"`` when nothing alphanumeric remains, or ``""`` when
        ``s`` is ``None``.
    """
    if s is None:
        return ""
    s = unicodedata.normalize("NFKC", s).lower()
    # A callable replacement keeps `repl` literal (no backslash/group expansion).
    s = re.sub(r"[^a-z0-9]+", lambda _m: repl, s)
    if repl:
        # Escape the separator: interpolating it raw made "." match any
        # character and "+" an invalid pattern.
        sep = re.escape(repl)
        # Collapse repeated separators into one.
        s = re.sub(rf"(?:{sep}){{2,}}", lambda _m: repl, s)
        # Trim leading/trailing separators.
        s = re.sub(rf"^(?:{sep})+|(?:{sep})+$", "", s)
    return s or "na"

In [4]:
# MiniLM compatible tokenizer.
# Module-level so chunk_by_tokens can reuse it; it is loaded from the same
# checkpoint name as the SentenceTransformer embedding model used later in
# this notebook, so chunk sizes are measured in that checkpoint's tokens.
tok = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

def chunk_by_tokens(text: str, max_tokens=384, overlap=64):
    """Yield overlapping token windows of ``text``.

    The text is encoded with the module-level tokenizer ``tok`` (no special
    tokens) and sliced into windows of at most ``max_tokens`` token ids, each
    window starting ``max_tokens - overlap`` tokens after the previous one.

    Args:
        text: Text to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.
        overlap: Tokens shared between consecutive chunks; must satisfy
            ``0 <= overlap < max_tokens``.

    Yields:
        ``(start, end, chunk_text)`` where ``start``/``end`` are token offsets
        into the encoded text (end exclusive) and ``chunk_text`` is the
        tokenizer's decoding of that window. The decoded text is the
        tokenizer's rendering and is not guaranteed to match the source
        characters exactly. Empty input yields nothing.

    Raises:
        ValueError: On the first iteration, if ``max_tokens`` or ``overlap``
            is out of range. Previously ``overlap >= max_tokens`` never
            advanced the window and looped forever.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")
    if not 0 <= overlap < max_tokens:
        raise ValueError("overlap must satisfy 0 <= overlap < max_tokens")

    ids = tok.encode(text, add_special_tokens=False)
    n = len(ids)
    # Each window starts `stride` tokens after the previous one.
    stride = max_tokens - overlap
    for start in range(0, n, stride):
        end = min(start + max_tokens, n)
        yield start, end, tok.decode(ids[start:end], skip_special_tokens=True)
        if end == n:
            # The last window already reaches the end of the text.
            break

In [5]:
def pdf_to_chunks_tokenwise(pdf_path, meta, out_path, max_tokens=256, overlap=32):
    """Split a PDF into per-page token chunks and write them as JSON Lines.

    Each page's text is NFKC-normalized, whitespace-collapsed and passed to
    ``chunk_by_tokens``; every resulting chunk becomes one JSON object (one
    line) in ``out_path`` carrying the chunk text plus document metadata.

    Args:
        pdf_path: Path to the source PDF.
        meta: Document metadata. ``country``, ``visa_type``, ``year`` and
            ``doc_slug`` are required (they build the document id). ``source``,
            ``url``, ``version`` and the effective date are optional; the
            effective date is read from ``effective_date`` or, for backward
            compatibility, ``effectivedate``. The dict is updated in place
            with a ``docsha256`` entry.
        out_path: Destination ``.jsonl`` file; parent directories are created.
        max_tokens: Maximum tokens per chunk, forwarded to ``chunk_by_tokens``.
        overlap: Token overlap between chunks, forwarded to ``chunk_by_tokens``.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
        KeyError: If a required ``meta`` key is missing.
    """
    import fitz
    import hashlib
    import json
    import time
    from pathlib import Path

    pdf_path = Path(pdf_path)
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    # SHA256 of the file, streamed in 1 MiB blocks so large PDFs are not
    # loaded into memory at once.
    h = hashlib.sha256()
    with open(pdf_path, "rb") as f:
        for b in iter(lambda: f.read(1 << 20), b""):
            h.update(b)
    docsha = h.hexdigest()
    # Kept for backward compatibility: the caller's dict receives the digest.
    meta["docsha256"] = docsha

    country_slug = normalize_slug(meta["country"])
    visa_slug = normalize_slug(meta["visa_type"])
    year_slug = normalize_slug(str(meta["year"]))
    doc_slug = normalize_slug(meta["doc_slug"])

    # Invariant across chunks, so computed once instead of per chunk.
    docid = f"{country_slug}-{visa_slug}-{year_slug}-{doc_slug}"
    retrieved_at = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    # The metadata key is "visa_type" (as used for the slug above); the old
    # lookup of "visatype" always produced None. Both spellings are accepted.
    visa_type = meta.get("visa_type", meta.get("visatype"))
    # Callers supply "effective_date"; "effectivedate" is still honoured.
    effective_date = meta.get("effective_date", meta.get("effectivedate"))

    all_chunks = []
    seq = 1  # running chunk number across the whole document, not per page

    doc = fitz.open(str(pdf_path))
    try:
        for pagenum in range(len(doc)):
            page = doc[pagenum]
            text = page.get_text("text")
            # Normalize whitespace/control chars to spaces first
            text = unicodedata.normalize("NFKC", text)
            text = re.sub(r"\s+", " ", text).strip()

            for tstart, tend, ctext in chunk_by_tokens(text, max_tokens, overlap):
                chunkid = f"{docid}-Pg{pagenum+1}-seq{seq:03d}"
                chunkmeta = {
                    "chunkid": chunkid,
                    "docid": docid,
                    "source": meta.get("source"),
                    "url": meta.get("url"),
                    "country": meta.get("country"),
                    "visa_type": visa_type,
                    "effectivedate": effective_date,
                    "version": meta.get("version"),
                    "docsha256": docsha,
                    "retrievedat": retrieved_at,
                    "page": pagenum + 1,
                    # Same value as "page"; kept so the output schema is unchanged.
                    "pages": pagenum + 1,
                    "sectiontitle": None,
                    "language": "en",
                    "token_start": int(tstart),
                    "token_end": int(tend),
                    "text": ctext,
                }
                all_chunks.append(chunkmeta)
                seq += 1
    finally:
        # Release the PDF handle even if extraction or chunking fails.
        doc.close()

    out_path = Path(out_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        for ch in all_chunks:
            f.write(json.dumps(ch, ensure_ascii=False) + "\n")
    print(f"Saved {len(all_chunks)} chunks to {out_path}")

In [7]:
# Driver cell: convert the UK guidance PDF into a JSONL file of token chunks.
if __name__ == "__main__":
    # Hard-coded, machine-specific Windows paths; adjust before running elsewhere.
    pdf_file = r"C:\Users\Anvitha\OneDrive\Documents\UK.pdf"
    output_file = r"C:\Users\Anvitha\OneDrive\Documents\UK_StudentVisa_chunks.jsonl"

    # Document-level metadata passed to pdf_to_chunks_tokenwise.
    # country / visa_type / year / doc_slug are slugified into the document id.
    # NOTE(review): doc_slug mentions 2024 while year is "2025" — confirm which is intended.
    meta_info = {
        "country": "UK",
        "visa_type": "Student and Child Student",
        "year": "2025",
        "doc_slug": "UK Student Visa Guide 2024",
        "source": "Student and Child Student",
        "url": "https://gov.uk/student-visa",
        "effective_date": "2025-07-16",
        "version": "11.0"
    }

    pdf_to_chunks_tokenwise(pdf_file, meta_info, output_file)

Saved 181 chunks to C:\Users\Anvitha\OneDrive\Documents\UK_StudentVisa_chunks.jsonl


### JSON Embeddings 

In [8]:
jsonl_path = r"C:\Users\Anvitha\OneDrive\Documents\UK_StudentVisa_chunks.jsonl"

# Read the chunk file back: one JSON object per line, parsed in file order.
chunks = []
with open(jsonl_path, "r", encoding="utf-8") as chunk_file:
    chunks.extend(map(json.loads, chunk_file))

print(f"Loaded {len(chunks)} chunks.")


Loaded 181 chunks.


In [9]:
# Initialize the embedding model.
# Notebook-level `model`; the embedding cell below calls model.encode on each
# chunk's text.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [11]:
# Generate embeddings.
# All chunk texts are encoded in one batched call rather than one
# model.encode call per chunk; encode returns one vector per input text,
# in input order.
texts = [chunk['text'] for chunk in chunks]
if texts:
    vectors = model.encode(texts)
    for chunk, vector in zip(chunks, vectors):
        chunk['embedding'] = vector.tolist()  # convert to list for JSON

# Save the chunks with embeddings (one JSON object per line)
with open(r"C:\Users\Anvitha\OneDrive\Documents\chunks_with_embeddings.jsonl", "w", encoding="utf-8") as f:
    for chunk in chunks:
        f.write(json.dumps(chunk) + "\n")

print("Embeddings created and saved successfully")

Embeddings created and saved successfully


### RAG Pipeline

In [13]:
# Build FAISS index with cosine behavior
def build_faiss_index(chunks_path):
    """Load embedded chunks from a JSONL file and build a cosine-similarity index.

    Each non-blank line of ``chunks_path`` must be a JSON object with an
    ``"embedding"`` list. The embeddings are L2-normalized and added to a
    ``faiss.IndexFlatIP`` index, so inner-product scores equal cosine
    similarity.

    Args:
        chunks_path: Path to the JSONL file of chunks with embeddings.

    Returns:
        ``(model, index, chunks)``: a freshly loaded SentenceTransformer for
        encoding queries, the FAISS index, and the list of chunk dicts in the
        same order as the index rows.

    Raises:
        ValueError: If the file contains no chunks, or the embeddings do not
            form a 2-D array (e.g. differing lengths).
        KeyError: If a chunk has no ``"embedding"`` entry.
    """
    chunks = []
    with open(chunks_path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if line.strip():
                chunks.append(json.loads(line))

    if not chunks:
        # Previously surfaced as an opaque IndexError on embeddings.shape[1].
        raise ValueError(f"No chunks found in {chunks_path}")

    # Collect embeddings and normalize to unit length
    embeddings = np.array([chunk["embedding"] for chunk in chunks], dtype="float32")
    if embeddings.ndim != 2:
        raise ValueError("Chunk embeddings must all be lists of the same length")
    faiss.normalize_L2(embeddings)  # in-place normalization to unit norm

    # Use inner product index (dot product == cosine on unit vectors)
    d = embeddings.shape[1]
    index = faiss.IndexFlatIP(d)
    index.add(embeddings)

    # Load the query encoder only after the input has been validated.
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    return model, index, chunks



In [16]:
#Retrieval with normalized query
def retrieve_chunks(query, model, index, chunks, top_k=5, min_score=0.3):
    """Retrieve the chunks most similar to ``query`` from a FAISS index.

    Args:
        query: Query text.
        model: Encoder exposing ``encode(text, convert_to_numpy=...,
            normalize_embeddings=...)``.
        index: FAISS index whose rows correspond positionally to ``chunks``.
        chunks: List of chunk records, in index order.
        top_k: Maximum number of hits to request.
        min_score: Minimum similarity a hit must reach to be returned.

    Returns:
        The matching chunk records, in the order returned by the index
        search. Empty when ``top_k`` is not positive, ``chunks`` is empty, or
        nothing reaches ``min_score``.
    """
    if top_k <= 0 or not chunks:
        return []

    # Encode query
    q_emb = model.encode(query, convert_to_numpy=True, normalize_embeddings=True).astype("float32")
    # FAISS expects a 2-D (n_queries, dim) array; normalize that same array
    # in place so the searched vector is unit length.
    q_vec = q_emb.reshape(1, -1)
    faiss.normalize_L2(q_vec)

    # Search
    D, I = index.search(q_vec, top_k)

    results = []
    for score, idx in zip(D[0], I[0]):
        # FAISS pads with label -1 when fewer than top_k vectors exist; without
        # this guard a permissive min_score would return chunks[-1].
        if idx < 0:
            continue
        if score >= min_score:
            results.append(chunks[int(idx)])
    return results

In [17]:
# Load generator model.
# Hugging Face text2text-generation pipeline over google/flan-t5-base; it is
# the object intended for generate_answer's `generator` parameter.
# device=-1 runs it on CPU (see the "Device set to use cpu" output below).
generator = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    tokenizer="google/flan-t5-base",
    device=-1  
)


Device set to use cpu


In [18]:
# Answer generation
def generate_answer(query, model, index, chunks, top_k=5, generator=None, min_score=0.3):
    """Answer ``query`` from the chunks retrieved for it.

    Args:
        query: The user's question.
        model: Query encoder, forwarded to ``retrieve_chunks``.
        index: FAISS index, forwarded to ``retrieve_chunks``.
        chunks: Chunk records, forwarded to ``retrieve_chunks``.
        top_k: Maximum number of chunks to retrieve.
        generator: Optional callable invoked as
            ``generator(prompt, max_new_tokens=200)`` that returns a list whose
            first item has a ``"generated_text"`` entry. When ``None``, the
            retrieved chunks are concatenated and returned instead.
        min_score: Similarity threshold, forwarded to ``retrieve_chunks``.

    Returns:
        ``(answer, citations)``. ``answer`` is the generated text, the
        concatenated-chunks fallback, or ``"No relevant information found."``
        when nothing passes the threshold (with an empty citation list).
        ``citations`` holds one ``"<source> — <text>"`` string per chunk.
    """
    # Retrieve top chunks with threshold
    retrieved = retrieve_chunks(query, model, index, chunks, top_k=top_k, min_score=min_score)
    if not retrieved:
        return "No relevant information found.", []

    # Prepare context and citations. `or` also covers chunks whose "source"
    # key exists but is None, which would otherwise render as "None".
    sources = [r.get('source') or 'Unknown' for r in retrieved]
    context = "\n\n".join(f"Source: {src}\n{r['text']}" for src, r in zip(sources, retrieved))
    citations = [f"{src} — {r['text']}" for src, r in zip(sources, retrieved)]

    # Use generator if available
    if generator is not None:
        # Built from explicit parts so the prompt has no source-code
        # indentation, no stray closing quote, and proper sentence spacing.
        # NOTE(review): long contexts may exceed the generator's input limit
        # and truncate the trailing question — confirm for the model in use.
        prompt = (
            "You are a highly knowledgeable and formal Visa Eligibility Screening Assistant. "
            "Your role is to assess visa eligibility strictly based on the provided policy documents, "
            "country regulations, and user details. Use only verified information from the supplied documents. "
            "Avoid assumptions unless logically necessary, and always mention them clearly.\n\n"
            f"Context:\n{context}\n\n"
            f"Question: {query}\n"
            "Answer:"
        )
        output = generator(prompt, max_new_tokens=200)
        return output[0]["generated_text"].strip(), citations

    # Fallback → concatenated chunks
    answer = f"Here is what I found based on the documents:\n\n{context}"
    return answer, citations


In [None]:
### Example Queries

In [20]:
# Demo query 1: a broad eligibility question.
# build_faiss_index re-reads the JSONL file and loads the embedding model on
# every call; this rebinds the notebook-level `model` and `chunks` names.
chunks_path =r"C:\Users\Anvitha\OneDrive\Documents\chunks_with_embeddings.jsonl"
model, index, chunks = build_faiss_index(chunks_path)

query = "What are the eligibility requirements for a UK Student Visa?"
# NOTE(review): no `generator` argument is passed, so generate_answer uses its
# generator=None default and returns the concatenated-chunks fallback —
# confirm this is intended.
answer, citations = generate_answer(query, model, index, chunks, top_k=5)

print("\nFinal Answer:\n", answer)
print("\nCitations:\n", citations)


Final Answer:
 Here is what I found based on the documents:

Source: Student and Child Student
page 47 of 75 published for home office staff on 22 july 2025 option new entrant criteria information required from sponsor the sponsor must confirm which recognised professional qualification the applicant is working towards – this must be a uk qualification accepted by the regulatory body for the profession. 4. applicant is working towards full registration or chartered status with the relevant professional body for the job they are sponsored for. the sponsor must confirm who the relevant professional body is and the applicant is working towards full registration or chartered status. if necessary, you can check the list of chartered bodies from the privy council. 5. applicant ’ s most recent permission, other than as a visitor, was under tier 4 ( general ) or the student route. the permission must either be current or have expired less than 2 years before the date of application. in that p

In [21]:
# Demo query 2: a yes/no style eligibility question.
# build_faiss_index re-reads the JSONL file and loads the embedding model on
# every call; this rebinds the notebook-level `model` and `chunks` names.
chunks_path = r"C:\Users\Anvitha\OneDrive\Documents\chunks_with_embeddings.jsonl"
model, index, chunks = build_faiss_index(chunks_path)

query = "Is the student eligible for UK Student Visa?"
# NOTE(review): no `generator` argument is passed, so generate_answer uses its
# generator=None default and returns the concatenated-chunks fallback —
# confirm this is intended.
answer, citations = generate_answer(query, model, index, chunks, top_k=5)

print("\nFinal Answer:\n", answer)

print("\nCitations:\n", citations)


Final Answer:
 Here is what I found based on the documents:

Source: Student and Child Student
professional graduate diploma of education the applicant must have completed ( or be applying no more than 3 months before they are expected to complete ) the course. an exception applies to phds and other doctoral qualifications, where the applicant must have completed at least 12 months ’ study in the uk towards the qualification. confirmation ( from the applicant or their sponsor ) the applicant has completed ( or the date they are expected to complete ) their course, or they have completed at least 12 months ’ study in the uk towards a phd or other doctoral qualification. check the applicant ’ s immigration history for other information.

Source: Student and Child Student
page 47 of 75 published for home office staff on 22 july 2025 option new entrant criteria information required from sponsor the sponsor must confirm which recognised professional qualification the applicant is working t

In [23]:
# Demo query 3: an off-topic question, exercising the path where no chunk
# reaches generate_answer's min_score threshold (default 0.3).
# build_faiss_index re-reads the JSONL file and loads the embedding model on
# every call; this rebinds the notebook-level `model` and `chunks` names.
chunks_path = r"C:\Users\Anvitha\OneDrive\Documents\chunks_with_embeddings.jsonl"
model, index, chunks = build_faiss_index(chunks_path)

query = "Where is my keyboard?"
answer, citations = generate_answer(query, model, index, chunks, top_k=5)

print("\nFinal Answer:\n", answer)
print("\nCitations:\n", citations)


Final Answer:
 No relevant information found.

Citations:
 []


In [24]:
# Demo query 4: a longer, applicant-specific question about English ability.
# build_faiss_index re-reads the JSONL file and loads the embedding model on
# every call; this rebinds the notebook-level `model` and `chunks` names.
chunks_path = r"C:\Users\Anvitha\OneDrive\Documents\chunks_with_embeddings.jsonl"
model, index, chunks = build_faiss_index(chunks_path)

query = "I am from India and applying for UK Student Visa. Do I need to improve my English speaking ability to get UK Student Visa?"
# NOTE(review): no `generator` argument is passed, so generate_answer uses its
# generator=None default and returns the concatenated-chunks fallback —
# confirm this is intended.
answer, citations = generate_answer(query, model, index, chunks, top_k=5)

print("\nFinal Answer:\n", answer)
print("\nCitations:\n", citations)


Final Answer:
 Here is what I found based on the documents:

Source: Student and Child Student
page 28 of 75 published for home office staff on 22 july 2025 english language this section explains how to assess the english language requirement for skilled workers. the applicant must score 10 points for english language skills equivalent to level b1 of the common european framework of references for english language in all 4 components ( reading, writing, speaking and listening ). to award these points, you must be satisfied the application meets the requirements in paragraphs sw 7. 1. to sw 7. 4. to assess whether the requirement is met, you should refer to the english language guidance. non - poise see : english language guidance. related content contents mandatory points requirement

Source: Student and Child Student
professional graduate diploma of education the applicant must have completed ( or be applying no more than 3 months before they are expected to complete ) the course. an