In [4]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import json
from transformers import pipeline

In [3]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [5]:
# -----------------------------
# 1. Build FAISS Index in rag_pipeline
# -----------------------------
def build_faiss_index(chunks_path):
    # Load model
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    # Load chunks from file
    chunks = []
    with open(chunks_path, "r", encoding="utf-8") as f:
        for line in f:
            chunks.append(json.loads(line))

    # Create FAISS index
    embedding_dim = len(chunks[0]['embedding'])
    index = faiss.IndexFlatL2(embedding_dim)
    embeddings = np.array([chunk['embedding'] for chunk in chunks]).astype('float32')
    index.add(embeddings)

    return model, index, chunks


In [6]:
# -----------------------------
# 2. Retrieve function
# -----------------------------
def retrieve_chunks(query, model, index, chunks, top_k=5):
    query_embedding = model.encode(query).astype('float32')
    D, I = index.search(np.array([query_embedding]), top_k)
    results = [chunks[i] for i in I[0]]
    return results


In [7]:
# -----------------------------
# 3. Load generator model
# -----------------------------
generator = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    tokenizer="google/flan-t5-base",
    device=-1  # CPU; use 0 for GPU
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu


In [8]:
# -----------------------------
# 4. Answer generation
# -----------------------------
def generate_answer(query, model, index, chunks, top_k=5):

    # Step 1: Retrieve
    retrieved = retrieve_chunks(query, model, index, chunks, top_k=top_k)

    # Step 2: Prepare context
    context = "\n\n".join([f"Source: {r.get('source', 'Unknown')}\n{r['text']}" for r in retrieved])

    # Step 3: Build prompt
    prompt = f"""
    You are an expert eligibility officer.
    Using only the context below, answer the question truthfully.
    If the answer is not in the context, say "I cannot find relevant information."

    Context:
    {context}

    Question: {query}
    Answer:
    """

    # Step 4: Generate
    output = generator(prompt, max_new_tokens=200)

    # Step 5: Collect citations
    #citations = list({r.get('source', 'Unknown') for r in retrieved})
    citations = [f"{r.get('source', 'Unknown')} — {r['text']}" for r in retrieved]

    return output[0]["generated_text"], citations



In [11]:
from google.colab import files
import os
uploaded = files.upload()  # Select your chunks_with_embeddings.jsonl file
chunks_file = list(uploaded.keys())[0]
chunks_path = os.path.join("/content", chunks_file)

# Build FAISS index
model, index, chunks = build_faiss_index(chunks_path)

# Query
query = "Is the student eligible for UK Student Visa?"
answer, citations = generate_answer(query, model, index, chunks, top_k=5)

# Output
print("\nFinal Answer:\n", answer)
print("\nCitations:\n", citations)

Saving chunks_with_embeddings.jsonl to chunks_with_embeddings.jsonl


Token indices sequence length is longer than the specified maximum sequence length for this model (1996 > 512). Running this sequence through the model will result in indexing errors



Final Answer:
 Yes

Citations:
 ['Student and Child Student — Student visa\n1. Overview\nYou can apply for a Student visa to study in the UK if you’re 16 or over and\nyou:\nhave been offered a place on a course by a licensed student sponsor\n(/student-visa/course)\nhave enough money to support yourself and pay for your course (/student-\nvisa/money) - the amount will vary depending on your circumstances\ncan speak, read, write and understand English (/student-visa/knowledge-of-\nenglish)\nhave consent from your parents if you’re 16 or 17 - you’ll need evidence\nof this when you apply\nIf you’re 16 or 17 and you want to study at an independent school in the\nUK, you may be eligible for a Child Student visa (/child-study-visa)\ninstead.\nThis visa has replaced the Tier 4 (General) student visa.\nWhen to apply\nWhen you can apply depends on whether you’re applying from inside or\noutside the UK.\nApplying from outside the UK\nThe earliest you can apply for a visa is 6 months before you s

In [12]:
from google.colab import files
import os

uploaded = files.upload()

chunks_file = list(uploaded.keys())[0]
chunks_path = os.path.join("/content", chunks_file)

model, index, chunks = build_faiss_index(chunks_path)

query = "What are the eligibility requirements for a UK Student Visa?"
answer, citations = generate_answer(query, model, index, chunks, top_k=5)

print("\nFinal Answer:\n", answer)
print("\nCitations:\n", citations)


Saving chunks_with_embeddings.jsonl to chunks_with_embeddings (1).jsonl

Final Answer:
 16 or over and you: have been offered a place on a course by a licensed student sponsor (/student-visa/course) have enough money to support yourself and pay for your course (/student-visa/money) - the amount will vary depending on your circumstances can speak, read, write and understand English (/student-visa/knowledge-of-english) have consent from your parents if you’re 16 or 17 and you want to study at an independent school in the UK, you may be eligible for a Child Student visa (/child-study-visa) instead. This visa has replaced the Tier 4 (General) student visa.

Citations:
 ['Student and Child Student — Student visa\n1. Overview\nYou can apply for a Student visa to study in the UK if you’re 16 or over and\nyou:\nhave been offered a place on a course by a licensed student sponsor\n(/student-visa/course)\nhave enough money to support yourself and pay for your course (/student-\nvisa/money) - the 

In [13]:
from google.colab import files
import os

uploaded = files.upload()  # Select 'chunks_with_embeddings_new.jsonl'

chunks_file = list(uploaded.keys())[0]
chunks_path = os.path.join("/content", chunks_file)

model, index, chunks = build_faiss_index(chunks_path)

query = "I am from Canada and applying for a UK Student Visa. Do I need to prove my English language ability?"
answer, citations = generate_answer(query, model, index, chunks, top_k=5)

print("\nFinal Answer:\n", answer)
print("\nCitations:\n", citations)


Saving chunks_with_embeddings.jsonl to chunks_with_embeddings (2).jsonl

Final Answer:
 Yes

Citations:
 ["Student and Child Student — This must still be equivalent to a CEFR level B2.\nWho does not need to prove their knowledge of\nEnglish\nYou do not need to prove your knowledge of English if you’re from one of\nthe following countries or territories, or you’ve completed a qualification\nequivalent to a UK degree in one of them:\nAntigua and Barbuda\nAustralia\nthe Bahamas\nBarbados\nBelize\nthe British overseas territories\nDominica\nGrenada\nGuyana\nIreland\nJamaica\nMalta\nNew Zealand\nSt Kitts and Nevis\nSt Lucia\nSt Vincent and the Grenadines\nTrinidad and Tobago\nUK\nUSA\nYou also do not need to prove your knowledge of English if one of the\nfollowing applies:\nyou’re a national of Canada\nyou’re applying to come to the UK for a study abroad programme as part\nof a university degree course in the USA\nyou proved your level of English in a previous visa application\n5. Documents