In [1]:
from pathlib import Path
import sys

# Project root = one level above notebooks
PROJECT_ROOT = Path.cwd().parent

# allows imports from src
sys.path.append(str(PROJECT_ROOT))

In [2]:
# Calling modules from src folder for testing
from src.ingestion.chunking import chunk_text
from src.embeddings.embedder import Embedder
from src.retrieval.faiss_store import FaissStore
from src.llm.local_llm import LocalLLM


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Calling modules from src folder for testing

sample_text = """
Customers can request a refund within 30 days of purchase.
The item must be unused and returned in its original packaging.
Refunds are processed within 5 business days.
"""

In [4]:
# Calling modules from src folder for testing
chunks = chunk_text(sample_text)
print("Number of chunks:", len(chunks))

Number of chunks: 1


In [5]:
# Create embeddings
embedder = Embedder()

chunk_embeddings = embedder.embed(chunks)
print(chunk_embeddings.shape)

Loading weights: 100%|█| 103/103 [00:00<00:00, 591.80it
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
Batches: 100%|███████████| 1/1 [00:00<00:00,  3.57it/s]

(1, 384)





In [6]:
# Store & retrieve with FAISS

store = FaissStore(embedding_dim=chunk_embeddings.shape[1])
store.add(chunk_embeddings, chunks)

In [7]:
query = "What is the refund policy"

query_embedding = embedder.embed([query])
retrieved_chunks = store.search(query_embedding, k=2)

retrieved_chunks

Batches: 100%|███████████| 1/1 [00:00<00:00,  7.43it/s]


['\nCustomers can request a refund within 30 days of purchase.\nThe item must be unused and returned in its original packaging.\nRefunds are processed within 5 business days.\n',
 '\nCustomers can request a refund within 30 days of purchase.\nThe item must be unused and returned in its original packaging.\nRefunds are processed within 5 business days.\n']

Generate answer with Local LLM

In [8]:
context = "\n\n".join(retrieved_chunks)

prompt = f"""
You are a helpful assistant.
Answer the question using ONLY the context below.
If the answer is not in the context, say "I don't know".

Context:
{context}

Question:
{query}

Answer:
"""

Load LLM (path-safe)

In [9]:
MODEL_PATH = PROJECT_ROOT / "models" / "phi-2.gguf"

llm = LocalLLM(model_path=MODEL_PATH)

ggml_metal_init: skipping kernel_soft_max_f16                      (not supported)
ggml_metal_init: skipping kernel_soft_max_f16_4                    (not supported)
ggml_metal_init: skipping kernel_soft_max_f32                      (not supported)
ggml_metal_init: skipping kernel_soft_max_f32_4                    (not supported)
ggml_metal_init: skipping kernel_get_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_set_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_rms_norm                          (not supported)
ggml_metal_init: skipping kernel_rms_norm_mul                      (not supported)
ggml_metal_init: skipping kernel_rms_norm_mul_add                  (not supported)
ggml_metal_init: skipping kernel_l2_norm                           (not supported)
ggml_metal_init: skipping kernel_group_norm                        (not supported)
ggml_metal_init: skipping kernel_mul_mv_f32_f32                    (not supported)
ggml

In [10]:
answer = llm.generate(prompt)
print(answer)

Refunds are processed within 5 business days.



In [11]:
from pathlib import Path
from src.pipeline import RAGPipeline

PROJECT_ROOT = Path.cwd().parent
MODEL_PATH = PROJECT_ROOT / "models" / "phi-2.gguf"

pipeline = RAGPipeline(model_path=MODEL_PATH)

docs = [
    """
    Customers can request a refund within 30 days of purchase.
    The item must be unused and returned in its original packaging.
    """
]

pipeline.ingest(docs)

answer = pipeline.query("What is the refund policy?")
print(answer)


Loading weights: 100%|█| 103/103 [00:00<00:00, 466.21it
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
ggml_metal_init: skipping kernel_soft_max_f16                      (not supported)
ggml_metal_init: skipping kernel_soft_max_f16_4                    (not supported)
ggml_metal_init: skipping kernel_soft_max_f32                      (not supported)
ggml_metal_init: skipping kernel_soft_max_f32_4                    (not supported)
ggml_metal_init: skipping kernel_get_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_set_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_rms_norm                          (not supported)
ggml_metal_init: skipping kernel_rms_norm_mul 

"Customers can request a refund within 30 days of purchase. The item must be unused and returned in its original packaging."



In [12]:
pipeline.ingest(docs)
answer = pipeline.query("What is the refund policy?")
print(answer)

Batches: 100%|███████████| 1/1 [00:00<00:00,  1.08it/s]
Batches: 100%|███████████| 1/1 [00:00<00:00, 13.02it/s]


I don't know.





In [14]:
# Type here

In [15]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")



Loading weights: 100%|█| 103/103 [00:00<00:00, 494.68it
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [16]:
sentences = [
    "I love this product",
    "This item is amazing",
    "I hate this product",
    "The weather is sunny today"
]

In [17]:
# Generate embeddings
embeddings = model.encode(sentences)

embeddings.shape

(4, 384)

In [18]:
# Simulate chunking using plain text
text = """
Refund Policy:
Customers can request a refund within 30 days of purchase.
The item must be unused and in original packaging.

Shipping Policy:
Orders are shipped within 2–3 business days.
Delivery time depends on location.

Privacy Policy:
We do not share customer data with third parties.
Personal information is stored securely.
"""

In [19]:
def chunk_text(text, chunk_size=100, overlap=20):
    words = text.split()
    chunks = []

    start = 0
    while start < len(words):
        end = start + chunk_size
        chunk = words[start:end]
        chunks.append(" ".join(chunk))
        start += chunk_size - overlap

    return chunks

In [20]:
chunks = chunk_text(text, chunk_size=40, overlap=10)

for i, chunk in enumerate(chunks):
    print(f"\n--- Chunk {i+1} ---")
    print(chunk)


--- Chunk 1 ---
Refund Policy: Customers can request a refund within 30 days of purchase. The item must be unused and in original packaging. Shipping Policy: Orders are shipped within 2–3 business days. Delivery time depends on location. Privacy Policy: We do not

--- Chunk 2 ---
Delivery time depends on location. Privacy Policy: We do not share customer data with third parties. Personal information is stored securely.


In [21]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

chunk_embeddings = model.encode(chunks)
chunk_embeddings.shape

Loading weights: 100%|█| 103/103 [00:00<00:00, 473.53it
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


(2, 384)

#### Implement FAISS
FAISS is very fast math
    optimized similarity search

In [22]:
# Create FAISS index
import faiss
import numpy as np

dimension = chunk_embeddings.shape[1]

index = faiss.IndexFlatL2(dimension)
index.add(chunk_embeddings)

In [23]:
query = "What is the refund policy?"

query_embedding = model.encode([query])

In [24]:
# Search
# Number of chunks to retrieve
k = 2

distances, indices = index.search(query_embedding, k)
indices

array([[0, 1]])

In [25]:
# Retrieve text

for idx in indices[0]:
    print("\n--- Retrieved Chunk ---")
    print(chunks[idx])


--- Retrieved Chunk ---
Refund Policy: Customers can request a refund within 30 days of purchase. The item must be unused and in original packaging. Shipping Policy: Orders are shipped within 2–3 business days. Delivery time depends on location. Privacy Policy: We do not

--- Retrieved Chunk ---
Delivery time depends on location. Privacy Policy: We do not share customer data with third parties. Personal information is stored securely.


### Hugging Face Inference call for FLAN-T5

In [26]:
# Create prompt from retrieved chunks

context = "\n\n".join([chunks[idx] for idx in indices[0]])

prompt = f"""
You are a helpful assistant.
Answer the question using ONLY the context below.
If the answer is not in the context, say "I don't know".

Context:
{context}


Question:
{query}
"""

### Generate answer

In [27]:
import requests
import os

HF_TOKEN = os.getenv("HF_TOKEN")
MODEL_ID = "google/flan-t5-large"

API_URL = f"https://api-inference.huggingface.co/models/{MODEL_ID}"

headers = {
    "Authorization": f"Bearer {HF_TOKEN}",
    "Content-Type": "application/json"
}

payload = {
    "inputs": prompt,
    "parameters": {
        "max_new_tokens": 200,
        "temperature": 0
    }
}

response = requests.post(API_URL, headers=headers, json=payload)

print(response.status_code)
print(response.json())


410
{'error': 'https://api-inference.huggingface.co is no longer supported. Please use https://router.huggingface.co instead.'}


In [28]:
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent

MODEL_PATH = PROJECT_ROOT / "models" / "phi-2.gguf"

# print(MODEL_PATH)
print(MODEL_PATH.exists())

True


In [29]:
from llama_cpp import Llama

llm = Llama(
    model_path=str(MODEL_PATH),
    n_ctx=2048,
    n_threads=4,
    verbose=False
)

ggml_metal_init: skipping kernel_soft_max_f16                      (not supported)
ggml_metal_init: skipping kernel_soft_max_f16_4                    (not supported)
ggml_metal_init: skipping kernel_soft_max_f32                      (not supported)
ggml_metal_init: skipping kernel_soft_max_f32_4                    (not supported)
ggml_metal_init: skipping kernel_get_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_set_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_rms_norm                          (not supported)
ggml_metal_init: skipping kernel_rms_norm_mul                      (not supported)
ggml_metal_init: skipping kernel_rms_norm_mul_add                  (not supported)
ggml_metal_init: skipping kernel_l2_norm                           (not supported)
ggml_metal_init: skipping kernel_group_norm                        (not supported)
ggml_metal_init: skipping kernel_mul_mv_f32_f32                    (not supported)
ggml

In [30]:
# llm("What is retrieval augmented generation?")


In [31]:
context = "\n\n".join([chunks[idx] for idx in indices[0]])

prompt = f"""
You are a helpful assistant.
Answer the question using ONLY the context below.
If the answer is not in the context, say "I don't know".

Context:
{context}

Question:
{query}

Answer:
"""


In [32]:
output = llm(prompt, max_tokens=256)
answer = output["choices"][0]["text"]
print(answer)

Within 30 days of purchase, customers can request a refund.



In [33]:
# print(llm("Give a short definition of RAG."))


In [34]:
# Just to test, need to be removed

from src.ingestion.chunking import chunk_text

chunks = chunk_text("This is a test document." * 50)
len(chunks)

3