# Cleaning, Preprocessing, and Chunking

In [1]:
import json
import re
import os
from pathlib import Path
from tqdm import tqdm

# Constants
INPUT_FILE = "data.json"  # JSON file read by the chunking loop in this cell
OUTPUT_FILE = "Data/chunks.json"  # where the generated chunk records are written
CHUNK_SIZE = 300  # Number of words per chunk

# Clean and chunk text
def clean_text(text):
    """Collapse every run of whitespace into a single space and trim the ends.

    Args:
        text: Raw document text, or ``None`` (for example a JSON ``null``
            value returned by ``row.get("content", "")``).

    Returns:
        The normalized string. ``None`` and empty input both yield ``""`` so
        the caller's ``if not content`` check can skip the record instead of
        crashing inside ``re.sub``.
    """
    if text is None:
        return ""
    # ``\s`` already matches newlines, tabs and carriage returns, so a single
    # substitution is enough; a separate "\n" pass would never find a match.
    return re.sub(r"\s+", " ", text).strip()

def chunk_text(text, size=CHUNK_SIZE):
    """Split ``text`` on whitespace and regroup the words into chunks.

    Each returned string holds at most ``size`` words joined by single
    spaces; the final chunk may be shorter.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), size):
        window = tokens[start:start + size]
        pieces.append(" ".join(window))
    return pieces

# Load JSON input
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)

# Turn every document into a flat list of chunk records. A document whose
# cleaned content is empty contributes no records.
chunks = []
for row in tqdm(data, desc="Chunking documents"):
    content = clean_text(row.get("content", ""))
    if not content:
        continue

    title, url = row.get("title", ""), row.get("url", "")
    # chunk_id restarts at 0 for each document.
    chunks.extend(
        {"title": title, "url": url, "chunk_id": idx, "content": chunk}
        for idx, chunk in enumerate(chunk_text(content))
    )

# Save output
out_dir = Path(OUTPUT_FILE).parent
os.makedirs(out_dir, exist_ok=True)
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(chunks, f, ensure_ascii=False, indent=2)

print(f"✅ Chunked data saved to {OUTPUT_FILE} with {len(chunks)} chunks.")


Chunking documents: 100%|██████████| 64/64 [00:00<00:00, 1337.27it/s]

✅ Chunked data saved to Data/chunks.json with 161 chunks.





# Embedding chunks

In [2]:
import json
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize

# === Step 1: Load the chunk records from the JSON file ===
with open('Data/chunks.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Keep only the text of each record; records without a 'content' key are skipped.
chunks = []
for item in data:
    if 'content' in item:
        chunks.append(item['content'])

# === Step 2: Load embedding model ===
model = SentenceTransformer('all-MiniLM-L6-v2')

# === Step 3: Embed the chunks and scale each vector to unit L2 norm ===
# With unit-length vectors, the inner product equals cosine similarity.
embeddings = normalize(
    model.encode(chunks, show_progress_bar=True),
    norm='l2',
)

# === Step 4: Save embeddings and chunks together ===
payload = {'chunks': chunks, 'embeddings': embeddings}
with open('Data/embedded_chunks.pkl', 'wb') as f:
    pickle.dump(payload, f)

print("✅ Normalized embeddings saved for cosine similarity.")


  from .autonotebook import tqdm as notebook_tqdm





Batches: 100%|██████████| 6/6 [00:05<00:00,  1.04it/s]

✅ Normalized embeddings saved for cosine similarity.





# Vector Storage with FAISS

In [3]:
import pickle
import faiss
import numpy as np

# === Step 1: Load normalized embeddings and chunks ===
with open('Data/embedded_chunks.pkl', 'rb') as f:
    data = pickle.load(f)

chunks = data['chunks']
# Cast to float32 before handing the vectors to FAISS.
embeddings = np.array(data['embeddings'], dtype='float32')

# === Step 2: Build a flat inner-product index ===
# The stored vectors were L2-normalized, so inner product == cosine similarity.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(embeddings)

# === Step 3: Persist the index and the chunk texts ===
# Row i of the index corresponds to chunks[i], so the two files must stay in sync.
faiss.write_index(index, 'Data/index.faiss')

with open('Data/chunks_only.pkl', 'wb') as f:
    pickle.dump(chunks, f)

print("✅ FAISS cosine similarity index and chunks saved.")


✅ FAISS cosine similarity index and chunks saved.


# RAG

In [6]:
import pickle
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
import google.generativeai as genai

import os  # used below to read the Gemini API key from the environment

# === Step 1: Load FAISS index and chunks ===
index = faiss.read_index('Data/index.faiss')

# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook itself produced.
with open('Data/chunks_only.pkl', 'rb') as f:
    chunks = pickle.load(f)

# === Step 2: Load embedding model ===
# Queries must be embedded with the same model that embedded the indexed chunks.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# === Step 3: Set up Gemini API ===
# Read the key from the environment instead of embedding it (or a placeholder
# string) in the notebook, so the secret never ends up in version control.
gemini_api_key = os.environ.get("GEMINI_API_KEY")
if not gemini_api_key:
    raise RuntimeError(
        "GEMINI_API_KEY is not set. Export it in the environment that starts "
        "the Jupyter kernel before running this cell."
    )
genai.configure(api_key=gemini_api_key)
model = genai.GenerativeModel('gemini-2.0-flash')

# === Step 4: Define search function ===
def get_relevant_chunks(query, top_k=3):
    """Return the indexed chunks most similar to ``query``.

    The query is embedded with ``embedder``, L2-normalized so that the
    inner-product index behaves as cosine similarity, and searched against
    ``index``. A 300-character preview of every hit is printed.

    Args:
        query: The user's question as a string.
        top_k: Maximum number of chunks to retrieve.

    Returns:
        A list of chunk strings, best match first. It holds fewer than
        ``top_k`` items when the index contains fewer vectors than requested.
    """
    query_vec = embedder.encode([query])
    query_vec = normalize(query_vec, norm='l2').astype('float32')  # normalize query too

    _, neighbor_ids = index.search(query_vec, top_k)
    # FAISS fills missing result slots with -1 when it cannot find top_k
    # neighbours. Indexing chunks[-1] would silently return the last chunk
    # as a bogus match, so those slots are dropped.
    retrieved_chunks = [chunks[i] for i in neighbor_ids[0] if i >= 0]

    print("\n🔍 Top Matching Chunks:")
    for idx, chunk in enumerate(retrieved_chunks):
        print(f"\n--- Chunk {idx+1} ---\n{chunk[:300]}...\n")

    return retrieved_chunks

# === Step 5: Generate response with Gemini ===
def generate_answer_with_gemini(query, context_chunks):
    """Ask the Gemini model to answer ``query`` using the retrieved context.

    Args:
        query: The user's question.
        context_chunks: Chunk strings to present to the model as context.

    Returns:
        The model's reply text with surrounding whitespace removed.
    """
    # Chunks are separated by a blank line inside the prompt.
    context_text = "\n\n".join(context_chunks)

    prompt = (
        "You are a helpful assistant. Use the following context to answer the user's question:\n"
        "\n"
        "Context:\n"
        f"{context_text}\n"
        "\n"
        f"Question: {query}\n"
        "\n"
        "Answer:"
    )

    reply = model.generate_content(prompt)
    answer_text = reply.text
    return answer_text.strip()

In [7]:
# === Step 6: Try with a user question ===
user_query = "What is depression?"

# Retrieve the best-matching chunks (the helper also prints a preview of each),
# then pass them to Gemini as context for the answer.
retrieved = get_relevant_chunks(user_query)
answer = generate_answer_with_gemini(user_query, retrieved)

print("\n💬 Final Answer:\n", answer)


🔍 Top Matching Chunks:

--- Chunk 1 ---
Mild depression can make you feel low and as though everything is harder to do. Severe depression can lead to feeling hopeless and, in some cases, suicidal. If you re depressed, you re not alone. In England, 3 in every 100 people will experience depression in any given week. Even more – 8 in every 1...


--- Chunk 2 ---
Having a baby is a huge life event. It’s normal to experience a range of powerful emotions while you’re pregnant and after giving birth: excitement, joy, and anxiety. You may also feel depressed. It’s not a sign of weakness or anything to feel guilty about. With support and treatment, you can get be...


--- Chunk 3 ---
your life - but try to stay open to the possibility of change. There are many different types of help available now. A common treatment for depression involves a combination of self-help, talking therapies and medication. The right treatment for you will depend on the type of depression you have and...


💬 Final A

In [8]:
# === Step 7: Try a second, broader user question ===
user_query = "Talk to me about mental health."

# Same retrieve-then-generate flow as the previous cell, with a different query.
retrieved = get_relevant_chunks(user_query)
answer = generate_answer_with_gemini(user_query, retrieved)

print("\n💬 Final Answer:\n", answer)


🔍 Top Matching Chunks:

--- Chunk 1 ---
you can look after yourself. Home of Mental Health Awareness Week...


--- Chunk 2 ---
your mental health affects your physical health, and what you can do to help yourself. Home of Mental Health Awareness Week...


--- Chunk 3 ---
keeping a diary of your voices. You could note what they say, how they make you feel and how you manage them. This may help you to notice patterns of what makes you feel bad, what makes you feel good, or what triggers your voices. Some people find that standing up to the voices, choosing when to pay...


💬 Final Answer:
 Mental health affects your physical health, and there are things you can do to help yourself. For example, if you experience voices, keeping a diary of them can help you identify patterns and triggers. Some people find it helpful to stand up to the voices, choosing when to pay attention and focusing on more positive ones. Talking therapy can be beneficial, as can keeping busy with hobbies, creative ac