In [None]:
# Install runtime dependencies. `%pip` (rather than `!pip`) installs into the environment
# the running kernel actually uses.
# LangChain is capped below 1.0 because the modules written later in this notebook import
# `langchain.text_splitter` and `langchain.schema`, which were moved out of the main
# package in the 1.0 release.
%pip install -q "langchain<1.0" "langchain-community<0.4" faiss-cpu sentence-transformers pymongo gradio transformers accelerate sentencepiece

In [None]:
# List the datasets attached to this Kaggle session; the copy step further down
# reads from /kaggle/input/yoga-kb, so it should appear here.
!ls /kaggle/input

In [None]:
# Scaffold the project tree under /kaggle/working. `-p` creates missing parents and
# does not fail when a directory already exists, so this cell can be re-run safely.
!mkdir -p /kaggle/working/yoga-rag-microapp/backend/rag
!mkdir -p /kaggle/working/yoga-rag-microapp/backend/llm
!mkdir -p /kaggle/working/yoga-rag-microapp/frontend
!mkdir -p /kaggle/working/yoga-rag-microapp/data
!mkdir -p /kaggle/working/yoga-rag-microapp/assets

# Output directory for the saved FAISS index (see VECTOR_DB_PATH in backend/config.py).
!mkdir -p /kaggle/working/yoga-rag-microapp/faiss_index   # generated (will be ignored in git)

In [None]:
# Copy the knowledge-base folder from the attached `yoga-kb` dataset into the project's
# data directory, then list the first few entries to confirm the copy worked.
!cp -r /kaggle/input/yoga-kb/yoga_docs /kaggle/working/yoga-rag-microapp/data/
!ls /kaggle/working/yoga-rag-microapp/data/yoga_docs | head

In [None]:
%%writefile /kaggle/working/yoga-rag-microapp/backend/config.py
from pathlib import Path
import os

def _find_project_root():
    """Return the directory that contains the ``backend`` and ``frontend`` folders.

    Resolution order:
      1. Two levels above this file (the normal case when imported as a module).
      2. The current working directory, if it holds both folders.
      3. ``<cwd>/yoga-rag-microapp``, if that directory exists.
      4. The first of cwd and its ancestors that holds both folders.

    Raises:
        RuntimeError: if none of the above locates the project.
    """
    try:
        return Path(__file__).resolve().parents[1]
    except NameError:
        # __file__ is undefined when this code is executed directly in a notebook cell.
        pass

    here = Path(os.getcwd()).resolve()

    def _looks_like_root(folder):
        return (folder / "backend").exists() and (folder / "frontend").exists()

    if _looks_like_root(here):
        return here

    nested = here / "yoga-rag-microapp"
    if nested.exists():
        return nested

    for ancestor in (here, *here.parents):
        if _looks_like_root(ancestor):
            return ancestor

    raise RuntimeError("Could not locate project root. Please cd into project folder.")


PROJECT_ROOT = _find_project_root()

# Source documents for ingestion and the on-disk location of the FAISS index.
DATA_PATH = PROJECT_ROOT / "data" / "yoga_docs"
VECTOR_DB_PATH = PROJECT_ROOT / "faiss_index"

In [None]:
%%writefile /kaggle/working/yoga-rag-microapp/.gitignore
# Python bytecode caches and notebook checkpoints
__pycache__/
*.pyc
.ipynb_checkpoints/

# Local secrets file, the generated FAISS index, and zip archives
.env
faiss_index/
*.zip

# Kaggle generated
/kaggle/

In [None]:
%%writefile /kaggle/working/yoga-rag-microapp/.env.example
# Template for local configuration. MONGO_URI, DB_NAME and COLLECTION_NAME are read by
# backend/db.py through os.getenv (MONGO_URI is looked up in Kaggle Secrets first).
# NOTE(review): no code visible in this project loads a `.env` file automatically;
# export these variables in the shell or add a loader - confirm the intended workflow.
MONGO_URI=mongodb+srv://<username>:<password>@<cluster>.mongodb.net/?retryWrites=true&w=majority
DB_NAME=nextyou_rag
COLLECTION_NAME=query_logs

# MODEL_NAME is read by backend/llm/local_llm.py.
MODEL_NAME=google/flan-t5-xl
# NOTE(review): EMBEDDING_MODEL is not read anywhere in the visible code; ingest.py and
# retriever.py hard-code sentence-transformers/all-MiniLM-L6-v2. Confirm which is intended.
EMBEDDING_MODEL=sentence-transformers/all-mpnet-base-v2

In [None]:
%%writefile /kaggle/working/yoga-rag-microapp/requirements.txt
# LangChain is capped below 1.0: backend/rag/ingest.py imports `langchain.text_splitter`
# and `langchain.schema`, which were moved out of the main package in the 1.0 release.
langchain<1.0
langchain-community<0.4
faiss-cpu
sentence-transformers
pymongo
gradio
transformers
accelerate
sentencepiece
torch

In [None]:
# Install the project's requirements file. `%pip` targets the running kernel's environment,
# which `!pip` does not guarantee.
%pip install -q -r /kaggle/working/yoga-rag-microapp/requirements.txt

In [None]:
%%writefile /kaggle/working/yoga-rag-microapp/backend/safety.py
import re

_PREGNANCY_KEYWORDS = ("pregnant", "pregnancy", "trimester", "prenatal")

_MEDICAL_KEYWORDS = (
    "hernia", "glaucoma", "high blood pressure",
    "bp", "hypertension", "surgery", "injury",
    "chronic pain", "operation",
)


def _compile_keywords(keywords):
    """Build one regex matching any keyword as a whole word (optional plural "s")."""
    alternatives = "|".join(re.escape(word) for word in keywords)
    return re.compile(rf"\b(?:{alternatives})s?\b")


# Compiled once at import time; check_safety runs on every user query.
_PREGNANCY_PATTERN = _compile_keywords(_PREGNANCY_KEYWORDS)
_MEDICAL_PATTERN = _compile_keywords(_MEDICAL_KEYWORDS)


def check_safety(query: str):
    """Flag queries that mention pregnancy or a listed medical condition.

    Keywords are matched case-insensitively as whole words, so short or common
    fragments no longer trigger on unrelated words ("bp" inside "subpar",
    "operation" inside "cooperation"). Runs of whitespace are collapsed first so
    multi-word keywords such as "high blood pressure" match regardless of spacing.

    Args:
        query: The user's question. ``None`` is treated as an empty string.

    Returns:
        A dict ``{"isUnsafe": bool, "reason": str | None}`` where ``reason`` is
        ``"pregnancy"``, ``"medical_condition"`` or ``None``. Pregnancy is checked
        first and wins when both categories are present.
    """
    normalized = " ".join((query or "").lower().split())

    if _PREGNANCY_PATTERN.search(normalized):
        return {"isUnsafe": True, "reason": "pregnancy"}

    if _MEDICAL_PATTERN.search(normalized):
        return {"isUnsafe": True, "reason": "medical_condition"}

    return {"isUnsafe": False, "reason": None}

In [None]:
%%writefile /kaggle/working/yoga-rag-microapp/backend/db.py
import os
from datetime import datetime
from pymongo import MongoClient

def get_mongo_uri():
    """Look up the MongoDB connection string.

    Kaggle Secrets is tried first; any failure there (including the package not
    being installed) is reported and the ``MONGO_URI`` environment variable is
    used instead.

    Returns:
        The whitespace-stripped URI, or ``None`` when neither source provides a
        non-blank value.
    """
    # Source 1: Kaggle Secrets (only importable inside a Kaggle session).
    try:
        from kaggle_secrets import UserSecretsClient
        secret_value = UserSecretsClient().get_secret("MONGO_URI")
        cleaned_secret = secret_value.strip() if secret_value else ""
        if cleaned_secret:
            print(" Loaded MONGO_URI from Kaggle Secrets")
            return cleaned_secret
    except Exception as err:
        print("Kaggle secret missing. Will try env var.")
        print("Reason:", str(err))

    # Source 2: process environment.
    env_value = os.getenv("MONGO_URI", "").strip()
    if env_value:
        print("Loaded MONGO_URI from environment")
        return env_value

    print("No MONGO_URI found - DB logging disabled.")
    return None


MONGO_URI = get_mongo_uri()

# Query logging is optional, so a URI that pymongo rejects must not stop this module
# (and therefore the whole app) from importing. Only the exception type is printed
# because the message could echo parts of the connection string.
client = None
if MONGO_URI:
    try:
        client = MongoClient(MONGO_URI)
    except Exception as e:
        print("Could not create MongoDB client - DB logging disabled.")
        print("Reason:", type(e).__name__)

DB_NAME = os.getenv("DB_NAME", "nextyou_rag")
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "query_logs")

# `collection` stays None when logging is disabled; log_query checks for that.
collection = None
if client is not None:  # explicit None test, matching the check in log_query
    db = client[DB_NAME]
    collection = db[COLLECTION_NAME]


def log_query(query, retrieved_chunks, answer, is_unsafe, safety_reason=None):
    """Write one query/answer record to MongoDB, if logging is configured.

    Logging is best-effort: when no collection is configured the call is a no-op,
    and a database error is reported on stdout instead of being raised, so a
    logging problem cannot discard an answer that was already generated.

    Args:
        query: The user's question.
        retrieved_chunks: The chunk dicts returned by the retriever.
        answer: The generated answer text.
        is_unsafe: Whether the safety check flagged the query.
        safety_reason: The reason reported by the safety check, or ``None``.
    """
    if collection is None:
        return  # no DB logging

    # Imported here so this function does not depend on edits to the module's import block.
    from datetime import timezone
    from pymongo.errors import PyMongoError

    record = {
        "query": query,
        "retrieved_chunks": retrieved_chunks,
        "answer": answer,
        "isUnsafe": is_unsafe,
        "safety_reason": safety_reason,
        # datetime.utcnow() is deprecated and returns a naive value; use an aware UTC timestamp.
        "timestamp": datetime.now(timezone.utc),
    }

    try:
        collection.insert_one(record)
    except PyMongoError as e:
        print("Query log could not be written:", type(e).__name__)

In [None]:
%%writefile /kaggle/working/yoga-rag-microapp/backend/rag/ingest.py
from backend.config import DATA_PATH, VECTOR_DB_PATH

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS


def load_documents():
    """Read every ``*.txt`` file directly under DATA_PATH into a Document.

    Files are processed in sorted path order and decoded as UTF-8. Each Document
    carries the file's name in ``metadata["source"]``.
    """
    return [
        Document(
            page_content=txt_file.read_text(encoding="utf-8"),
            metadata={"source": txt_file.name},
        )
        for txt_file in sorted(DATA_PATH.glob("*.txt"))
    ]


def build_faiss_index():
    """Chunk the knowledge-base documents, embed them, and save a FAISS index.

    The index is written to VECTOR_DB_PATH. The embedding model used here must be
    the same one the retriever uses to embed queries.

    Raises:
        RuntimeError: if DATA_PATH yields no documents or no non-empty chunks.
            Without this check an empty corpus reaches ``FAISS.from_documents``,
            which fails with an error that does not mention the missing data.
    """
    docs = load_documents()
    if not docs:
        raise RuntimeError(f"No .txt documents found in {DATA_PATH}; nothing to index.")

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,
        chunk_overlap=150
    )

    chunks = splitter.split_documents(docs)
    if not chunks:
        raise RuntimeError(f"Documents in {DATA_PATH} produced no text chunks; nothing to index.")

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    db = FAISS.from_documents(chunks, embeddings)
    db.save_local(str(VECTOR_DB_PATH))

    print(f" FAISS index built with {len(chunks)} chunks at: {VECTOR_DB_PATH}")


if __name__ == "__main__":
    build_faiss_index()

In [None]:
%%writefile /kaggle/working/yoga-rag-microapp/backend/rag/retriever.py
from backend.config import VECTOR_DB_PATH

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS


# Process-wide cache for the loaded vector store. Loading the embedding model and the
# index from disk is expensive, so it is done once instead of on every query.
_vectorstore = None


def _get_vectorstore():
    """Load the embedding model and FAISS index on first use and reuse them afterwards.

    An index rebuilt while this process is running is not picked up until restart.
    """
    global _vectorstore
    if _vectorstore is None:
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        # NOTE(security): load_local unpickles data stored next to the index, which is
        # why the explicit opt-in flag is required. Only load an index built by this
        # project's own ingest step, never one obtained from an untrusted source.
        _vectorstore = FAISS.load_local(
            str(VECTOR_DB_PATH),
            embeddings,
            allow_dangerous_deserialization=True
        )
    return _vectorstore


def retrieve_chunks(query, k=8):
    """Return the ``k`` indexed chunks most similar to ``query``.

    Args:
        query: Text to search for.
        k: Maximum number of chunks to return.

    Returns:
        A list of dicts with keys ``chunk_id`` (position in this result list, starting
        at 0), ``source`` (the chunk's ``source`` metadata, or ``"unknown"``) and
        ``content`` (the chunk text).
    """
    results = _get_vectorstore().similarity_search(query, k=k)

    return [
        {
            "chunk_id": rank,
            "source": doc.metadata.get("source", "unknown"),
            "content": doc.page_content
        }
        for rank, doc in enumerate(results)
    ]

In [None]:
%%writefile /kaggle/working/yoga-rag-microapp/backend/llm/local_llm.py
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import os

# Hugging Face model id; override with the MODEL_NAME environment variable.
MODEL_NAME = os.getenv("MODEL_NAME", "google/flan-t5-xl")

# Use half precision only when a CUDA GPU is available. On a CPU-only host float16
# is slow and not supported by every operator, so fall back to float32 there.
_MODEL_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32

# Both objects are loaded once at import time and shared by generate_answer.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=_MODEL_DTYPE,
    device_map="auto"
)

# Token limit applied to the full prompt, and slack left for the small change in token
# count that can occur when trimmed context is decoded and tokenized again.
_MAX_PROMPT_TOKENS = 1024
_RETOKENIZE_SLACK = 16


def _build_prompt(context, query, safety_note):
    """Assemble the instruction prompt around the given context text."""
    return f"""
You are a yoga and wellness assistant.

{safety_note}
INSTRUCTION:
Answer ONLY using the context.
If the answer is not in the context, reply exactly:
"I'm not sure based on the provided knowledge base."

CONTEXT:
{context}

QUESTION:
{query}

ANSWER:
""".strip()


def _fit_context(context, query, safety_note):
    """Trim ``context`` so the whole prompt stays within the token limit.

    The tokenizer truncates from the right, and the question sits after the
    context, so an over-long context would otherwise push the question out.
    """
    scaffold_ids = tokenizer(_build_prompt("", query, safety_note))["input_ids"]
    budget = max(_MAX_PROMPT_TOKENS - len(scaffold_ids) - _RETOKENIZE_SLACK, 0)

    context_ids = tokenizer(context, add_special_tokens=False)["input_ids"]
    if len(context_ids) <= budget:
        return context
    return tokenizer.decode(context_ids[:budget], skip_special_tokens=True)


def generate_answer(query, retrieved_chunks, is_unsafe=False):
    """Generate an answer to ``query`` grounded in the retrieved chunks.

    Args:
        query: The user's question.
        retrieved_chunks: Iterable of dicts with ``source`` and ``content`` keys.
        is_unsafe: When true, safety rules are added to the prompt telling the
            model not to give medical advice and to recommend a professional.

    Returns:
        The decoded model output as a string. Sampling is enabled, so repeated
        calls with the same input may return different text.
    """
    context = "\n\n".join(
        f"Source: {c['source']}\n{c['content']}"
        for c in retrieved_chunks
    ).strip()

    safety_note = ""
    if is_unsafe:
        safety_note = (
            "SAFETY RULES:\n"
            "- The query involves pregnancy or a medical condition.\n"
            "- Do NOT give medical advice.\n"
            "- Provide a gentle warning and recommend consulting a professional.\n\n"
        )

    context = _fit_context(context, query, safety_note)
    prompt = _build_prompt(context, query, safety_note)

    # truncation stays on as a safety net; inputs must live on the same device as the model.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=_MAX_PROMPT_TOKENS
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=220,
            temperature=0.2,
            do_sample=True
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
%%writefile /kaggle/working/yoga-rag-microapp/frontend/app_gradio.py
import gradio as gr
from pathlib import Path
import os
import sys

# Locate the project root: two levels above this file, or relative to the working
# directory when __file__ is undefined (code executed directly in a notebook cell).
try:
    PROJECT_ROOT = Path(__file__).resolve().parents[1]
except NameError:
    cwd = Path(os.getcwd()).resolve()
    if (cwd / "backend").exists():
        PROJECT_ROOT = cwd
    else:
        PROJECT_ROOT = cwd / "yoga-rag-microapp"

# Two import roots are needed: `backend/` for the flat imports below (`safety`, `db`,
# `rag.*`, `llm.*`), and the project root because backend/rag/retriever.py itself imports
# `backend.config`. Without the root, launching this file by path instead of with
# `python -m` from the project directory fails on that import.
for import_root in (PROJECT_ROOT / "backend", PROJECT_ROOT):
    if str(import_root) not in sys.path:
        sys.path.append(str(import_root))

from safety import check_safety
from rag.retriever import retrieve_chunks
from llm.local_llm import generate_answer
from db import log_query


def answer_question(query):
    """Run one question through safety check, retrieval, generation and logging.

    Args:
        query: Text from the UI. ``None``, empty or whitespace-only input is
            rejected with a prompt to enter a question.

    Returns:
        A tuple ``(answer, sources, warning)`` of strings for the three output
        boxes. ``sources`` lists each source file once, in retrieval order;
        ``warning`` is empty unless the safety check flagged the query.
    """
    if not query or not query.strip():
        return "Please enter a question.", "", ""

    safety = check_safety(query)
    is_unsafe = safety["isUnsafe"]
    reason = safety["reason"]

    retrieved_chunks = retrieve_chunks(query, k=3)

    answer = generate_answer(
        query=query,
        retrieved_chunks=retrieved_chunks,
        is_unsafe=is_unsafe
    )

    # Several chunks can come from the same file; dict.fromkeys removes repeats
    # while keeping first-seen order.
    unique_sources = dict.fromkeys(chunk["source"] for chunk in retrieved_chunks)
    sources = "\n".join(f"- {name}" for name in unique_sources) or "No sources found."

    warning = ""
    if is_unsafe:
        warning = (
            "⚠️ SAFETY WARNING: This query may involve health risks.\n"
            "Please consult a doctor or certified yoga therapist."
        )

    log_query(
        query=query,
        retrieved_chunks=retrieved_chunks,
        answer=answer,
        is_unsafe=is_unsafe,
        safety_reason=reason
    )

    return answer, sources, warning


# Single-input, three-output Gradio UI; the outputs map one-to-one onto the
# (answer, sources, warning) tuple returned by answer_question.
demo = gr.Interface(
    fn=answer_question,
    inputs=gr.Textbox(label="Ask anything about yoga"),
    outputs=[
        gr.Textbox(label="Answer"),
        gr.Textbox(label="Sources Used"),
        gr.Textbox(label="Safety Warning")
    ],
    title="🧘 Ask Me Anything About Yoga",
    description="RAG wellness assistant using FAISS retrieval + safety guardrails + MongoDB logging."
)

# share=True asks Gradio for a public share link in addition to the local server.
demo.launch(share=True)

In [None]:
# Run from the project root so `python -m` can resolve the `backend` and `frontend` packages.
%cd /kaggle/working/yoga-rag-microapp
# Build the FAISS index first; the app's retriever loads it from disk when answering.
!python -m backend.rag.ingest
# Start the Gradio app.
!python -m frontend.app_gradio

In [None]:
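# Optional: uncomment the two lines below to package the project as a zip archive
# in /kaggle/working and print its size.
#!cd /kaggle/working && zip -r yoga-rag-microapp.zip yoga-rag-microapp
#!ls -lh /kaggle/working/yoga-rag-microapp.zip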