<a href="https://colab.research.google.com/github/PradhyumnaPrakash/Ancient-Indian-Philosophy-Based-RAG-Systems/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing


**Converting Bhagavad Gita into JSONL**



In [None]:
# Upload the raw Bhagavad Gita and Itihasa CSV files into the Colab runtime.
# The later cells read them by filename from the working directory.
# `uploaded` keeps whatever files.upload() returns
# (presumably a mapping of filename -> file contents; confirm against the Colab docs).

from google.colab import files
uploaded = files.upload()

In [None]:
import pandas as pd

# Read the raw Bhagavad Gita verses (adjust the filename to match the uploaded file).
gita = pd.read_csv("Bhagwad_Gita.csv")

# Sanity check: first rows, available columns and total number of verses.
print(gita.head(), gita.columns, sep="\n")
print(f"Rows in Bhagavad Gita: {len(gita)}")



In [None]:
# Keep only the transliterated Sanskrit and its English meaning, under the
# column names used by the rest of the pipeline.
#
# The rename happens BEFORE the column selection on purpose: rename() ignores
# labels that are not present, so on a re-run (when `gita` already carries the
# new names) the rename is a no-op and the selection still succeeds. Selecting
# by the original names first would raise KeyError the second time this cell runs.
gita = gita.rename(columns={
    'Transliteration': 'sanskrit_text',
    'EngMeaning': 'english_text'
})[['sanskrit_text', 'english_text']]

# Preview
gita.head()

In [None]:
import json

# Persist the cleaned verses as CSV.
gita.to_csv("bhagavad_gita_clean.csv", index=False)

# Persist the same verses as JSONL: one JSON object per line, English first,
# with non-ASCII characters written as-is rather than \u-escaped.
with open("bhagavad_gita_clean.jsonl", "w", encoding="utf-8") as f:
    f.writelines(
        json.dumps({"english_text": english, "sanskrit_text": sanskrit}, ensure_ascii=False) + "\n"
        for english, sanskrit in zip(gita["english_text"], gita["sanskrit_text"])
    )

print("Rows in Bhagavad Gita:", len(gita))

**Converting Itihasa into JSONL**

In [None]:
import pandas as pd

# Read the three Itihasa splits; the split distinction is not used afterwards,
# they are only kept as separate frames until the concat below.
train, test, val = map(
    pd.read_csv,
    ("Itihasa_training.csv", "Itihasa_testing.csv", "Itihasa_validation.csv"),
)

# Stack the splits into one frame with a fresh 0..n-1 index and store it on disk.
df = pd.concat([train, test, val], ignore_index=True)
df.to_csv("itihasa.csv", index=False)

print("Combined CSV saved as itihasa.csv")

In [None]:
# Project the combined frame onto the two text columns and give them the same
# names the Bhagavad Gita export uses.
df_jsonl = df[["English", "Sanskrit"]].rename(
    columns={"English": "english_text", "Sanskrit": "sanskrit_text"}
)

# Write one JSON object per row; force_ascii=False keeps the Sanskrit text
# as real characters instead of \u escapes.
with open("itihasa.jsonl", "w", encoding="utf-8") as f:
    f.writelines(
        row.to_json(force_ascii=False) + "\n"
        for _, row in df_jsonl.iterrows()
    )

print("JSONL saved as itihasa.jsonl")

**Combine Gita and Itihasa**

In [None]:
import json

def read_jsonl(path):
    """Parse a UTF-8 JSON Lines file into a list of decoded objects.

    Lines that are empty or contain only whitespace are skipped; every other
    line is decoded with ``json.loads`` and appended in file order.
    """
    records = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            if not raw_line.strip():
                continue
            records.append(json.loads(raw_line))
    return records

# Load both cleaned corpora under their own names. Reusing the name `gita` here
# would replace the DataFrame that the Bhagavad Gita cells operate on with a
# plain list, so re-running any of those cells afterwards would fail.
gita_records = read_jsonl("bhagavad_gita_clean.jsonl")
itihasa_records = read_jsonl("itihasa.jsonl")

# Tag every record with the corpus it came from; Gita records come first,
# followed by the Itihasa records, each in file order.
combined = (
    [{**row, "source": "bhagavad_gita"} for row in gita_records]
    + [{**row, "source": "itihasa"} for row in itihasa_records]
)

# Write the merged corpus as JSONL, keeping non-ASCII text unescaped.
with open("itihasa_gita_combined.jsonl", "w", encoding="utf-8") as f:
    f.writelines(json.dumps(r, ensure_ascii=False) + "\n" for r in combined)

print("Saved itihasa_gita_combined.jsonl")

# FAISS Index

In [None]:
!pip install faiss-cpu sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import pandas as pd

import json

# Load the merged corpus from the JSONL file written by the "Combine Gita and
# Itihasa" step. `docs` was previously referenced without ever being defined,
# which raised NameError on a fresh kernel.
with open("itihasa_gita_combined.jsonl", "r", encoding="utf-8") as f:
    docs = [json.loads(line) for line in f if line.strip()]

# Only records with real English text can be embedded. A missing value in the
# source CSVs surfaces here as a non-string (NaN / null), which the encoder is
# not expected to accept, so such rows are dropped up front.
docs = [
    d for d in docs
    if isinstance(d.get("english_text"), str) and d["english_text"].strip()
]

# Use English text for embeddings
texts = [d["english_text"] for d in docs]

# Multilingual embedding model (good for English + Sanskrit in context)
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
embs = model.encode(texts, normalize_embeddings=True, convert_to_numpy=True)

# Build FAISS index. The vectors are L2-normalised above, so the inner-product
# index ranks by cosine similarity.
dim = embs.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(embs.astype("float32"))

# Metadata store (for later retrieval of Sanskrit + source).
# Row i of `store` corresponds to vector i in `index`.
store = pd.DataFrame({
    "id": range(len(docs)),
    "english_text": texts,
    "sanskrit_text": [d.get("sanskrit_text", "") for d in docs],
    "source": [d["source"] for d in docs]
})


# Model Building

**Retrieval-only**

In [None]:
def faiss_search(query, k=3):
    """Return the rows of ``store`` most similar to ``query``.

    The query is embedded with the same model and normalisation as the indexed
    passages, searched against ``index``, and the matching metadata rows are
    returned as a copy with an extra ``score`` column holding the similarity
    values from the index.

    Args:
        query: The question or search text.
        k: Maximum number of passages to return. Values larger than the number
            of indexed vectors are clamped, so fewer than ``k`` rows may come back.

    Returns:
        A DataFrame with the columns of ``store`` plus ``score``; empty when the
        index holds no vectors or ``k`` is not positive.
    """
    # FAISS fills missing results with label -1 when k exceeds the index size;
    # passed to .iloc, -1 would silently select the LAST row of the store.
    k = max(0, min(int(k), index.ntotal))
    if k == 0:
        empty = store.iloc[0:0].copy()
        empty["score"] = np.empty(0, dtype="float32")
        return empty

    qv = model.encode([query], normalize_embeddings=True, convert_to_numpy=True)
    scores, ids = index.search(qv.astype("float32"), k)

    # Defensive: drop any remaining -1 padding before positional indexing.
    valid = ids[0] >= 0
    results = store.iloc[ids[0][valid]].copy()
    results["score"] = scores[0][valid]
    return results

def retrieval_only(query, k=3):
    """Answer ``query`` with the raw retrieved passages, no language model involved.

    Looks up the top ``k`` hits via ``faiss_search`` and renders each one as an
    "English: ... / Sanskrit: ..." pair; the pairs are joined with a ``---``
    separator line and returned as a single string.
    """
    passages = []
    for row in faiss_search(query, k=k).to_dict("records"):
        passages.append(
            f"English: {row['english_text']}\nSanskrit: {row['sanskrit_text']}"
        )
    return "\n\n---\n\n".join(passages)

In [None]:
# Evaluation questions, grouped by theme. The same list is reused by every
# system compared in this notebook.
questions = [
    # Philosophy & Metaphysics
    "What happens to the soul after death?",
    "Why should I follow my dharma?",
    "How can I practice detachment in daily life?",
    "What does karma really mean for my future?",
    "How can knowledge help me reach peace?",

    # Emotion & Self-Mastery
    "How do I control my anger?",
    "What should I do when I feel afraid?",
    "How do I deal with grief after losing someone?",
    "Why do I feel so much hesitation before making a big decision?",
    "How can I overcome desire and temptation?",

    # Interpersonal & Leadership
    "How should a good leader behave?",
    "What is the right way to treat my teacher or mentor?",
    "How can I show respect to someone I look up to?",
    "How should I stay loyal to my friends?",
    "How can I balance my family responsibilities with my work?",

    # Therapy & Self-Growth
    "How do I handle stress when life feels overwhelming?",
    "Can meditation help me control my emotions?",
    "How do I let go of attachments that hurt me?",
    "What should I do when I feel stuck in a personal crisis?",
    "How can I become more resilient after failure?"
]

# One record per question: the question itself and the top-3 retrieved passages.
results = [
    {"question": q, "retrieval_answer": retrieval_only(q, k=3)}
    for q in questions
]

# Tabulate and persist the retrieval-only baseline.
df_results = pd.DataFrame(results)
df_results.to_csv("retrieval_results.csv", index=False)

print("All retrieval-only answers saved to retrieval_results.csv")
df_results.head()


In [None]:
from google.colab import files

# Download the file the retrieval-only cell actually wrote. The previous name
# ('retrieval_results_final.csv') is never created anywhere in this notebook.
files.download('retrieval_results.csv')

**Base Qwen and RAG+Qwen**

In [None]:
!pip install -q faiss-cpu sentence-transformers transformers accelerate torch bitsandbytes


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Hugging Face checkpoint used by both the plain and the retrieval-augmented Qwen runs.
model_name = "Qwen/Qwen2.5-3B-Instruct"

# Tokenizer and weights are loaded from the same checkpoint. The weights are
# requested in fp16 (intended for a GPU runtime) and device placement is left
# to device_map="auto".
qwen_tok = AutoTokenizer.from_pretrained(model_name)
qwen_model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.float16, device_map="auto"
)

In [None]:
def plain_qwen(query):
    """Answer ``query`` with Qwen alone (no retrieved passages).

    Builds an instruction prompt around the question, samples a completion of
    60-250 new tokens, and returns only the newly generated text.

    Args:
        query: The question to answer.

    Returns:
        The model's answer as a string, without the prompt.
    """
    # The instruction sentences are adjacent string literals, so each one needs
    # its own trailing separator; without them the prompt read
    # "...to answerskip the question if it takes too longQuestion: ...".
    # NOTE(review): the first sentence refers to "the passages below", but this
    # baseline supplies no passages - confirm whether that wording is intended.
    prompt = (
        "Answer the question using only the passages below. "
        "Explain their meaning in clear, simple English, at least 3 sentences long. "
        "Use ancient Sanskrit scriptures Bhagavad Gita and Itihasa to answer. "
        "Skip the question if it takes too long.\n\n"
        f"Question: {query}\nAnswer:"
    )
    inputs = qwen_tok(prompt, return_tensors="pt", truncation=True).to(qwen_model.device)
    outputs = qwen_model.generate(
        **inputs,
        do_sample=True, temperature=0.7, top_p=0.9,
        min_new_tokens=60, max_new_tokens=250
    )
    # For a causal LM, generate() returns prompt tokens followed by the new
    # tokens; slice the prompt off so the stored answer is not prefixed by it.
    prompt_len = inputs["input_ids"].shape[1]
    return qwen_tok.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

In [None]:
from google.colab import files

# Ask the un-augmented model every evaluation question, logging progress
# before each (slow) generation call.
plain_results = []
for q in questions:
    print("Plain Qwen:", q)
    plain_results.append({"question": q, "plain_qwen_answer": plain_qwen(q)})

# Tabulate, write to disk, then hand the CSV to the browser for download.
df_plain = pd.DataFrame(plain_results)
df_plain.to_csv("plain_qwen_results.csv", index=False)

files.download("plain_qwen_results.csv")

In [None]:
def build_prompt(query, hits):
    """Format the retrieved passages and the question into one Qwen prompt.

    Args:
        query: The question to answer.
        hits: DataFrame of retrieved passages with ``english_text`` and
            ``sanskrit_text`` columns (as returned by ``faiss_search``).

    Returns:
        The prompt string: instructions, the passages, then the question.
    """
    context = "\n\n".join(
        f"English: {row['english_text']}\nSanskrit: {row['sanskrit_text']}"
        for _, row in hits.iterrows()
    )
    return (
        "Answer the question using only the passages below. "
        "Explain their meaning in clear, simple English, at least 3 sentences long. "
        "After your explanation, always include the exact Sanskrit verse(s) from the passages "
        "that support your answer. Do not omit them.\n\n"
        f"Passages:\n{context}\n\n"
        f"Question: {query}\nAnswer:"
    )


def rag_qwen(query, k=5):
    """Answer ``query`` with Qwen, grounded in the top ``k`` retrieved passages.

    Args:
        query: The question to answer.
        k: Number of passages to retrieve and place in the prompt.

    Returns:
        The model's answer as a string, without the prompt.
    """
    # Step 1: Retrieve passages
    hits = faiss_search(query, k=k)

    # Step 2: Build prompt with context. build_prompt is defined above because
    # no definition of it exists elsewhere in the notebook.
    prompt = build_prompt(query, hits)

    # Step 3: Tokenize and run through Qwen
    inputs = qwen_tok(prompt, return_tensors="pt", truncation=True).to(qwen_model.device)
    outputs = qwen_model.generate(
        **inputs,
        do_sample=True, temperature=0.7, top_p=0.9,
        min_new_tokens=60, max_new_tokens=250
    )

    # Step 4: Decode only the newly generated tokens. generate() returns the
    # prompt tokens followed by the completion, and the prompt here contains
    # all retrieved passages, which would otherwise be saved as part of the answer.
    prompt_len = inputs["input_ids"].shape[1]
    return qwen_tok.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

In [None]:
# RAG + Qwen with the top 3 retrieved passages per question.
rag_results = []
for q in questions:
    print("RAG Qwen:", q)
    rag_ans = rag_qwen(q, k=3) #K=3
    rag_results.append({"question": q, "rag_qwen_answer": rag_ans})

# Save under a k=3 filename. This cell used to write to the "k_5" file, which
# the k=5 run then overwrote, so the k=3 results were lost and the download
# was mislabelled.
df_rag = pd.DataFrame(rag_results)
df_rag.to_csv("rag_qwen_results_k_3_final.csv", index=False)

from google.colab import files
files.download("rag_qwen_results_k_3_final.csv")

In [None]:
from google.colab import files

# RAG + Qwen with the top 5 retrieved passages per question; progress is
# logged before each generation call.
rag_results = []
for q in questions:
    print("RAG Qwen:", q)
    rag_results.append({"question": q, "rag_qwen_answer": rag_qwen(q, k=5)})

# Tabulate, write to disk, then hand the CSV to the browser for download.
df_rag = pd.DataFrame(rag_results)
df_rag.to_csv("rag_qwen_results_k_5_final.csv", index=False)

files.download("rag_qwen_results_k_5_final.csv")

**RAG+Gemini**

In [None]:
!pip install google-generativeai
import google.generativeai as genai

genai.configure(api_key="INSERT YOUR API KEY HERE")

gemini = genai.GenerativeModel("gemini-2.5-flash")

In [None]:
def rag_gemini(query, k=5):
    """Answer ``query`` with Gemini, grounded in the top ``k`` retrieved passages.

    Args:
        query: The question to answer.
        k: Number of passages to retrieve and place in the prompt.

    Returns:
        The text of Gemini's response.
    """
    hits = faiss_search(query, k=k)
    context = "\n\n".join(
        f"English: {row['english_text']}\nSanskrit: {row['sanskrit_text']}"
        for _, row in hits.iterrows()
    )

    # The pieces below are adjacent string literals, so every sentence needs its
    # own trailing separator. Previously the last two instructions and the first
    # passage were fused: "...own words.skip the question if it takes too longEnglish: ...".
    prompt = (
        "Answer the question using only the passages below. "
        "Explain their meaning in clear, simple English, at least 3 sentences long. "
        "After your explanation, always include the exact Sanskrit verse(s) from the passages "
        "that support your answer. Do not omit them. "
        "Make sure to not just copy the passages, analyze and understand them and then reply in your own words. "
        "Skip the question if it takes too long.\n\n"
        f"{context}\n\n"
        f"Question: {query}\nAnswer:"
    )

    response = gemini.generate_content(prompt)
    return response.text

In [None]:
# Show the Gemini answers inline instead of saving them to a file.
# The rule and the question are printed before the (slow) API call so that
# progress is visible; each answer is followed by two blank lines.
for q in questions:
    print("=" * 80, f"Q: {q}", sep="\n")
    print("A:", rag_gemini(q, k=3), end="\n\n\n")