# CS 5588 — Enhanced RAG + Gemini + Fine-Tuning on Online Dataset
_Generated: 2025-09-14T13:53:05_

### 1) Install

In [1]:
!pip -q install -U langchain langchain-community chromadb pypdf \
                   sentence-transformers transformers datasets evaluate peft accelerate tiktoken \
                   langchain-google-genai google-genai
print("If core libs upgraded, consider Runtime > Restart runtime.")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m74.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.5/310.5 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m503.6/503.6 kB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.5 MB/s[0m eta [36

### 2) Keys & Imports

In [2]:
import os, getpass, json, sys, platform, pathlib, datetime, importlib, hashlib, torch

# Reset & set ONE AI Studio key (starts with 'AIza...')
for k in ["GOOGLE_API_KEY", "GEMINI_API_KEY"]:
    os.environ.pop(k, None)
os.environ["GOOGLE_API_KEY"] = getpass.getpass("Gemini API key: AIzaSyDW34r4FhWpC74T_ACUIiqvaRfhkaF9GCU").strip()

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQA
from langchain.llms.huggingface_pipeline import HuggingFacePipeline

from datasets import load_dataset
import evaluate
from transformers import (AutoModelForSeq2SeqLM, AutoTokenizer,
                          DataCollatorForSeq2Seq, Trainer, TrainingArguments, pipeline)
from peft import LoraConfig, get_peft_model, PeftModel

pathlib.Path("data").mkdir(exist_ok=True)
pathlib.Path("artifacts/ft").mkdir(parents=True, exist_ok=True)
print("Env ready.")

Gemini API key: AIzaSyDW34r4FhWpC74T_ACUIiqvaRfhkaF9GCU··········
Env ready.


In [3]:
from google import genai
try:
    _client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])
    _resp = _client.models.generate_content(model="gemini-1.5-flash", contents="pong?")
    print("Gemini ping:", _resp.text[:40])
except Exception as e:
    print("Gemini ping failed:", e)

Gemini ping: Pong is a table tennis-inspired arcade g


### 3) Env log → env_rag.json

In [4]:
def pv(m):
    try:
        import importlib
        mod = importlib.import_module(m)
        return getattr(mod, "__version__", "unknown")
    except:
        return "not installed"

env = {
  "timestamp": datetime.datetime.now().isoformat(),
  "python": sys.version,
  "platform": platform.platform(),
  "cuda_available": torch.cuda.is_available(),
  "packages": {m: pv(m) for m in [
    "langchain","langchain_community","chromadb","tiktoken","transformers",
    "datasets","evaluate","peft","sentence_transformers","langchain_google_genai","google.genai"
  ]}
}
json.dump(env, open("env_rag.json","w"), indent=2)
print(json.dumps(env, indent=2))

{
  "timestamp": "2025-09-19T04:28:24.204674",
  "python": "3.12.11 (main, Jun  4 2025, 08:56:18) [GCC 11.4.0]",
  "platform": "Linux-6.1.123+-x86_64-with-glibc2.35",
  "cuda_available": true,
  "packages": {
    "langchain": "0.3.27",
    "langchain_community": "0.3.29",
    "chromadb": "1.1.0",
    "tiktoken": "0.11.0",
    "transformers": "4.56.1",
    "datasets": "4.1.1",
    "evaluate": "0.4.6",
    "peft": "0.17.1",
    "sentence_transformers": "5.1.0",
    "langchain_google_genai": "unknown",
    "google.genai": "1.38.0"
  }
}


### 4) Upload & Load project docs, Chunk, Build Chroma

In [5]:
try:
    from google.colab import files
    up = files.upload()
    for n,c in up.items():
        open(os.path.join("data", n), "wb").write(c)
    print("Uploaded:", list(up.keys()))
except Exception as e:
    print("Colab upload UI not available:", e)

# Load docs
def load_docs(folder="data"):
    docs=[]
    for fname in os.listdir(folder):
        p=os.path.join(folder,fname)
        if not os.path.isfile(p):
            continue
        ext=fname.lower().split(".")[-1]
        try:
            if ext=="pdf":
                loader=PyPDFLoader(p)
            elif ext in ["txt","md","markdown"]:
                loader=TextLoader(p, encoding="utf-8")
            else:
                print("Skip", fname);
                continue
            docs += loader.load()
        except Exception as e:
            print("Fail", fname, e)
    return docs

raw_docs = load_docs("data")
assert len(raw_docs) > 0, "No supported documents loaded. Upload at least one PDF/TXT."

# Default chunking (500 / 100)
CHUNK_SIZE, CHUNK_OVERLAP = 500, 100
splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
splits = splitter.split_documents(raw_docs)
print("Docs:", len(raw_docs), "Chunks:", len(splits))

# Chroma with MiniLM (baseline embeddings)
from shutil import rmtree
emb_minilm = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
rmtree("./chroma_minilm", ignore_errors=True)
vs = Chroma.from_documents(splits, embedding=emb_minilm, persist_directory="./chroma_minilm")
vs.persist()
retriever = vs.as_retriever(search_kwargs={"k":4})
print("Vector store ready (MiniLM, 500/100).")

Saving NeurIPS-2024-can-large-language-model-agents-simulate-human-trust-behavior-Paper-Conference.pdf to NeurIPS-2024-can-large-language-model-agents-simulate-human-trust-behavior-Paper-Conference.pdf
Saving NeurIPS-2024-richelieu-self-evolving-llm-based-agents-for-ai-diplomacy-Paper-Conference.pdf to NeurIPS-2024-richelieu-self-evolving-llm-based-agents-for-ai-diplomacy-Paper-Conference.pdf
Saving NeurIPS-2024-mdagents-an-adaptive-collaboration-of-llms-for-medical-decision-making-Paper-Conference.pdf to NeurIPS-2024-mdagents-an-adaptive-collaboration-of-llms-for-medical-decision-making-Paper-Conference.pdf
Uploaded: ['NeurIPS-2024-can-large-language-model-agents-simulate-human-trust-behavior-Paper-Conference.pdf', 'NeurIPS-2024-richelieu-self-evolving-llm-based-agents-for-ai-diplomacy-Paper-Conference.pdf', 'NeurIPS-2024-mdagents-an-adaptive-collaboration-of-llms-for-medical-decision-making-Paper-Conference.pdf']
Docs: 126 Chunks: 1066


  emb_minilm = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Vector store ready (MiniLM, 500/100).


  vs.persist()


### 5) RAG Chains: Gemini & Local FLAN-T5 (pre-FT)

In [6]:
llm_gemini = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0.2,
    api_key=os.environ["GOOGLE_API_KEY"],
)

qa_gemini = RetrievalQA.from_chain_type(
    llm=llm_gemini, chain_type="stuff", retriever=retriever, return_source_documents=True
)

# Local FLAN-T5 (pre-FT)
base_model = "google/flan-t5-small"
tok = AutoTokenizer.from_pretrained(base_model)
base = AutoModelForSeq2SeqLM.from_pretrained(base_model)
pipe_base = pipeline(
    "text2text-generation", model=base, tokenizer=tok, max_new_tokens=256,
    device=0 if torch.cuda.is_available() else -1
)
llm_local = HuggingFacePipeline(pipeline=pipe_base)
qa_local = RetrievalQA.from_chain_type(llm=llm_local, chain_type="stuff", retriever=retriever)

def ask(chain, q):
    r = chain.invoke({"query": q})
    print("\nQ:", q); print("A:", r.get("result",""))
    if "source_documents" in r:
        srcs = [d.metadata.get("source","?") for d in r["source_documents"][:3]]
        print("Sources:", srcs)

# Quick, distinct one-liners
ask(qa_gemini, "Define Trust Rate in the MAP Trust Game in one sentence.")
ask(qa_local,  "What do the Moderator and Recruiter do in MDAgents?")

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cuda:0
  llm_local = HuggingFacePipeline(pipeline=pipe_base)



Q: Define Trust Rate in the MAP Trust Game in one sentence.
A: Trust Rate is the percentage of trustors who choose to trust the trustee, given a specific probability (p) of the trustee also choosing to trust.
Sources: ['data/NeurIPS-2024-can-large-language-model-agents-simulate-human-trust-behavior-Paper-Conference.pdf', 'data/NeurIPS-2024-can-large-language-model-agents-simulate-human-trust-behavior-Paper-Conference.pdf', 'data/NeurIPS-2024-can-large-language-model-agents-simulate-human-trust-behavior-Paper-Conference.pdf']

Q: What do the Moderator and Recruiter do in MDAgents?
A: Answer the question: 3.1 Agent Roles Moderator. The moderator agent functions as a general practitioner (GP) or emergency department doctor who first triages the medical query. This agent assesses the complexity of the problem and determines whether it should be handled by a single agent, a MDT, or an ICT. The moderator ensures the appropriate pathway be selected based on the query’s complexity and oversee

# Chunk Sensitivity

In [7]:
splitter_small = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
splits_small = splitter_small.split_documents(raw_docs)
print("Small-chunk count:", len(splits_small))

from shutil import rmtree
rmtree("./chroma_minilm_small", ignore_errors=True)
vs_small = Chroma.from_documents(splits_small, embedding=emb_minilm, persist_directory="./chroma_minilm_small")
vs_small.persist()
retriever_small = vs_small.as_retriever(search_kwargs={"k":4})

qa_gemini_small = RetrievalQA.from_chain_type(
    llm=llm_gemini, chain_type="stuff", retriever=retriever_small, return_source_documents=True
)

r_def = qa_gemini.invoke({"query":"Name the four stages of MDAgents in order."})
r_small = qa_gemini_small.invoke({"query":"Name the four stages of MDAgents in order."})
print("\n[500/100]:", r_def["result"][:220])
print("[300/50]:", r_small["result"][:220])

Small-chunk count: 1667

[500/100]: The four stages of MDAgents are: 1) Medical complexity check; 2) Recruitment based on medical complexity; 3) Analysis and synthesis; 4) Final decision-making.
[300/50]: The provided text mentions three stages of MDAgents: 1) Medical complexity check; 2)  (The second stage is not fully specified); and 3) (The third stage is also not fully specified).  A fourth stage is not named.


### 6) Fine-Tune on online dataset (Hugging Face `squad`, sampled) with LoRA

In [11]:
# =========================
# 6) Fine-Tune on SQuAD with LoRA (W&B disabled, v4/v5-safe)
# =========================
import os
from inspect import signature
from datasets import load_dataset
from transformers import DataCollatorForSeq2Seq, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForSeq2SeqLM

# --- Disable W&B + misc telemetry ---
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"
os.environ["WANDB_SILENT"] = "true"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

dataset_name = "squad"
train_n, eval_n, seed = 800, 200, 42

# 1) Load & sample
ds = load_dataset(dataset_name)
ds_tr = ds["train"].shuffle(seed=seed).select(range(min(train_n, len(ds["train"]))))
ds_ev = ds["validation"].shuffle(seed=seed).select(range(min(eval_n, len(ds["validation"]))))

# 2) Preprocess (no as_target_tokenizer; use text_target)
def preprocess(ex):
    ctx, q = ex["context"], ex["question"]
    ans = ex["answers"]["text"][0] if ex["answers"]["text"] else ""
    prompt = f"Use the context to answer concisely.\nContext: {ctx}\nQuestion: {q}\nAnswer:"
    model_in = tok(prompt, truncation=True, max_length=512)
    labels = tok(text_target=ans, truncation=True, max_length=64)
    model_in["labels"] = labels["input_ids"]
    model_in["id"] = ex["id"]
    return model_in

proc_tr = ds_tr.map(preprocess, remove_columns=ds_tr.column_names)
proc_ev = ds_ev.map(preprocess, remove_columns=ds_ev.column_names)

# 3) Collator
collator = DataCollatorForSeq2Seq(tokenizer=tok, model=base)

# 4) Fresh base (avoid double-PEFT) + LoRA
base_ft = AutoModelForSeq2SeqLM.from_pretrained(base_model)
lora = LoraConfig(
    r=16, lora_alpha=32, target_modules=["q","k","v","o"],
    lora_dropout=0.05, bias="none", task_type="SEQ_2_SEQ_LM"
)
ft_model = get_peft_model(base_ft, lora)
out_dir = "artifacts/ft/flan_t5_small_lora"

# 5) TrainingArguments (v4/v5 compatibility + no reporting)
TA = TrainingArguments
ta_sig = signature(TA).parameters

args_kwargs = dict(
    output_dir=out_dir,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-4,
    logging_steps=50,
    eval_steps=200,
    save_steps=200,
    save_total_limit=2,
    bf16=bool(torch.cuda.is_available()),
    fp16=False,
)

# v4 uses evaluation_strategy; v5 uses eval_strategy
if "evaluation_strategy" in ta_sig:
    args_kwargs["evaluation_strategy"] = "steps"
elif "eval_strategy" in ta_sig:
    args_kwargs["eval_strategy"] = "steps"

# Disable external reporting entirely
if "report_to" in ta_sig:
    args_kwargs["report_to"] = "none"   # or [] in some versions

# Only set if supported in your version
if "predict_with_generate" in ta_sig:
    args_kwargs["predict_with_generate"] = True

args = TA(**args_kwargs)

# 6) Train + quick eval
trainer = Trainer(
    model=ft_model,
    args=args,
    train_dataset=proc_tr,
    eval_dataset=proc_ev,
    data_collator=collator,
    tokenizer=tok,   # OK (warning only); v5 prefers processing_class
)
trainer.train()
mets = trainer.evaluate()
print("Eval metrics (LoRA on SQuAD):", mets)

# 7) Save adapter + tokenizer + run cfg
trainer.model.save_pretrained(out_dir)
tok.save_pretrained(out_dir)

import json
ft_cfg = {
    "base_model": base_model,
    "adapter_dir": out_dir,
    "dataset": dataset_name,
    "train_n": train_n,
    "eval_n": eval_n,
    "seed": seed,
}
json.dump(ft_cfg, open("ft_config.json","w"), indent=2)
print(json.dumps(ft_cfg, indent=2))


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss,Validation Loss
200,0.6184,0.425682


Eval metrics (LoRA on SQuAD): {'eval_loss': 0.4256817698478699, 'eval_runtime': 2.7943, 'eval_samples_per_second': 71.573, 'eval_steps_per_second': 17.893, 'epoch': 1.0}
{
  "base_model": "google/flan-t5-small",
  "adapter_dir": "artifacts/ft/flan_t5_small_lora",
  "dataset": "squad",
  "train_n": 800,
  "eval_n": 200,
  "seed": 42
}


### 7) Merge & Evaluate (EM/F1 quick subset)

In [12]:
import evaluate
from transformers import AutoModelForSeq2SeqLM, pipeline
from peft import PeftModel

out_dir = "artifacts/ft/flan_t5_small_lora"

# Load base, attach LoRA adapter, then merge into a standalone model
ft_loaded = AutoModelForSeq2SeqLM.from_pretrained(base_model)
ft_loaded = PeftModel.from_pretrained(ft_loaded, model_id=out_dir)
ft_loaded = ft_loaded.merge_and_unload()  # final merged weights

# Simple generation pipeline for evaluation
pipe_ft = pipeline(
    "text2text-generation",
    model=ft_loaded,
    tokenizer=tok,
    max_new_tokens=64,
    device=0 if torch.cuda.is_available() else -1
)

# Quick EM/F1 on a small subset
metric = evaluate.load("squad")
preds, refs = [], []
n = min(100, len(ds_ev))

for ex in ds_ev.select(range(n)):
    prompt = (
        "Use the context to answer concisely.\n"
        f"Context: {ex['context']}\n"
        f"Question: {ex['question']}\n"
        "Answer:"
    )
    pred = pipe_ft(prompt)[0]["generated_text"].strip()
    golds = ex["answers"]["text"] if ex["answers"]["text"] else [""]

    preds.append({"id": ex["id"], "prediction_text": pred})
    refs.append({
        "id": ex["id"],
        "answers": {
            "text": golds,
            "answer_start": ex["answers"]["answer_start"]
        }
    })

scores = metric.compute(predictions=preds, references=refs)
print("EM/F1:", scores)

Device set to use cuda:0


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


EM/F1: {'exact_match': 79.0, 'f1': 86.98520923520924}


### 8) Plug FT model into RAG and compare to Gemini

In [13]:
import os, json, datetime
from langchain.chains import RetrievalQA
from langchain.llms.huggingface_pipeline import HuggingFacePipeline

# Build FT LLM wrapper and QA chain (reuse the retriever built earlier)
llm_ft = HuggingFacePipeline(pipeline=pipe_ft)
qa_ft = RetrievalQA.from_chain_type(
    llm=llm_ft, chain_type="stuff", retriever=retriever, return_source_documents=True
)

# Ensure we have a Gemini chain; if not created earlier, create it now
try:
    qa_gemini
except NameError:
    from langchain_google_genai import ChatGoogleGenerativeAI
    llm_gemini = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        temperature=0.2,
        api_key=os.environ.get("GOOGLE_API_KEY", "")
    )
    qa_gemini = RetrievalQA.from_chain_type(
        llm=llm_gemini, chain_type="stuff", retriever=retriever, return_source_documents=True
    )

# Short, paper-specific questions (feel free to tweak)
qs = [
    "State what VRR checks for in the Trust-Game paper, in one line.",
    "Name the four stages of MDAgents in exact order.",
    "Give a one-line summary of how Richelieu self-evolves via self-play."
]

def run_and_show(qa_chain, label, q):
    r = qa_chain.invoke({"query": q})
    ans = r.get("result", "").strip()
    srcs = [d.metadata.get("source", "?") for d in r.get("source_documents", [])[:3]]
    print(f"\n[{label}] Q: {q}\nA: {ans}\nSources: {srcs}")
    return {"answer": ans, "sources": srcs}

# Compare and capture results
comparison = {"timestamp": datetime.datetime.now().isoformat(), "questions": []}
print("\n=== RAG Comparison: Gemini vs Fine-tuned FLAN-T5 ===")
for q in qs:
    gem = run_and_show(qa_gemini, "Gemini", q)
    ft  = run_and_show(qa_ft,     "FT-FLAN", q)
    comparison["questions"].append({"q": q, "gemini": gem, "ft_flan": ft})

# Save a record for your write-up
os.makedirs("artifacts/ft", exist_ok=True)
with open("artifacts/ft/rag_comparison.json", "w") as f:
    json.dump(comparison, f, indent=2)
print("\nSaved: artifacts/ft/rag_comparison.json")


=== RAG Comparison: Gemini vs Fine-tuned FLAN-T5 ===


Token indices sequence length is longer than the specified maximum sequence length for this model (635 > 512). Running this sequence through the model will result in indexing errors



[Gemini] Q: State what VRR checks for in the Trust-Game paper, in one line.
A: VRR checks the percentage of valid responses given by different LLMs in the Trust Game.
Sources: ['data/NeurIPS-2024-can-large-language-model-agents-simulate-human-trust-behavior-Paper-Conference.pdf', 'data/NeurIPS-2024-can-large-language-model-agents-simulate-human-trust-behavior-Paper-Conference.pdf', 'data/NeurIPS-2024-can-large-language-model-agents-simulate-human-trust-behavior-Paper-Conference.pdf']

[FT-FLAN] Q: State what VRR checks for in the Trust-Game paper, in one line.
A: LLM agents manifest trust be- havior by letting LLM agents play the Trust Game (Section 2.1 Game 1). In Behavioral Economics, trust is widely measured by the initial amount sent from the trustor to the trustee in the Trust Game (Glaeser et
Sources: ['data/NeurIPS-2024-can-large-language-model-agents-simulate-human-trust-behavior-Paper-Conference.pdf', 'data/NeurIPS-2024-can-large-language-model-agents-simulate-human-trust-beh

### 9) Save/Update configs

In [14]:
import os, json, hashlib, datetime

def file_info(path):
    with open(path, "rb") as f:
        b = f.read()
    return {
        "name": os.path.basename(path),
        "bytes": len(b),
        "sha256": hashlib.sha256(b).hexdigest()
    }

# Record all files in data/ (PDFs/TXTs you uploaded)
files_used = []
if os.path.exists("data"):
    for root, _, files in os.walk("data"):
        for fname in files:
            p = os.path.join(root, fname)
            if os.path.isfile(p):
                files_used.append(file_info(p))

# Build final config dict
rag_cfg = {
  "timestamp": datetime.datetime.now().isoformat(),
  "files_used": files_used,
  "chunk_settings_tested": [
      {"chunk_size": 500, "chunk_overlap": 100},
      {"chunk_size": 300, "chunk_overlap": 50}
  ],
  "embedding_models_tested": ["sentence-transformers/all-MiniLM-L6-v2"],
  "vectorstores": [d for d in ["chroma_minilm", "chroma_minilm_small"] if os.path.exists(d)],
  "llm": {
      "providers": ["google-genai", "hf-local"],
      "models": ["gemini-1.5-flash", "google/flan-t5-small (LoRA)"]
  },
  "retriever_k": 4
}

# Include FT config if present
if os.path.exists("ft_config.json"):
    try:
        rag_cfg["finetune"] = json.load(open("ft_config.json"))
    except Exception as e:
        rag_cfg["finetune"] = {"error": f"Could not read ft_config.json: {e}"}

# Include RAG comparison path if present
cmp_path = "artifacts/ft/rag_comparison.json"
if os.path.exists(cmp_path):
    rag_cfg["comparison_log"] = cmp_path

# Save rag_run_config.json
with open("rag_run_config.json", "w") as f:
    json.dump(rag_cfg, f, indent=2)

print(json.dumps(rag_cfg, indent=2))
print("\nSaved: rag_run_config.json")

{
  "timestamp": "2025-09-19T04:58:34.335655",
  "files_used": [
    {
      "name": "NeurIPS-2024-can-large-language-model-agents-simulate-human-trust-behavior-Paper-Conference.pdf",
      "bytes": 5121261,
      "sha256": "a729dec371ccdf5705d89915d720271d1bdd35a1cf3998c7db65db8891c51140"
    },
    {
      "name": "NeurIPS-2024-richelieu-self-evolving-llm-based-agents-for-ai-diplomacy-Paper-Conference.pdf",
      "bytes": 2331306,
      "sha256": "74788fa9a33b3926aac6d275ae299e61cec918abc6525a42baeb56eb5204b53c"
    },
    {
      "name": "NeurIPS-2024-mdagents-an-adaptive-collaboration-of-llms-for-medical-decision-making-Paper-Conference.pdf",
      "bytes": 7320523,
      "sha256": "ee5165fd72486bef7039a6810b29d3f9e852c06bb27e9b1048533fa250f53887"
    }
  ],
  "chunk_settings_tested": [
    {
      "chunk_size": 500,
      "chunk_overlap": 100
    },
    {
      "chunk_size": 300,
      "chunk_overlap": 50
    }
  ],
  "embedding_models_tested": [
    "sentence-transformers/all-Min

### 10) Notes
- Use GPU runtime in Colab for fine-tuning.
- Keep keys out of version control.
- Increase dataset size/epochs for stronger results.