# CS 5588 — Enhanced RAG + Gemini + Fine-Tuning on Online Dataset
_Generated: 2025-09-14T13:53:05_

### 1) Install

In [1]:

!pip -q install -U langchain langchain-community chromadb pypdf             sentence-transformers transformers datasets evaluate peft accelerate tiktoken             langchain-google-genai google-genai
print("If upgraded core libs, consider restarting runtime.")


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m55.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.5/310.5 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m503.6/503.6 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.6 MB/s[0m eta [36

### 2) Keys & Imports

In [2]:

import os, getpass, json, sys, platform, pathlib, datetime, importlib, torch
if not os.getenv("GEMINI_API_KEY"):
    os.environ["GEMINI_API_KEY"] = getpass.getpass("Enter your GEMINI_API_KEY: ")
os.environ["GOOGLE_API_KEY"] = os.environ.get("GOOGLE_API_KEY", os.environ["GEMINI_API_KEY"])

from google import genai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms.huggingface_pipeline import HuggingFacePipeline

from datasets import load_dataset
import evaluate
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Trainer, TrainingArguments, pipeline
from peft import LoraConfig, get_peft_model, PeftModel

pathlib.Path("data").mkdir(exist_ok=True)
pathlib.Path("artifacts/ft").mkdir(parents=True, exist_ok=True)
print("Env ready.")


Enter your GEMINI_API_KEY: ··········
Env ready.


### 3) Env log → env_rag.json

In [3]:

def pv(m):
    try:
        import importlib
        mod = importlib.import_module(m)
        return getattr(mod, "__version__", "unknown")
    except: return "not installed"
env = {
  "timestamp": datetime.datetime.now().isoformat(),
  "python": sys.version, "platform": platform.platform(),
  "cuda_available": torch.cuda.is_available(),
  "packages": {m: pv(m) for m in [
    "langchain","langchain_community","chromadb","tiktoken","transformers",
    "datasets","evaluate","peft","sentence_transformers",
    "langchain_google_genai","google.genai"
  ]}
}
json.dump(env, open("env_rag.json","w"), indent=2)
print(json.dumps(env, indent=2))


{
  "timestamp": "2025-09-19T04:10:15.592488",
  "python": "3.12.11 (main, Jun  4 2025, 08:56:18) [GCC 11.4.0]",
  "platform": "Linux-6.1.123+-x86_64-with-glibc2.35",
  "cuda_available": true,
  "packages": {
    "langchain": "0.3.27",
    "langchain_community": "0.3.29",
    "chromadb": "1.1.0",
    "tiktoken": "0.11.0",
    "transformers": "4.56.1",
    "datasets": "4.1.1",
    "evaluate": "0.4.6",
    "peft": "0.17.1",
    "sentence_transformers": "5.1.0",
    "langchain_google_genai": "unknown",
    "google.genai": "1.38.0"
  }
}


### 4) Upload & Load project docs, Chunk, Build Chroma

In [4]:

# Upload
try:
    from google.colab import files
    up = files.upload()
    import os
    for n,c in up.items():
        open(os.path.join("data", n), "wb").write(c)
    print("Uploaded:", list(up.keys()))
except Exception as e:
    print("Colab upload UI not available.", e)

# Load
import os
def load_docs(folder="data"):
    docs=[]
    for fname in os.listdir(folder):
        p=os.path.join(folder,fname)
        if not os.path.isfile(p): continue
        ext=fname.lower().split(".")[-1]
        try:
            if ext=="pdf": loader=PyPDFLoader(p)
            elif ext in ["txt","md","markdown"]: loader=TextLoader(p, encoding="utf-8")
            else: print("Skip", fname); continue
            docs += loader.load()
        except Exception as e:
            print("Fail", fname, e)
    return docs

raw_docs=load_docs("data")
splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
splits=splitter.split_documents(raw_docs)

# Chroma
emb = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vs = Chroma.from_documents(splits, embedding=emb, persist_directory="./chroma_minilm")
vs.persist()
retriever = vs.as_retriever(search_kwargs={"k":4})
print("Docs:", len(raw_docs), "Chunks:", len(splits))


Saving paper1.pdf to paper1.pdf
Saving paper2.pdf to paper2.pdf
Saving paper3.pdf to paper3.pdf
Uploaded: ['paper1.pdf', 'paper2.pdf', 'paper3.pdf']


  emb = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Docs: 97 Chunks: 667


  vs.persist()


### 5) RAG Chains: Gemini & Local FLAN-T5 (pre-FT)

In [5]:

llm_gemini = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.2)
qa_gemini = RetrievalQA.from_chain_type(llm=llm_gemini, chain_type="stuff", retriever=retriever, return_source_documents=True)

base_model = "google/flan-t5-small"
tok = AutoTokenizer.from_pretrained(base_model)
base = AutoModelForSeq2SeqLM.from_pretrained(base_model)
pipe_base = pipeline("text2text-generation", model=base, tokenizer=tok, max_new_tokens=256, device=0 if torch.cuda.is_available() else -1)
llm_local = HuggingFacePipeline(pipeline=pipe_base)
qa_local = RetrievalQA.from_chain_type(llm=llm_local, chain_type="stuff", retriever=retriever)

def ask(chain, q):
    r=chain({"query": q})
    print("\nQ:", q); print("A:", r.get("result",""))
ask(qa_gemini, "What is the obejective of paper1?")
ask(qa_local, "Summarize paper2.")


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cuda:0
  llm_local = HuggingFacePipeline(pipeline=pipe_base)
  r=chain({"query": q})



Q: What is the obejective of paper1?
A: The objective of paper "o1" (which seems to be a reference to a paper) is "Opportunities and challenges of agi".

Q: Summarize paper2.
A: II.


### 6) Fine-Tune on online dataset (Hugging Face `squad`, sampled) with LoRA

In [7]:
from datasets import load_dataset
dataset_name = "my_papers.jsonl" # Define dataset_name
seed = 42 # Define seed
train_n = 800 # Define train_n
eval_n = 200 # Define eval_n
ds = load_dataset("json", data_files={"train":"/content/my_papers.jsonl", "validation":"/content/my_papers.jsonl"})

ds_tr = ds["train"].shuffle(seed=seed).select(range(min(train_n, len(ds["train"]))))
ds_ev = ds["validation"].shuffle(seed=seed).select(range(min(eval_n, len(ds["validation"]))))

def preprocess(ex):
    ctx, q = ex["context"], ex["question"]
    ans = ex["answers"]["text"][0] if ex["answers"]["text"] else ""
    prompt = f"Use the context to answer concisely.\nContext: {ctx}\nQuestion: {q}\nAnswer:"
    model_in = tok(prompt, truncation=True, max_length=512)
    with tok.as_target_tokenizer():
        labels = tok(ans, truncation=True, max_length=64)
    model_in["labels"] = labels["input_ids"]
    model_in["id"] = ex["id"]
    return model_in

proc_tr = ds_tr.map(preprocess, remove_columns=ds_tr.column_names)
proc_ev = ds_ev.map(preprocess, remove_columns=ds_ev.column_names)
collator = DataCollatorForSeq2Seq(tokenizer=tok, model=base)

lora = LoraConfig(r=16, lora_alpha=32, target_modules=["q","k","v","o"], lora_dropout=0.05, bias="none", task_type="SEQ_2_SEQ_LM")
ft_model = get_peft_model(base, lora)
out_dir="artifacts/ft/flan_t5_small_lora"
args = TrainingArguments(output_dir=out_dir, num_train_epochs=1, per_device_train_batch_size=4,
                         per_device_eval_batch_size=4, learning_rate=5e-4, logging_steps=50,
                         eval_strategy="steps", eval_steps=200, save_steps=200,
                         save_total_limit=2,
                         bf16=torch.cuda.is_available(), fp16=False)

trainer = Trainer(model=ft_model, args=args, train_dataset=proc_tr, eval_dataset=proc_ev,
                  data_collator=collator, tokenizer=tok)
trainer.train()
mets = trainer.evaluate()
print(mets)

trainer.model.save_pretrained(out_dir)
tok.save_pretrained(out_dir)

ft_cfg={"base_model":base_model,"adapter_dir":out_dir,"dataset":dataset_name,
        "train_n":train_n,"eval_n":eval_n,"seed":seed}
json.dump(ft_cfg, open("ft_config.json","w"), indent=2)
print(json.dumps(ft_cfg, indent=2))

Map:   0%|          | 0/9 [00:00<?, ? examples/s]



Map:   0%|          | 0/9 [00:00<?, ? examples/s]

  trainer = Trainer(model=ft_model, args=args, train_dataset=proc_tr, eval_dataset=proc_ev,


Step,Training Loss,Validation Loss


{'eval_loss': 2.827467441558838, 'eval_runtime': 0.164, 'eval_samples_per_second': 54.871, 'eval_steps_per_second': 18.29, 'epoch': 1.0}
{
  "base_model": "google/flan-t5-small",
  "adapter_dir": "artifacts/ft/flan_t5_small_lora",
  "dataset": "my_papers.jsonl",
  "train_n": 800,
  "eval_n": 200,
  "seed": 42
}


### 7) Merge & Evaluate (EM/F1 quick subset)

In [10]:
import evaluate
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
from peft import PeftModel
import torch

# Load SQuAD metric (still works for F1/EM)
metric = evaluate.load("squad")

# Load fine-tuned model
ft_loaded = AutoModelForSeq2SeqLM.from_pretrained(base_model)
ft_loaded = PeftModel.from_pretrained(ft_loaded, model_id="artifacts/ft/flan_t5_small_lora")
ft_loaded = ft_loaded.merge_and_unload()

# Create pipeline
pipe_ft = pipeline(
    "text2text-generation",
    model=ft_loaded,
    tokenizer=tok,
    max_new_tokens=64,
    device=0 if torch.cuda.is_available() else -1
)

# Collect predictions & references
preds, refs = [], []
n = min(100, len(ds_ev))   # evaluate up to 100 examples

for ex in ds_ev.select(range(n)):
    # Build prompt
    prompt = f"Use the context to answer concisely.\nContext: {ex['context']}\nQuestion: {ex['question']}\nAnswer:"

    # Model prediction
    pred = pipe_ft(prompt)[0]["generated_text"].strip()

    # Ground truth answers
    golds = ex["answers"]["text"] if ex["answers"]["text"] else [""]

    # Store
    preds.append({"id": ex["id"], "prediction_text": pred})
    refs.append({
        "id": ex["id"],
        "answers": {"text": golds, "answer_start": [0]*len(golds)}  # dummy starts
    })

# Compute metrics
results = metric.compute(predictions=preds, references=refs)
print("Evaluation Results:", results)


Device set to use cuda:0


Evaluation Results: {'exact_match': 0.0, 'f1': 24.4538805965975}


In [11]:
print(ds_ev)
print(ds_ev[0])

Dataset({
    features: ['id', 'context', 'question', 'answers'],
    num_rows: 9
})
{'id': 'p2_q1', 'context': 'TradingAgents is a multi-agent LLM framework for financial trading, inspired by real-world trading firms. It features specialized agents like fundamental analysts, sentiment analysts, news analysts, technical analysts, traders, and risk managers.', 'question': 'What is the main contribution of Paper2?', 'answers': {'text': ['It introduces TradingAgents, a multi-agent framework where LLM-powered agents in different roles collaborate to make better trading decisions.']}}


### 8) Plug FT model into RAG and compare to Gemini

In [12]:

from langchain.llms.huggingface_pipeline import HuggingFacePipeline
llm_ft = HuggingFacePipeline(pipeline=pipe_ft)
qa_ft = RetrievalQA.from_chain_type(llm=llm_ft, chain_type="stuff", retriever=retriever)

qs = [
    "Summarize the dataset assumptions made in the uploaded materials.",
    "Identify two limitations or open problems noted in the documents."
]
print("\n=== RAG: Gemini ===")
for q in qs: print("\nQ:", q, "\nA:", RetrievalQA.from_chain_type(llm=llm_gemini, chain_type='stuff', retriever=retriever).invoke({'query': q})['result'][:800])
print("\n=== RAG: Fine-tuned FLAN-T5 ===")
for q in qs: print("\nQ:", q, "\nA:", qa_ft.invoke({'query': q})['result'][:800])



=== RAG: Gemini ===

Q: Summarize the dataset assumptions made in the uploaded materials. 
A: Based on the provided materials, the assumptions made about the dataset (specifically the multi-modal stock market dataset) are:

1.  **Specific Time Frame:** The historical stock prices are assumed to be relevant and available only for the period from **January 1st, 2024, to March 29th, 2024**.
2.  **Selected Stocks:** The dataset focuses on a selection of "various stocks such as Apple, Nvidia, Microsoft, Meta, Google, and more," implying it does not cover the entire market.
3.  **Inclusion of Specific Data Modalities:** It is assumed that the integration of "historical stock prices, news articles, social media sentiment, insider transactions, financial statements, and 60 technical indicators per asset" provides a comprehensive view for its intended use.
4.  **Relevance of Technical Indic

Q: Identify two limitations or open problems noted in the documents. 
A: Here are two limitations or op

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



Q: Identify two limitations or open problems noted in the documents. 
A: Identify two limitations or open problems noted in the documents.


### 9) Save/Update configs

In [13]:

rag_cfg = {
  "chunk_settings_tested":[{"chunk_size":500,"chunk_overlap":100},{"chunk_size":300,"chunk_overlap":50}],
  "embedding_models_tested":["sentence-transformers/all-MiniLM-L6-v2"],
  "llm":{"providers":["google-genai","hf-local"],"models":["gemini-2.5-flash","google/flan-t5-small (LoRA)"]},
  "retriever_k":4,
  "finetune": json.load(open("ft_config.json"))
}
json.dump(rag_cfg, open("rag_run_config.json","w"), indent=2)
print(json.dumps(rag_cfg, indent=2))


{
  "chunk_settings_tested": [
    {
      "chunk_size": 500,
      "chunk_overlap": 100
    },
    {
      "chunk_size": 300,
      "chunk_overlap": 50
    }
  ],
  "embedding_models_tested": [
    "sentence-transformers/all-MiniLM-L6-v2"
  ],
  "llm": {
    "providers": [
      "google-genai",
      "hf-local"
    ],
    "models": [
      "gemini-2.5-flash",
      "google/flan-t5-small (LoRA)"
    ]
  },
  "retriever_k": 4,
  "finetune": {
    "base_model": "google/flan-t5-small",
    "adapter_dir": "artifacts/ft/flan_t5_small_lora",
    "dataset": "my_papers.jsonl",
    "train_n": 800,
    "eval_n": 200,
    "seed": 42
  }
}


### 10) Notes
- Use GPU runtime in Colab for fine-tuning.
- Keep keys out of version control.
- Increase dataset size/epochs for stronger results.