In [1]:
# ====================
# 1. Setup & Imports
# ====================
import pandas as pd
import os
import numpy as np
import spacy
import re
import pinecone
from dotenv import load_dotenv
import torch
from pinecone import Pinecone
from tqdm.auto import tqdm
# Populate the process environment from a .env file. This runs before the
# `datasets` / `sentence_transformers` imports below, so keep it in this position.
# NOTE(review): presumably this is what supplies PINECONE_API_KEY, which
# PineconeRetriever reads through os.getenv — confirm against the project's .env.
load_dotenv()
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
# spaCy pipeline: load the small English model, then append the "sentencizer" component.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("sentencizer")

# Run on the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Sentence embedding model, placed on the device selected above.
embedding_model = SentenceTransformer("all-mpnet-base-v2", device=device)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#=====================
#3. llm setup
#=====================
# setup/libraries
from transformers import BitsAndBytesConfig
from transformers.utils import is_flash_attn_2_available
from sentence_transformers import SentenceTransformer, util
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM
# 4-bit loading with float16 compute. The model-loading cell only passes this
# object on when `use_quantization_config` is True.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

In [3]:
class PineconeRetriever:
    """Embed a text query and look up its nearest neighbours in a Pinecone index."""

    def __init__(self, index_name="datatonic-rags", embedding_model=None):
        """Connect to the Pinecone index and remember the embedding model.

        Args:
            index_name: Name of the Pinecone index to open.
            embedding_model: Object exposing ``encode(text)``; its result must
                support ``.tolist()``. Used by :meth:`query`.
        """
        # The API key is taken from the PINECONE_API_KEY environment variable.
        api_key = os.getenv("PINECONE_API_KEY")
        self.pc = pinecone.Pinecone(api_key=api_key)
        self.index = self.pc.Index(index_name)
        self.embedding_model = embedding_model

    def query(self, query: str, top_k: int = 1):
        """Return ``(text, score)`` pairs for the ``top_k`` matches of ``query``.

        The text of each pair is whatever is stored under the ``"text"`` key of
        the match metadata; the score is the match's ``score`` attribute.
        """
        vector = self.embedding_model.encode(query).tolist()
        response = self.index.query(
            vector=vector,
            top_k=top_k,
            include_metadata=True,
        )

        hits = []
        for hit in response.matches:
            hits.append((hit.metadata["text"], hit.score))
        return hits

In [4]:
# Shared retriever instance; it embeds queries with the model loaded in the setup cell.
retrieval = PineconeRetriever(embedding_model=embedding_model)

In [5]:
# Smoke-test the retriever on one sample question and show the raw (text, score) pairs.
user_query = "what was the increase in the operating profit for space systems from 2011 to 2012?"
relevant_chunks = retrieval.query(user_query, top_k=1)
print(relevant_chunks)

[(['| 13.0% ( 13.0 % )\nbacklog at year-end | $ 18900          | $ 20500          | $ 18100         \n\n2014 compared to 2013 space systems 2019 net sales for 2014 increased $ 107 million , or 1% ( 1 % ) , compared to 2013 .\n', 'the increase was primarily attributable to higher net sales of approximately $ 340 million for the orion program due to increased volume ( primarily the first unmanned test flight of the orion mpcv ) ; and about $ 145 million for commercial space transportation programs due to launch-related activities .\n', 'the increases were offset by lower net sales of approximately $ 335 million for government satellite programs due to decreased volume ( primarily aehf , gps-iii and muos ) ; and about $ 45 million for various other programs due to decreased volume .\nspace systems 2019 operating profit for 2014 was comparable to 2013 .\n', 'operating profit decreased by approximately $ 20 million for government satellite programs due to lower volume ( primarily aehf and g

In [6]:
#=====================
#4. llm loading
#=====================
model_id = "google/gemma-2b-it"
use_quantization_config = False

# Resolve the target device once instead of hard-coding "cuda", so this cell
# also runs on CPU-only machines (the setup cell already falls back to CPU
# for the embedding model).
llm_device = "cuda" if torch.cuda.is_available() else "cpu"
# Keep float16 on the GPU; use float32 on the CPU, where half-precision
# kernels are not reliably available.
llm_dtype = torch.float16 if llm_device == "cuda" else torch.float32

# Flash Attention 2 is only selected when the package is available AND the GPU
# reports compute capability major version >= 8. The device check comes first
# so get_device_capability is never called without a GPU.
if (
    llm_device == "cuda"
    and is_flash_attn_2_available()
    and torch.cuda.get_device_capability(0)[0] >= 8
):
    attn_implementation = "flash_attention_2"
else:
    attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")
print(f"[INFO] Using model_id: {model_id}")

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)
Gamma_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_id,
    torch_dtype=llm_dtype,
    quantization_config=quantization_config if use_quantization_config else None,
    low_cpu_mem_usage=False,
    attn_implementation=attn_implementation,
)

# The explicit move is skipped when the quantization config is in use,
# exactly as in the original cell.
if not use_quantization_config:
    Gamma_model.to(llm_device)

[INFO] Using attention implementation: flash_attention_2
[INFO] Using model_id: google/gemma-2b-it


You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [01:45<00:00, 52.90s/it]


In [7]:
# ====================
# 5. RAG Query
# ====================
def ask(query, 
        temperature=0.7,
        max_new_tokens=512,
        format_answer_text=True, 
        return_answer_only=True,
        task="qa"):
    """Answer ``query`` with the LLM, using the best-matching retrieved chunk as context.

    Args:
        query: The user's question.
        temperature: Sampling temperature. A value of ``None`` or ``<= 0``
            switches to greedy decoding, because sampling needs a strictly
            positive temperature.
        max_new_tokens: Upper bound on the number of generated tokens.
        format_answer_text: When True, return only the newly generated text
            (prompt removed, surrounding whitespace stripped). When False,
            return the full decoded sequence, prompt included.
        return_answer_only: When True, return just the answer string;
            otherwise return ``(answer, top_chunk, score)``.
        task: Currently unused; kept so existing callers keep working.

    Returns:
        ``str`` or ``(str, chunk, score)`` depending on ``return_answer_only``.

    Raises:
        IndexError: If the retriever returns no match for ``query``.
    """
    # --- RETRIEVAL ---
    matches = retrieval.query(query, top_k=1)
    if not matches:
        # Same exception type as indexing an empty list, but with a useful message.
        raise IndexError(f"No context chunk was retrieved for query: {query!r}")
    top_chunk, score = matches[0]

    # The retrieved text may be a single string or a list of strings.
    context_text = top_chunk if isinstance(top_chunk, str) else ' '.join(top_chunk)

    # --- PROMPT FORMATTING ---
    prompt = f"""Answer the question based on the context below.
    
    Question: {query}
    Context: {context_text}
    Answer:"""

    # --- GENERATION ---
    inputs = tokenizer(prompt, return_tensors="pt").to(Gamma_model.device)
    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature is not None and temperature > 0:
        generation_kwargs["do_sample"] = True
        generation_kwargs["temperature"] = temperature
    else:
        # Greedy decoding; passing a non-positive temperature with sampling is rejected.
        generation_kwargs["do_sample"] = False
    outputs = Gamma_model.generate(**inputs, **generation_kwargs)

    # --- RESPONSE CLEANING ---
    if format_answer_text:
        # Drop the prompt by token count. Removing it with str.replace fails
        # whenever the decoded prompt is not byte-identical to the original
        # prompt text, which leaves the whole prompt in the answer.
        prompt_length = inputs["input_ids"].shape[-1]
        answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
    else:
        answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # --- RETURN ---
    return answer if return_answer_only else (answer, top_chunk, score)

In [8]:
# ====================
#6. Testing Function
# ====================
def test_rag_system(query: str, 
                   show_context: bool = True,
                   max_new_tokens: int = 256) -> str:

    print(f"\n{'='*50}")
    print(f"QUERY: {query}")
    print(f"{'='*50}")
    
    answer, context_chunk, score = ask(
        query=query,
        return_answer_only=False,
        max_new_tokens=max_new_tokens,
        task="qa"
    )
    
    print(f"\nANSWER:\n{answer}\n")
    
    if show_context:
        print(f"{'-'*50}")
        print(f"CONTEXT (Score: {score:.4f}):")
        print(f"{'-'*50}")
    
    return answer

In [9]:
# Questions pushed through the whole RAG pipeline, one run per entry.
test_queries = ["what is the net change in revenue from 2007 to 2008?"]

for query in test_queries:
    test_rag_system(query)
    # Wide rule separating consecutive runs.
    print(f"\n{'=' * 100}\n")


QUERY: what is the net change in revenue from 2007 to 2008?

ANSWER:
21% decrease in net sales from 2008 to 2009.

--------------------------------------------------
CONTEXT (Score: 0.7002):
--------------------------------------------------


