In [1]:
# ====================
# 1. Setup & Imports
# ====================
import pandas as pd
import pinecone
import numpy as np
import spacy
import re
import torch
from tqdm.auto import tqdm
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("sentencizer")
device = "cuda" if torch.cuda.is_available() else "cpu"
embedding_model = SentenceTransformer("all-mpnet-base-v2", device=device)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# ====================
# 2. Load & Preprocess Data
# ====================
def load_financial():
    df = pd.read_csv("finaicial.csv")
    return df
df = load_financial()

In [None]:
df.head()

In [None]:
# ====================
# 4. Sentence Chunking
# ====================
def split_into_chunks(text, chunk_size=10):
    """Split text into chunks of `chunk_size` sentences."""
    doc = nlp(text)
    sentences = [str(sent) for sent in doc.sents]
    return [sentences[i:i + chunk_size] for i in range(0, len(sentences), chunk_size)]

# Process text and tables
df["sentence_chunks"] = df["text"].apply(split_into_chunks)

In [None]:
# =====================
# 5. Tokens Counts
# =====================
#token limits for all-mpnet-base-2 is 384
def counts(text):
    """checking the tokens to see if its within the token limit"""
    return len(text)
df["sentence_token_counts"] = df["sentence_chunks"].apply(counts)

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
# ====================
# 6. Embedding Generation
# ====================
def generate_embeddings(chunks):
    """Generate emdeddings layers"""
    # Flatten chunks if they're nested lists
    flat_chunks = [" ".join(chunk) if isinstance(chunk, list) else chunk for chunk in chunks]
    return embedding_model.encode(flat_chunks, convert_to_numpy=True)

# Generate embeddings correctly
df["text_embeddings"] = df["sentence_chunks"].apply(
    lambda chunks: generate_embeddings(chunks)
)

# Verify shapes
print(f"Sample embeddings shape: {df['text_embeddings'].iloc[0][0].shape}")

In [None]:
# Flatten chunks and embeddings
all_text_chunks = [chunk for doc_chunks in df["sentence_chunks"] for chunk in doc_chunks]
all_text_embeddings = [emb for doc_embs in df["text_embeddings"] for emb in doc_embs]

# Check consistency
assert len(all_text_chunks) == len(all_text_embeddings), "Mismatch between chunks and embeddings!"
print(f"Total chunks: {len(all_text_chunks)}, Total embeddings: {len(all_text_embeddings)}")

In [None]:
# ====================
# 7. Vector Search 
# ====================
class VectorSearch:
    def __init__(self, embeddings, texts):
        # Stack embeddings into (N, 768) tensor
        self.embeddings = torch.tensor(np.stack(embeddings), dtype=torch.float32).to(device)
        self.texts = texts
    
    def search(self, query, top_k=1):
        
        """Search for top_k most similar chunks (now defaults to top 1)"""
        query_embedding = embedding_model.encode(query, convert_to_tensor=True)
        cos_scores = util.cos_sim(query_embedding, self.embeddings)[0]
        top_indices = torch.topk(cos_scores, k=top_k).indices.cpu().numpy()
        return [(self.texts[i], cos_scores[i].item()) for i in top_indices] 

In [None]:
# Initialize search
text_searcher = VectorSearch(all_text_embeddings, all_text_chunks)

In [None]:
# ====================
# 8. Example Query
# ====================
query = "what was the increase in the operating profit for space systems from 2011 to 2012?"
results = text_searcher.search(query)

print(f"Top results for '{query}':")
for i, (chunk, score) in enumerate(results):
    print(f"\nRank {i + 1} (Score: {score:.4f}):")
    print(chunk)

In [None]:
# ========================
# Save in vector database
# ========================
pc = pinecone.Pinecone(api_key="pcsk_7B7VXN_6M4qLKUbxBrU4iCXs5VVy4ZCQCoTJUNJayD2EJa6PeqGygBfxzBb64YL2D56C9U")
index_name = "datatonic-rags"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768, 
        metric="cosine",
        spec=pinecone.ServerlessSpec(
            cloud="aws",
            region="us-east-1" 
        )
    )

index = pc.Index(index_name)
print(f"Index '{index_name}' is ready!")

In [None]:
# batch_size = 100
# for i in range(0, len(all_text_chunks), batch_size):
#     batch_ids = [str(j) for j in range(i, min(i+batch_size, len(all_text_chunks)))]
#     batch_vectors = [emb.tolist() for emb in all_text_embeddings[i:i+batch_size]]
#     batch_metadata = [{"text": text} for text in all_text_chunks[i:i+batch_size]]
    
#     index.upsert(
#         vectors=zip(batch_ids, batch_vectors, batch_metadata)
#     )
# print("Upsert Complete")

In [None]:
vectors_to_upsert = [
    (
        f"vec_{i}",  # Unique ID for each vector
        emb.tolist() if hasattr(emb, 'tolist') else emb,  # Ensure it's a list
        {"text": chunk}  # Store the text in metadata
    )
    for i, (chunk, emb) in enumerate(zip(all_text_chunks, all_text_embeddings))
]

# 2. Batch upsert (Pinecone recommends batches of 100-200)
batch_size = 100

for i in tqdm(range(0, len(vectors_to_upsert), batch_size)):
    # Get batch of vectors
    i_end = min(i+batch_size, len(vectors_to_upsert))
    batch = vectors_to_upsert[i:i_end]
    
    # Upsert to Pinecone
    try:
        index.upsert(vectors=batch)
    except Exception as e:
        print(f"Error upserting batch {i}-{i_end}: {e}")
        # Optionally: retry or save failed batches

print("Upsert complete!")

In [None]:
print(index.describe_index_stats())
sample_query = [0.1] * 768 
results = index.query(vector=sample_query, top_k=1, include_metadata=True)
print(results)

In [None]:
def query_pinecone(query: str, top_k: int = 1):
    query_embedding = embedding_model.encode(query).tolist()
    results = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True
    )
    chunks = [match.metadata["text"] for match in results.matches]
    return chunks
user_query = "what was the increase in the operating profit for space systems from 2011 to 2012?"
relevant_chunks = query_pinecone(user_query)
print(relevant_chunks)

In [None]:
# ====================
# 8. Save/Load System
# ====================
# # Save embeddings and metadata
# pd.DataFrame({
#     "text": all_text_chunks,
#     "embedding": [emb.tolist() for emb in all_text_embeddings]
# }).to_parquet("financial_embeddings.parquet")
        #chunks = [match.metadata["text"] for match in results.matches]


In [45]:
class PineconeRetriever:
    def __init__(self, index_name="datatonic-rags", embedding_model=None):
        self.pc = Pinecone(api_key="pcsk_7B7VXN_6M4qLKUbxBrU4iCXs5VVy4ZCQCoTJUNJayD2EJa6PeqGygBfxzBb64YL2D56C9U")
        self.index = self.pc.Index(index_name)
        self.embedding_model = embedding_model
    def query(self, query: str, top_k: int = 1):
        query_embedding = self.embedding_model.encode(query).tolist()
        results = self.index.query(
            vector=query_embedding,
            top_k=top_k,
            include_metadata=True
        )
        chunks_with_scores = [(match.metadata["text"], match.score) for match in results.matches]
        return chunks_with_scores
retrieval = PineconeRetriever(embedding_model = embedding_model)

In [46]:
user_query = "what was the increase in the operating profit for space systems from 2011 to 2012?"
relevant_chunks = retrieval.query(user_query)
print(relevant_chunks)

[(['| 13.0% ( 13.0 % )\nbacklog at year-end | $ 18900          | $ 20500          | $ 18100         \n\n2014 compared to 2013 space systems 2019 net sales for 2014 increased $ 107 million , or 1% ( 1 % ) , compared to 2013 .\n', 'the increase was primarily attributable to higher net sales of approximately $ 340 million for the orion program due to increased volume ( primarily the first unmanned test flight of the orion mpcv ) ; and about $ 145 million for commercial space transportation programs due to launch-related activities .\n', 'the increases were offset by lower net sales of approximately $ 335 million for government satellite programs due to decreased volume ( primarily aehf , gps-iii and muos ) ; and about $ 45 million for various other programs due to decreased volume .\nspace systems 2019 operating profit for 2014 was comparable to 2013 .\n', 'operating profit decreased by approximately $ 20 million for government satellite programs due to lower volume ( primarily aehf and g

In [25]:
#=====================
#10. llm loading
#=====================
gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = round(gpu_memory_bytes / (2**30))
print(f"Available GPU memory: {gpu_memory_gb} GB")
if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
    use_quantization_config = True 
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False 
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb > 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
    use_quantization_config = False 
    model_id = "google/gemma-7b-it"

print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")

Available GPU memory: 12 GB
GPU memory: 12 | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.
use_quantization_config set to: False
model_id set to: google/gemma-2b-it


In [26]:
#=====================
#10. llm setup
#=====================
# setup/libraries
from transformers import BitsAndBytesConfig
from transformers.utils import is_flash_attn_2_available
from sentence_transformers import SentenceTransformer, util
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)

In [27]:
#=====================
#11. llm loading
#=====================
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
  attn_implementation = "flash_attention_2"
else:
  attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")
print(f"[INFO] Using model_id: {model_id}")
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)
Gamma_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id, 
                                                 torch_dtype=torch.float16,
                                                 quantization_config=quantization_config if use_quantization_config else None,
                                                 low_cpu_mem_usage=False,
                                                 attn_implementation=attn_implementation) 
if not use_quantization_config:
    Gamma_model.to("cuda")

[INFO] Using attention implementation: flash_attention_2
[INFO] Using model_id: google/gemma-2b-it


You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:43<00:00, 81.75s/it]


In [60]:
# ====================
# 11. Unified RAG Query Function (Updated)
# ====================
def ask(query, 
        temperature=0.7,
        max_new_tokens=512,
        format_answer_text=True, 
        return_answer_only=True,
        task="qa"):
    
    # --- RETRIEVAL ---
# In ask() function, change:
    top_chunk, score = retrieval.query(query, top_k=1)[0]  # Get first result    
    # --- PROMPT FORMATTING ---
    prompt = f"""Answer the question based on the context below.
    
    Question: {query}
    Context: {top_chunk if isinstance(top_chunk, str) else ' '.join(top_chunk)}
    Answer:"""
    
    # --- GENERATION ---
    inputs = tokenizer(prompt, return_tensors="pt").to(Gamma_model.device)
    outputs = Gamma_model.generate(
        **inputs,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    
    # --- RESPONSE CLEANING ---
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer = full_response.replace(prompt, "").strip() if format_answer_text else full_response
    
    # --- RETURN ---
    return answer if return_answer_only else (answer, top_chunk, score)


In [61]:
# ====================
# Testing Function (Updated)
# ====================
def test_rag_system(query: str, 
                   show_context: bool = True,
                   max_new_tokens: int = 256) -> str:

    print(f"\n{'='*50}")
    print(f"QUERY: {query}")
    print(f"{'='*50}")
    
    answer, context_chunk, score = ask(
        query=query,
        return_answer_only=False,
        max_new_tokens=max_new_tokens,
        task="qa"
    )
    
    print(f"\nANSWER:\n{answer}\n")
    
    if show_context:
        print(f"{'-'*50}")
        print(f"CONTEXT (Score: {score:.4f}):")
        #print(textwrap.fill(str(context_chunk)[:500], width=80))
        print(f"{'-'*50}")
        # Add this to your test_rag_system() before generation:
    
    return answer

In [65]:
# Initialize searcher (do this once)

# Test queries
test_queries = [
    "what is the total of estimated future contingent acquisition obligations payable in cash in 2009?",
]

for query in test_queries:
    test_rag_system(query)
    print("\n" + "="*100 + "\n")


QUERY: what is the total of estimated future contingent acquisition obligations payable in cash in 2009?

ANSWER:
$ 5.5  |  |  |  |  |  |  | 5.5   
 **Explanation:**  The total of estimated future contingent acquisition obligations payable in cash in 2009 is $5.5 million.

--------------------------------------------------
CONTEXT (Score: 0.7406):
--------------------------------------------------


