In [1]:
# ====================
# 1. Setup & Imports
# ====================
import pandas as pd
import numpy as np
import spacy
import re
import torch
from tqdm.auto import tqdm
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("sentencizer")
device = "cuda" if torch.cuda.is_available() else "cpu"
embedding_model = SentenceTransformer("all-mpnet-base-v2", device=device)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ====================
# 2. Load & Preprocess Data
# ====================
def load_financial():
    df = pd.read_csv("finaicial.csv")
    return df
df = load_financial()

In [3]:
df.head()

Unnamed: 0,_id,text
0,dd4bff516,"containerboard , kraft papers and saturating k..."
1,dd4c55cc2,"entergy mississippi , inc .\nmanagement's fina..."
2,dd4c5a718,"we have a five year $ 1350 million revolving ,..."
3,dd4be0184,the agreements that govern the indebtedness in...
4,dd4b93b5e,"during 2005 , we amended our $ 1.0 billion uns..."


In [4]:
#df.to_csv("finaicial.csv", index=False)

In [5]:
# ====================
# 3. Table Extraction
# ====================
def extract_table(text):
    """Extract markdown-style tables from text into DataFrames."""
    table_lines = [line for line in text.split("\n") if "|" in line]
    if not table_lines:
        return None
    
    # Split rows and pad with empty strings for consistent columns
    table_data = [re.split(r"\s*\|\s*", line.strip()) for line in table_lines]
    max_cols = max(len(row) for row in table_data)
    table_data = [row + [""] * (max_cols - len(row)) for row in table_data]
    
    return pd.DataFrame(table_data[1:], columns=table_data[0])

df["tables"] = df["text"].apply(extract_table)

In [6]:
df.head()

Unnamed: 0,_id,text,tables
0,dd4bff516,"containerboard , kraft papers and saturating k...","( in millions ) year ended september 30 , ..."
1,dd4c55cc2,"entergy mississippi , inc .\nmanagement's fina...",( in millions ) 0 ------...
2,dd4c5a718,"we have a five year $ 1350 million revolving ,...",contractual oblig...
3,dd4be0184,the agreements that govern the indebtedness in...,sites corporate bd life sciences b...
4,dd4b93b5e,"during 2005 , we amended our $ 1.0 billion uns...",2006 $ 600883 0 ---------- -------...


In [7]:
# ====================
# 4. Sentence Chunking
# ====================
def split_into_chunks(text, chunk_size=10):
    """Split text into chunks of `chunk_size` sentences."""
    doc = nlp(text)
    sentences = [str(sent) for sent in doc.sents]
    return [sentences[i:i + chunk_size] for i in range(0, len(sentences), chunk_size)]

# Process text and tables
df["sentence_chunks"] = df["text"].apply(split_into_chunks)
df["tables_chunks"] = df["tables"].apply(
    lambda x: split_into_chunks(x.to_string()) if isinstance(x, pd.DataFrame) else []
)

In [8]:
df.head()

Unnamed: 0,_id,text,tables,sentence_chunks,tables_chunks
0,dd4bff516,"containerboard , kraft papers and saturating k...","( in millions ) year ended september 30 , ...","[[containerboard , kraft papers and saturating...",[[ ( in millions ) year ended september 30 ...
1,dd4c55cc2,"entergy mississippi , inc .\nmanagement's fina...",( in millions ) 0 ------...,"[[entergy mississippi , inc .\n, management's ...",[[ ( in millions )\n0 ---...
2,dd4c5a718,"we have a five year $ 1350 million revolving ,...",contractual oblig...,[[we have a five year $ 1350 million revolving...,[[ ...
3,dd4be0184,the agreements that govern the indebtedness in...,sites corporate bd life sciences b...,[[the agreements that govern the indebtedness ...,[[ sites corporate bd life sciences ...
4,dd4b93b5e,"during 2005 , we amended our $ 1.0 billion uns...",2006 $ 600883 0 ---------- -------...,"[[during 2005 , we amended our $ 1.0 billion u...",[[ 2006 $ 600883\n0 ---------- ----...


In [9]:
def counts(sents):
    return len(sents)
df["sentence_token_counts"] = df["sentence_chunks"].apply(counts)
df["table_token_counts"] = df["tables"].apply(counts)

In [10]:
df.head()

Unnamed: 0,_id,text,tables,sentence_chunks,tables_chunks,sentence_token_counts,table_token_counts
0,dd4bff516,"containerboard , kraft papers and saturating k...","( in millions ) year ended september 30 , ...","[[containerboard , kraft papers and saturating...",[[ ( in millions ) year ended september 30 ...,4,3
1,dd4c55cc2,"entergy mississippi , inc .\nmanagement's fina...",( in millions ) 0 ------...,"[[entergy mississippi , inc .\n, management's ...",[[ ( in millions )\n0 ---...,2,5
2,dd4c5a718,"we have a five year $ 1350 million revolving ,...",contractual oblig...,[[we have a five year $ 1350 million revolving...,[[ ...,5,9
3,dd4be0184,the agreements that govern the indebtedness in...,sites corporate bd life sciences b...,[[the agreements that govern the indebtedness ...,[[ sites corporate bd life sciences ...,3,5
4,dd4b93b5e,"during 2005 , we amended our $ 1.0 billion uns...",2006 $ 600883 0 ---------- -------...,"[[during 2005 , we amended our $ 1.0 billion u...",[[ 2006 $ 600883\n0 ---------- ----...,4,5


In [11]:
# ====================
# 5. Embedding Generation
# ====================
def generate_chunk_embeddings(chunks):
    """Generate embeddings for each chunk individually."""
    if not chunks or len(chunks) == 0:
        return []
    # Flatten chunks if they're nested lists (some chunks may contain multiple sentences)
    flat_chunks = [" ".join(chunk) if isinstance(chunk, list) else chunk for chunk in chunks]
    return embedding_model.encode(flat_chunks, convert_to_numpy=True)

# Generate embeddings correctly
df["text_embeddings"] = df["sentence_chunks"].apply(
    lambda chunks: generate_chunk_embeddings(chunks)

)
df["tables_embeddings"] = df["tables_chunks"].apply(
    lambda tables: generate_chunk_embeddings(tables)

)
# Verify shapes
print(f"Sample embeddings shape: {df['text_embeddings'].iloc[0][0].shape}")
print(f"Sample embeddings shape: {df['tables_embeddings'].iloc[0][0].shape}")

Sample embeddings shape: (768,)
Sample embeddings shape: (768,)


In [12]:
# Flatten chunks and embeddings
all_text_chunks = [chunk for doc_chunks in df["sentence_chunks"] for chunk in doc_chunks]
all_text_embeddings = [emb for doc_embs in df["text_embeddings"] for emb in doc_embs]
all_tables_embeddings = [tab for doc_tab in df["tables_embeddings"] for tab in doc_tab]
# # Check consistency
# assert len(all_text_chunks) == len(all_text_embeddings), "Mismatch between chunks and embeddings!"
# print(f"Total chunks: {len(all_text_chunks)}, Total embeddings: {len(all_text_embeddings)}")

In [None]:
# class VectorSearch:
#     def __init__(self, embeddings, texts):
#         # Stack embeddings into (N, 768) tensor
#         self.embeddings = torch.tensor(np.stack(embeddings), dtype=torch.float32).to(device)
#         self.texts = texts
    
#     def search(self, query, top_k=1):
#         query_embedding = embedding_model.encode(query, convert_to_tensor=True)
#         cos_scores = util.cos_sim(query_embedding, self.embeddings)[0]
#         top_indices = torch.topk(cos_scores, k=top_k).indices.cpu().numpy()
#         return [(self.texts[i], cos_scores[i].item()) for i in top_indices]

In [13]:
# ====================
# 6. Vector Search 
# ====================
class VectorSearch:
    def __init__(self, embeddings, texts):
        # Stack embeddings into (N, 768) tensor
        self.embeddings = torch.tensor(np.stack(embeddings), dtype=torch.float32).to(device)
        self.texts = texts
    
    def search(self, query, top_k=1):
        
        """Search for top_k most similar chunks (now defaults to top 1)"""
        query_embedding = embedding_model.encode(query, convert_to_tensor=True)
        cos_scores = util.cos_sim(query_embedding, self.embeddings)[0]
        top_indices = torch.topk(cos_scores, k=top_k).indices.cpu().numpy()
        return [(self.texts[i], cos_scores[i].item()) for i in top_indices] 

In [14]:
# Initialize search
text_searcher = VectorSearch(all_text_embeddings, all_text_chunks)

# Test query
query = "what was the increase in the operating profit for space systems from 2011 to 2012?"
results = text_searcher.search(query)

print(f"Results for '{query}':")
for i, (chunk, score) in enumerate(results):
    print(f"\nRank {i+1} (Score: {score:.4f}):")
    print(chunk if isinstance(chunk, str) else " ".join(chunk))  # Handle both str and list chunks

Results for 'what was the increase in the operating profit for space systems from 2011 to 2012?':

Rank 1 (Score: 0.7500):
| 13.0% ( 13.0 % )
backlog at year-end | $ 18900          | $ 20500          | $ 18100         

2014 compared to 2013 space systems 2019 net sales for 2014 increased $ 107 million , or 1% ( 1 % ) , compared to 2013 .
 the increase was primarily attributable to higher net sales of approximately $ 340 million for the orion program due to increased volume ( primarily the first unmanned test flight of the orion mpcv ) ; and about $ 145 million for commercial space transportation programs due to launch-related activities .
 the increases were offset by lower net sales of approximately $ 335 million for government satellite programs due to decreased volume ( primarily aehf , gps-iii and muos ) ; and about $ 45 million for various other programs due to decreased volume .
space systems 2019 operating profit for 2014 was comparable to 2013 .
 operating profit decreased by 

In [15]:
# ====================
# 7. Example Query
# ====================
query = "What was the net sales in 2019?"
results = text_searcher.search(query)

print(f"Top results for '{query}':")
for i, (chunk, score) in enumerate(results):
    print(f"\nRank {i + 1} (Score: {score:.4f}):")
    print(chunk)

Top results for 'What was the net sales in 2019?':

Rank 1 (Score: 0.6978):
['| --------------\nnet sales           | $ 7153           | $ 7579         | $ 7132        \noperating profit    | 905              | 737            | 645           \noperating margins   | 12.7% ( 12.7 % )', '| 9.7% ( 9.7 % )', '| 9.0% ( 9.0 % )\nbacklog at year-end | 10800            | 10700          | 10500         \n\n2013 compared to 2012 mst 2019s net sales for 2013 decreased $ 426 million , or 6% ( 6 % ) , compared to 2012 .\n', 'the decrease was primarily attributable to lower net sales of approximately $ 275 million for various ship and aviation systems programs due to lower volume']


In [None]:
# ====================
# 8. Save/Load System
# ====================
# Save embeddings and metadata
pd.DataFrame({
    "text": all_text_chunks,
    "embedding": [emb.tolist() for emb in all_text_embeddings]
}).to_parquet("financial_embeddings.parquet")

# Load for later use
# loaded_df = pd.read_parquet("financial_embeddings.parquet")
# loaded_embeddings = torch.tensor(np.stack(loaded_df["embedding"].values)).to(device)

In [16]:
#=====================
#10. llm loading
#=====================
gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = round(gpu_memory_bytes / (2**30))
print(f"Available GPU memory: {gpu_memory_gb} GB")
if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
    use_quantization_config = True 
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False 
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb > 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
    use_quantization_config = False 
    model_id = "google/gemma-7b-it"

print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")

Available GPU memory: 12 GB
GPU memory: 12 | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.
use_quantization_config set to: False
model_id set to: google/gemma-2b-it


In [17]:
#=====================
#10. llm setup
#=====================
# setup/libraries
from transformers import BitsAndBytesConfig
from transformers.utils import is_flash_attn_2_available
from sentence_transformers import SentenceTransformer, util
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)

In [18]:
#=====================
#11. llm loading
#=====================
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
  attn_implementation = "flash_attention_2"
else:
  attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")
print(f"[INFO] Using model_id: {model_id}")
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)
Gamma_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id, 
                                                 torch_dtype=torch.float16,
                                                 quantization_config=quantization_config if use_quantization_config else None,
                                                 low_cpu_mem_usage=False,
                                                 attn_implementation=attn_implementation) 
if not use_quantization_config:
    Gamma_model.to("cuda")

[INFO] Using attention implementation: flash_attention_2
[INFO] Using model_id: google/gemma-2b-it


You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [03:00<00:00, 90.30s/it]


In [19]:
# ====================
# 11. Unified RAG Query Function
# ====================
def ask(query, 
        temperature=1,
        max_new_tokens=512,
        format_answer_text=True, 
        return_answer_only=True,
        task="qa"):
    
    # --- RETRIEVAL ---
# In ask() function, change:
    top_chunk, score = text_searcher.search(query, top_k=1)[0]  # Get first result    
    # --- PROMPT FORMATTING ---
    prompt = f"""Answer the question based on the context below. Be concise.
    
    Question: {query}
    Context: {top_chunk if isinstance(top_chunk, str) else ' '.join(top_chunk)}
    Answer:"""
    
    # --- GENERATION ---
    inputs = tokenizer(prompt, return_tensors="pt").to(Gamma_model.device)
    outputs = Gamma_model.generate(
        **inputs,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    
    # --- RESPONSE CLEANING ---
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer = full_response.replace(prompt, "").strip() if format_answer_text else full_response
    
    # --- RETURN ---
    return answer if return_answer_only else (answer, top_chunk, score)

In [20]:
# ====================
# Testing Function 
# ====================
def test_rag_system(query: str, 
                   show_context: bool = True,
                   max_new_tokens: int = 256) -> str:

    print(f"\n{'='*50}")
    print(f"QUERY: {query}")
    print(f"{'='*50}")
    
    answer, context_chunk, score = ask(
        query=query,
        return_answer_only=False,
        max_new_tokens=max_new_tokens,
        task="qa"
    )
    
    print(f"\nANSWER:\n{answer}\n")
    
    if show_context:
        print(f"{'-'*50}")
        print(f"CONTEXT (Score: {score:.4f}):")
        #print(textwrap.fill(str(context_chunk)[:500], width=80))
        print(f"{'-'*50}")
        # Add this to your test_rag_system() before generation:
    
    return answer

In [21]:
# Initialize searcher (do this once)
text_searcher = VectorSearch(all_text_embeddings, all_text_chunks)

# Test queries
test_queries = [
    "what is the net change in estimated future net amortization expense of present value of future profits from 2013 to 2014?",
]

for query in test_queries:
    test_rag_system(query)
    print("\n" + "="*100 + "\n")


QUERY: what is the net change in estimated future net amortization expense of present value of future profits from 2013 to 2014?

ANSWER:
$ 24

Therefore, the net change in estimated future net amortization expense of present value of future profits from 2013 to 2014 is $ 24.

--------------------------------------------------
CONTEXT (Score: 0.6848):
--------------------------------------------------


