In [26]:
# ====================
# 1. Setup & Imports
# ====================
# Standard library
import os
import re

# Third-party
import numpy as np
import pandas as pd
import pinecone
import spacy
import torch
from datasets import load_dataset
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer, util
from tqdm.auto import tqdm

# Load variables from a local .env file into the process environment.
# This call must come after `load_dotenv` is imported; calling it first
# raises NameError on a fresh kernel.
load_dotenv()

# spaCy pipeline used by the sentence-chunking step.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("sentencizer")

# Run the embedding model on the GPU when one is available, else on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
embedding_model = SentenceTransformer("all-mpnet-base-v2", device=device)

In [2]:
# ====================
# 2. Load & Preprocess Data
# ====================
def load_financial(path="finaicial.csv"):
    """Load the financial text dataset from a CSV file.

    Args:
        path: Location of the CSV file. Defaults to the file name that was
            previously hard-coded, so existing zero-argument calls behave
            the same.
            NOTE(review): "finaicial.csv" looks like a misspelling of
            "financial.csv" - confirm against the file on disk before renaming.

    Returns:
        pandas.DataFrame holding the contents of the CSV file.
    """
    return pd.read_csv(path)
# Load the dataset once; later cells add derived columns to this frame.
df = load_financial()

In [3]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,_id,text
0,dd4bff516,"containerboard , kraft papers and saturating k..."
1,dd4c55cc2,"entergy mississippi , inc .\nmanagement's fina..."
2,dd4c5a718,"we have a five year $ 1350 million revolving ,..."
3,dd4be0184,the agreements that govern the indebtedness in...
4,dd4b93b5e,"during 2005 , we amended our $ 1.0 billion uns..."


In [4]:
# ====================
# 4. Sentence Chunking
# ====================
def split_into_chunks(text, chunk_size=10):
    """Split text into chunks of `chunk_size` sentences.

    Args:
        text: Raw document text. Non-string values (for example the NaN that
            pandas produces for an empty CSV cell) and blank strings yield
            no chunks.
        chunk_size: Maximum number of sentences per chunk; must be >= 1.

    Returns:
        A list of chunks, each chunk being a list of sentence strings. The
        last chunk may hold fewer than `chunk_size` sentences.

    Raises:
        ValueError: If `chunk_size` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    # A missing value would make the pipeline call raise and abort the whole
    # DataFrame `.apply`, so treat it as a document without sentences.
    if not isinstance(text, str) or not text.strip():
        return []
    doc = nlp(text)
    sentences = [str(sent) for sent in doc.sents]
    return [sentences[i:i + chunk_size] for i in range(0, len(sentences), chunk_size)]

# Split every document's text into sentence chunks (one list of chunks per row)
df["sentence_chunks"] = df["text"].apply(split_into_chunks)

In [5]:
# =====================
# 5. Tokens Counts
# =====================
# The token limit for all-mpnet-base-v2 is 384
def counts(text):
    """Return the number of items in `text`, i.e. `len(text)`.

    No tokenization happens here: a list yields its element count and a
    string yields its character count.
    """
    n_items = len(text)
    return n_items
# NOTE: `sentence_chunks` holds a list of chunks per row, so despite its name
# this column stores the number of chunks per document, not a token count.
df["sentence_token_counts"] = df["sentence_chunks"].apply(counts)

In [6]:
# Preview the frame again, now including the derived columns.
df.head()

Unnamed: 0,_id,text,sentence_chunks,sentence_token_counts
0,dd4bff516,"containerboard , kraft papers and saturating k...","[[containerboard , kraft papers and saturating...",4
1,dd4c55cc2,"entergy mississippi , inc .\nmanagement's fina...","[[entergy mississippi , inc .\n, management's ...",2
2,dd4c5a718,"we have a five year $ 1350 million revolving ,...",[[we have a five year $ 1350 million revolving...,5
3,dd4be0184,the agreements that govern the indebtedness in...,[[the agreements that govern the indebtedness ...,3
4,dd4b93b5e,"during 2005 , we amended our $ 1.0 billion uns...","[[during 2005 , we amended our $ 1.0 billion u...",4


In [7]:
# Summary statistics for the numeric columns of the frame.
df.describe()

Unnamed: 0,sentence_token_counts
count,2066.0
mean,2.910939
std,1.007385
min,1.0
25%,2.0
50%,3.0
75%,3.0
max,10.0


In [8]:
# ====================
# 6. Embedding Generation
# ====================
def generate_embeddings(chunks):
    """Encode one document's chunks with the shared embedding model.

    A chunk that is a list of sentences is joined with single spaces into
    one string; any other chunk is passed through unchanged.

    Returns:
        Whatever `embedding_model.encode` returns for the resulting list of
        texts (it is called with `convert_to_numpy=True`).
    """
    texts = []
    for piece in chunks:
        if isinstance(piece, list):
            piece = " ".join(piece)
        texts.append(piece)
    return embedding_model.encode(texts, convert_to_numpy=True)

# Embed every row's chunks; each cell ends up holding that row's embeddings
df["text_embeddings"] = df["sentence_chunks"].apply(generate_embeddings)

# Sanity check: shape of the first chunk embedding of the first row
print(f"Sample embeddings shape: {df['text_embeddings'].iloc[0][0].shape}")

Sample embeddings shape: (768,)


In [9]:
# Collapse the per-document lists into two flat lists
all_text_chunks = []
all_text_embeddings = []
for doc_chunks, doc_embs in zip(df["sentence_chunks"], df["text_embeddings"]):
    all_text_chunks.extend(doc_chunks)
    all_text_embeddings.extend(doc_embs)

# The two flat lists must end up with the same number of entries
assert len(all_text_chunks) == len(all_text_embeddings), "Mismatch between chunks and embeddings!"
print(f"Total chunks: {len(all_text_chunks)}, Total embeddings: {len(all_text_embeddings)}")

Total chunks: 6014, Total embeddings: 6014


In [10]:
# ====================
# 7. Vector Search 
# ====================
class VectorSearch:
    """In-memory cosine-similarity search over pre-computed chunk embeddings."""

    def __init__(self, embeddings, texts):
        """Store the embeddings as one tensor next to their source texts.

        Args:
            embeddings: Non-empty sequence of equal-length embedding vectors.
            texts: Sequence of the same length; `texts[i]` is returned for
                a hit on `embeddings[i]`.

        Raises:
            ValueError: If `embeddings` is empty or its length differs from
                the length of `texts`.
        """
        if len(embeddings) == 0:
            raise ValueError("embeddings must not be empty")
        if len(embeddings) != len(texts):
            raise ValueError(
                f"Got {len(embeddings)} embeddings but {len(texts)} texts; they must match"
            )
        # Stack embeddings into one (N, dim) float32 tensor on the target device
        self.embeddings = torch.tensor(np.stack(embeddings), dtype=torch.float32).to(device)
        self.texts = texts

    def search(self, query, top_k=1):
        """Search for the `top_k` most similar chunks (defaults to top 1).

        Args:
            query: Text to embed and compare against the stored embeddings.
            top_k: Maximum number of hits to return. Values larger than the
                number of stored embeddings are clamped to that number.

        Returns:
            List of `(text, cosine_score)` tuples, best match first.

        Raises:
            ValueError: If `top_k` is negative.
        """
        if top_k < 0:
            raise ValueError(f"top_k must be >= 0, got {top_k}")
        # torch.topk raises when k exceeds the number of candidates, so clamp it.
        k = min(top_k, self.embeddings.shape[0])
        if k == 0:
            return []
        query_embedding = embedding_model.encode(query, convert_to_tensor=True)
        # Keep both operands of the similarity on the same device.
        query_embedding = query_embedding.to(self.embeddings.device)
        cos_scores = util.cos_sim(query_embedding, self.embeddings)[0]
        top = torch.topk(cos_scores, k=k)
        return [
            (self.texts[idx], score)
            for idx, score in zip(top.indices.tolist(), top.values.tolist())
        ]

In [11]:
# Initialize search: build the in-memory index from the flattened embeddings
# and their matching text chunks
text_searcher = VectorSearch(all_text_embeddings, all_text_chunks)

In [12]:
# ====================
# 8. Example Query
# ====================
# Ask one question; `search` is called with its default top_k of 1
query = "what was the increase in the operating profit for space systems from 2011 to 2012?"
results = text_searcher.search(query)

# Print each hit with its 1-based rank and cosine score, followed by the chunk
print(f"Top results for '{query}':")
for i, (chunk, score) in enumerate(results):
    print(f"\nRank {i + 1} (Score: {score:.4f}):")
    print(chunk)

Top results for 'what was the increase in the operating profit for space systems from 2011 to 2012?':

Rank 1 (Score: 0.7500):
['| 13.0% ( 13.0 % )\nbacklog at year-end | $ 18900          | $ 20500          | $ 18100         \n\n2014 compared to 2013 space systems 2019 net sales for 2014 increased $ 107 million , or 1% ( 1 % ) , compared to 2013 .\n', 'the increase was primarily attributable to higher net sales of approximately $ 340 million for the orion program due to increased volume ( primarily the first unmanned test flight of the orion mpcv ) ; and about $ 145 million for commercial space transportation programs due to launch-related activities .\n', 'the increases were offset by lower net sales of approximately $ 335 million for government satellite programs due to decreased volume ( primarily aehf , gps-iii and muos ) ; and about $ 45 million for various other programs due to decreased volume .\nspace systems 2019 operating profit for 2014 was comparable to 2013 .\n', 'operatin

In [27]:
# ========================
# Save in vector database
# ========================
# Read the API key from the environment (populated from .env by load_dotenv)
# instead of embedding it in the notebook.
# SECURITY: a literal key was previously hard-coded in this cell; it should be
# revoked and rotated, since it has been exposed in the notebook source.
vector_db = os.getenv("PINECONE_API_KEY")  # holds the API key; name kept for compatibility
if not vector_db:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to the environment or to the .env file."
    )
pc = pinecone.Pinecone(api_key=vector_db)
index_name = "datatonic-rags"

# Create the serverless index only when it does not exist yet, so this cell
# can be re-run safely.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,  # must equal the embedding size printed earlier: (768,)
        metric="cosine",
        spec=pinecone.ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

index = pc.Index(index_name)
print(f"Index '{index_name}' is ready!")

Index 'datatonic-rags' is ready!


In [16]:
#==================================
# 10. Push embeddings to Vector Database
#==================================
# 1. Build one (id, values, metadata) tuple per chunk
vectors_to_upsert = [
    (
        f"vec_{i}",  # Unique ID for each vector
        emb.tolist() if hasattr(emb, 'tolist') else emb,  # Ensure it's a list
        {"text": chunk}  # Store the text in metadata
    )
    for i, (chunk, emb) in enumerate(zip(all_text_chunks, all_text_embeddings))
]

# 2. Batch upsert (Pinecone recommends batches of 100-200)
batch_size = 100
failed_batches = []  # (start, end) ranges whose upsert raised

for i in tqdm(range(0, len(vectors_to_upsert), batch_size)):
    # Get batch of vectors
    i_end = min(i + batch_size, len(vectors_to_upsert))
    batch = vectors_to_upsert[i:i_end]

    # Upsert to Pinecone
    try:
        index.upsert(vectors=batch)
    except Exception as e:
        # Best effort: keep uploading the remaining batches, but remember the
        # failure so the final message does not claim success.
        print(f"Error upserting batch {i}-{i_end}: {e}")
        failed_batches.append((i, i_end))

if failed_batches:
    print(f"Upsert finished with {len(failed_batches)} failed batch(es): {failed_batches}")
else:
    print("Upsert complete!")

In [28]:
class PineconeRetriever:
    """Retrieve stored text chunks from a Pinecone index by vector similarity."""

    def __init__(self, index_name="datatonic-rags", embedding_model=None):
        """Connect to the index and keep the model used to embed queries.

        Args:
            index_name: Name of the Pinecone index to query.
            embedding_model: Object with an `encode(text)` method whose result
                offers `.tolist()`. Required, even though the signature keeps
                its historical `None` default.

        Raises:
            ValueError: If `embedding_model` is not provided.
            RuntimeError: If the PINECONE_API_KEY environment variable is unset.
        """
        # Fail here rather than with an AttributeError on the first query.
        if embedding_model is None:
            raise ValueError("embedding_model is required to encode queries")
        api_key = os.getenv("PINECONE_API_KEY")
        if not api_key:
            raise RuntimeError(
                "PINECONE_API_KEY is not set; add it to the environment or to the .env file."
            )
        self.pc = pinecone.Pinecone(api_key=api_key)
        self.index = self.pc.Index(index_name)
        self.embedding_model = embedding_model

    def query(self, query: str, top_k: int = 1):
        """Return `(text, score)` pairs for the `top_k` closest stored vectors.

        Args:
            query: Text to embed and look up.
            top_k: Number of matches to request from the index.

        Returns:
            List of `(text, score)` tuples in the order the index returns
            them. `text` is None for a match that has no "text" metadata.
        """
        query_embedding = self.embedding_model.encode(query).tolist()
        results = self.index.query(
            vector=query_embedding,
            top_k=top_k,
            include_metadata=True
        )
        # Tolerate matches stored without metadata instead of raising.
        chunks_with_scores = [
            ((match.metadata or {}).get("text"), match.score)
            for match in results.matches
        ]
        return chunks_with_scores

In [29]:
# Pass in the already-loaded embedding model, the same one used to embed the chunks
retrieval = PineconeRetriever(embedding_model = embedding_model)

In [31]:
# Query the Pinecone index and print the matches as (text, score) tuples
# NOTE(review): "Stcok" looks like a typo for "Stock" - confirm before fixing,
# since the query text is embedded exactly as written.
user_query = "What is Tesla Stcok in year 2023"
relevant_chunks = retrieval.query(user_query)
print(relevant_chunks)

[(['meet customer needs and put us in a position to handle demand changes .\n', 'we will also continue utilizing industrial engineering techniques to improve productivity .\n', '2022 fuel prices 2013 uncertainty about the economy makes fuel price projections difficult , and we could see volatile fuel prices during the year , as they are sensitive to global and u.s .\ndomestic demand , refining capacity , geopolitical issues and events , weather conditions and other factors .\nto reduce the impact of fuel price on earnings , we will continue to seek recovery from our customers through our fuel surcharge programs and to expand our fuel conservation efforts .\n', '2022 capital plan 2013 in 2010 , we plan to make total capital investments of approximately $ 2.5 billion , including expenditures for ptc , which may be revised if business conditions or new laws or regulations affect our ability to generate sufficient returns on these investments .\n', 'see further discussion in this item 7 un