In [58]:
import os
import PyPDF2
import pandas as pd
import re
import openai
from pinecone import Pinecone, ServerlessSpec
import time
from dotenv import load_dotenv

# Load environment variables
print("Loading environment variables...")
load_dotenv()

Loading environment variables...


True

In [27]:
# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    print(f"Extracting text from PDF: {pdf_path}")
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text() or ''  # Handle potential None values
    print("Text extraction complete.")
    return text

In [28]:
# Function to preprocess P&L text into structured format
def preprocess_pnl_text(text):
    print("Preprocessing P&L text...")
    lines = text.split("\n")
    data = []
    pattern = re.compile(r"([\w\s]+)\s+(\d{1,3}(?:,\d{3})*\.\d{2})")  # Extract financial key-value pairs
    
    for line in lines:
        match = pattern.match(line.strip())
        if match:
            key, value = match.groups()
            data.append((key.strip(), float(value.replace(',', ''))))
    
    print("Preprocessing complete.")
    df = pd.DataFrame(data, columns=["Metric", "Value"])
    return df

In [29]:
# Load API keys securely from environment variables
print("Fetching API keys from environment variables...")
openai_api_key = os.getenv("OPENAI_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")

if not openai_api_key or not pinecone_api_key:
    raise ValueError("API keys not found. Please set them as environment variables.")
print("API keys loaded successfully.")

Fetching API keys from environment variables...
API keys loaded successfully.


In [30]:
# Set OpenAI API key
openai.api_key = openai_api_key

In [31]:
# Initialize Pinecone client
print("Initializing Pinecone client...")
pc = Pinecone(api_key=pinecone_api_key)
print("Pinecone client initialized.")

Initializing Pinecone client...
Pinecone client initialized.


In [59]:
# Load embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

In [60]:
# Create Pinecone index with correct configuration
index_name = "financial-pnl-index"

In [61]:
# Check if index exists, if not, create one
print("Checking for existing Pinecone index...")
existing_indexes = pc.list_indexes().names()
if index_name not in existing_indexes:
    print(f"Index '{index_name}' not found. Creating a new one...")
    pc.create_index(
        name=index_name,
        dimension=384,  # dimension to match OpenAI embedding model
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    print("Index created. Waiting for initialization...")
    time.sleep(60)  # Allow time for index creation
else:
    print(f"Index '{index_name}' already exists.")

Checking for existing Pinecone index...
Index 'financial-pnl-index' already exists.


In [62]:
# Connect to the Pinecone index
print(f"Connecting to Pinecone index '{index_name}'...")
index = pc.Index(index_name)
print("Connected to Pinecone index.")

Connecting to Pinecone index 'financial-pnl-index'...
Connected to Pinecone index.


In [63]:
def embed_and_store_pnl_data(pnl_df):
    """Generate embeddings and store financial data in Pinecone."""
    print("Embedding and storing P&L data in Pinecone...")
    to_upsert = []
    for _, row in pnl_df.iterrows():
        text = f"{row['Metric']}: {row['Value']}"
        embedding = embedding_model.encode(text).tolist()
        to_upsert.append((str(row['Metric']), embedding, {"text": text}))
    
    if to_upsert:
        index.upsert(vectors=to_upsert)
        print(f"Stored {len(to_upsert)} entries in Pinecone.")

In [64]:
def query_pnl(question):
    """Query P&L data using RAG model."""
    print(f"Processing query: {question}")
    query_embedding = embedding_model.encode(question).tolist()
    results = index.query(vector=query_embedding, top_k=3, include_metadata=True)
    print("Query embedding generated. Searching in Pinecone...")
    
    if results and results.get("matches"):
        context = "\n".join([match["metadata"]["text"] for match in results["matches"]])
    else:
        context = "No relevant data found."

    prompt = f"Context:\n{context}\n\nQuestion:\n{question}\nAnswer:"
    
    # Using new API call
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a financial assistant."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=100
    )
    
    return response.choices[0].message['content'].strip()

In [38]:
# Example usage
pdf_path = "Sample Financial Statement.pdf"
print("Starting PDF processing...")
text = extract_text_from_pdf(pdf_path)

print("Starting text preprocessing...")
pnl_df = preprocess_pnl_text(text)
print("Processed Data:\n", pnl_df)

Starting PDF processing...
Extracting text from PDF: Sample Financial Statement.pdf
Text extraction complete.
Starting text preprocessing...
Preprocessing P&L text...
Preprocessing complete.
Processed Data:
                                                Metric  Value
0                                   Lease liabilities   2.19
1                         Other financial liabilities   2.12
2                                   Lease liabilities   2.19
3                         Other financial liabilities   2.12
4                           Other current liabilities   2.13
5                                          Provisions   2.14
6                             Revenue from operations   2.16
7                           Employee benefit expenses   2.18
8                Cost of software packages and others   2.18
9                                      Other expenses   2.18
10                                        Current tax   2.15
11                                       Deferred tax   2.15

In [65]:
# Store data in Pinecone
print("Storing preprocessed data in Pinecone...")
embed_and_store_pnl_data(pnl_df)

Storing preprocessed data in Pinecone...
Embedding and storing P&L data in Pinecone...
Stored 19 entries in Pinecone.


In [None]:
# Example financial query
query = "What is the gross profit for Q3 2024?"
print("Querying P&L data...")
answer = query_pnl(query)
print("Answer:", answer)