**# Install Librarys**

In [None]:
!pip install langchain chromadb pypdf ollama tiktoken sentence-transformers

Load & Process PDF

In [1]:
import os
import pypdf
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Define folder where PDFs are stored
pdf_folder = r"C:\Users\LENOVO\Downloads\Danone_Annual Results"

# Function to extract text from PDFs
def extract_text_from_pdfs(folder_path):
    all_text = []
    
    for filename in sorted(os.listdir(folder_path)):  # Ensure order (2021, 2022, 2023)
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, filename)
            print(f"Processing: {pdf_path}")
            
            pdf_reader = pypdf.PdfReader(pdf_path)
            text = "\n".join([page.extract_text() for page in pdf_reader.pages if page.extract_text()])
            all_text.append({"filename": filename, "text": text})
    
    return all_text

# Extract text from PDFs
documents = extract_text_from_pdfs(pdf_folder)

# Split text into chunks for embedding
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = []
for doc in documents:
    splits = text_splitter.split_text(doc["text"])
    for chunk in splits:
        chunks.append({"text": chunk, "source": doc["filename"]})

print(f"Total Chunks: {len(chunks)}")


Processing: C:\Users\LENOVO\Downloads\Danone_Annual Results\2021_Danone_test.pdf
Total Chunks: 1


 Embed & Store Data in ChromaDB

In [2]:
import chromadb
import shutil
from langchain_huggingface import HuggingFaceEmbeddings


#If the database is corrupted or locked, it can crash the kernel.
# Fix: Delete and reinitialize the database:
shutil.rmtree("./chroma_db", ignore_errors=True)  # Delete old DB


# Initialize ChromaDB client
chroma_client = chromadb.PersistentClient(path="./chroma_db")  # Persistent storage
collection = chroma_client.get_or_create_collection(name="danone_annual_results_3")

# Load Sentence Transformer model for embeddings
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Store chunks in ChromaDB
for idx, chunk in enumerate(chunks):
    collection.add(
        ids=[str(idx)],
        documents=[chunk["text"]],
        metadatas=[{"source": chunk["source"]}]
    )

print("✅ Documents stored in ChromaDB!")


Insert of existing embedding ID: 0
Add of existing embedding ID: 0


✅ Documents stored in ChromaDB!


Retrival From Chroma


In [3]:
def query_chroma(query_text, top_k=3):
    results = collection.query(
        query_texts=[query_text],
        n_results=top_k
    )
    
    retrieved_texts = [doc for doc in results['documents'][0]]
    return "\n".join(retrieved_texts)

query_text = "What is Danone's sales revenue for the last 2 years?"
context = query_chroma(query_text)
print("Retrieved Context:\n", context)


Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1


Retrieved Context:
 35 36
2020KPIs 2021
Net Sales(1)
LFL Sales Growth(1)
Free Cash-Flow(1)
Recurring Operating Margin(1)
Net debt/ EBITDA
ROIC
Dividend per share
E-commerce
€23.62 BN
-1.5%
€2.1 BN
14.00%
2.8x
8.5%
€1.94
+40 vs L Y (representing 
10% of total revenue)
+16% vs L Y (representing 
10% of total revenue)
€24.3 BN
+3.4%
€2.5 BN
13.74%
3x
8.7%
€1.94
OUR PERFORMANCE IN 2021


Use llama 3.2 to retrieve sales figure and put into PD dataframe for further integration in DB/PowerBI

In [5]:
import pandas as pd
import ollama

# ✅ Function to extract last 2 years' sales data automatically
def extract_sales_llm():
    # Query all stored documents
    results = collection.query(query_texts=["Company X annual report"], n_results=2)

    # Combine all retrieved text
    full_text = " ".join([doc[0] for doc in results["documents"]])

    # LLaMA 3.2 prompt to extract last 2 years' sales
    prompt = f"""
    From the following financial report text, identify the last two years of Net Sales data.
    Extract the years and corresponding sales figures in billions.

    Financial Report Text:
    {full_text}

    Return the data in this format:
    Year: 2020, Net Sales: 27.5
    Year: 2021, Net Sales: 25.8
    """


    # Query LLaMA 3.2
    response = ollama.chat(model="llama3.2", messages=[{"role": "user", "content": prompt}])
    extracted_text = response["message"]["content"].strip()

    # Parse extracted sales data
    sales_data = []
    for line in extracted_text.split("\n"):
        if "Year:" in line and "Net Sales:" in line:
            parts = line.split(", ")
            year = parts[0].split(": ")[1].strip()
            sales = float(parts[1].split(": ")[1].strip())
            sales_data.append({"Year": year, "Net Sales": sales})

    return sales_data

# ✅ Extract sales data dynamically
sales_results = extract_sales_llm()

# ✅ Convert to DataFrame
df_sales = pd.DataFrame(sales_results)

# ✅ Print DataFrame
print("Extracted Sales Data:\n", df_sales)

Number of requested results 2 is greater than number of elements in index 1, updating n_results = 1


Extracted Sales Data:
    Year  Net Sales
0  2020      23.62
1  2021      24.30
