# ---------------------------
# 1. Process PDF

In [24]:
import os
import pymupdf4llm
from langchain_text_splitters import RecursiveCharacterTextSplitter

Consider using the pymupdf_layout package for a greatly improved page layout analysis.


In [25]:
pdf_folder = "/Users/nguyentoan/Documents/AI_Financial_RAG/Data/SamSung_data"
pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith(".pdf")]

In [26]:
print("S·ªë file PDF:", len(pdf_files))
print(pdf_files)

S·ªë file PDF: 7
['2023_con_quarter04_all.pdf', '2025_3Q_Interim_Report.pdf', '2025_con_quarter03_all.pdf', '2024_con_quarter04_all.pdf', '2023_con_quarter04_note.pdf', '2025_con_quarter03_note.pdf', '2024_con_quarter04_note.pdf']


In [None]:
# Initialize empty lists (Important!)
all_chunks = []
all_ids = []
all_metadatas = []

# Initialize text splitter
splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=150,
    separators=["\n##", "\n#", "\n\n", "\n", " "]
)

for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_folder, pdf_file)
    print(f"ƒêang x·ª≠ l√Ω: {pdf_file}...")

    # 1. Convert PDF to Markdown
    md_text = pymupdf4llm.to_markdown(pdf_path)

    # 2. Split text into chunks
    chunks = splitter.split_text(md_text)

    # 3. Append chunks, metadata, and IDs to the main lists
    for i, chunk in enumerate(chunks):
        all_chunks.append(chunk)
        all_metadatas.append({"source": pdf_file, "chunk_id": i})
        all_ids.append(f"{pdf_file}_{i}")

print(f"--- Ho√†n th√†nh! T·ªïng c·ªông c√≥ {len(all_chunks)} chunks t·ª´ {len(pdf_files)} file. ---")

ƒêang x·ª≠ l√Ω: 2023_con_quarter04_all.pdf...
ƒêang x·ª≠ l√Ω: 2025_3Q_Interim_Report.pdf...
ƒêang x·ª≠ l√Ω: 2025_con_quarter03_all.pdf...
ƒêang x·ª≠ l√Ω: 2024_con_quarter04_all.pdf...
ƒêang x·ª≠ l√Ω: 2023_con_quarter04_note.pdf...
ƒêang x·ª≠ l√Ω: 2025_con_quarter03_note.pdf...
ƒêang x·ª≠ l√Ω: 2024_con_quarter04_note.pdf...
--- Ho√†n th√†nh! T·ªïng c·ªông c√≥ 2681 chunks t·ª´ 7 file. ---


# 2. Initialize ChromaDB
# ---------------------------

In [28]:
import chromadb
from chromadb.config import Settings
# Fix: Use chromadb.PersistentClient instead of the deprecated Settings configuration
client = chromadb.PersistentClient(path="/Users/nguyentoan/Documents/" \
                                            "AI_Financial_RAG/Data/chroma_samsung_db")

collection_name = "samsung_financials"
if collection_name in [c.name for c in client.list_collections()]:
    collection = client.get_or_create_collection(collection_name)
else:
    collection = client.create_collection(collection_name)

In [5]:
# import sys

# # 1. Force install the specific compatible versions into the current environment
# !{sys.executable} -m pip install "huggingface-hub<1.0.0" "sentence-transformers==3.0.1" "langchain-huggingface" --force-reinstall

# # 2. Restart the kernel (you must do this manually after the cell finishes!)

In [29]:
from langchain_huggingface import HuggingFaceEmbeddings
embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [None]:
#If the collection already exists ‚Üí delete it to start over.
if collection_name in [c.name for c in client.list_collections()]:
    client.delete_collection(collection_name)

# Add Files 

In [None]:
from langchain_chroma import Chroma
#  Chroma wrapper
vectorstore = Chroma(
    client=client,                     
    collection_name= collection_name,
    embedding_function=embed_model
)
# Sync 3 list: chunks, metadatas, ids
if len(all_metadatas) != len(all_chunks):
    all_metadatas = [{"source": "PDF_file"} for _ in all_chunks]

# Generate unique IDs for each chunk (if not already generated)
all_ids = [f"chunk_{i}" for i in range(len(all_chunks))]

#   Add texts to ChromaDB with metadata and IDs
vectorstore.add_texts(
    texts=all_chunks,
    metadatas=all_metadatas,
    ids=all_ids
)

print(f"ƒê√£ th√™m {len(all_chunks)} chunk PDF v√†o ChromaDB (qua LangChain wrapper)")

ƒê√£ th√™m 2681 chunk PDF v√†o ChromaDB (qua LangChain wrapper)


In [12]:
# import sys
# !{sys.executable} -m pip install langchain-chroma

# Add File Json 

In [127]:
import json

with open("/Users/nguyentoan/Documents/AI_Financial_RAG/Data/samsung_special_events.json", "r") as f:
    events = json.load(f)

event_docs = []
event_metadatas = []
event_ids = []

for i, e in enumerate(events):
    event_date = e['Date'].split("T")[0]

    text = (
        f"Date: {event_date}. "
        f"Open: {e['Open']:.2f}, High: {e['High']:.2f}, "
        f"Low: {e['Low']:.2f}, Close: {e['Close']:.2f}, "
        f"Volume: {e['Volume']}."
    )

    event_docs.append(text)
    event_metadatas.append({
        "source": "samsung_special_events.json",
        "type": "event",
        "date": event_date
    })
    event_ids.append(f"event_{i}")

# üî• ADD QUA LANGCHAIN WRAPPER
vectorstore.add_texts(
    texts=event_docs,
    metadatas=event_metadatas,
    ids=event_ids
)

print(f" Added {len(event_docs)} events into ChromaDB (chunk collection)")


 Added 60 events into ChromaDB (chunk collection)


In [126]:
import json

with open("/Users/nguyentoan/Documents/AI_Financial_RAG/Data/hist_data_cleaned.json", "r") as f:
    hist_data = json.load(f)

stock_docs = []
stock_metadatas = []
stock_ids = []

for i, d in enumerate(hist_data):
    text = (
        f"Ng√†y {d['Date']}: "
        f"Gi√° m·ªü c·ª≠a {d['Open']}, "
        f"Gi√° cao nh·∫•t {d['High']}, "
        f"Gi√° th·∫•p nh·∫•t {d['Low']}, "
        f"Gi√° ƒë√≥ng c·ª≠a {d['Close']}, "
        f"Kh·ªëi l∆∞·ª£ng giao d·ªãch {d['Volume']}"
    )

    stock_docs.append(text)
    stock_metadatas.append({
        "source": "hist_data_cleaned.json",
        "type": "daily_price",
        "date": d["Date"]
    })
    stock_ids.append(f"stock_{i}")

print(f"Total daily records: {len(stock_docs)}")



Total daily records: 6530


In [103]:
BATCH_SIZE = 5000  # < 5461

for i in range(0, len(stock_docs), BATCH_SIZE):
    batch_texts = stock_docs[i:i + BATCH_SIZE]
    batch_metadatas = stock_metadatas[i:i + BATCH_SIZE]
    batch_ids = stock_ids[i:i + BATCH_SIZE]

    vectorstore.add_texts(
        texts=batch_texts,
        metadatas=batch_metadatas,
        ids=batch_ids
    )

    print(f"‚úÖ Added batch {i//BATCH_SIZE + 1}: {len(batch_texts)} documents")


‚úÖ Added batch 1: 5000 documents
‚úÖ Added batch 2: 1530 documents


In [None]:
# document in collection
collection = client.get_or_create_collection(collection_name)
num_docs = collection.count()
print(f"T·ªïng s·ªë document trong collection '{collection_name}': {num_docs}")

T·ªïng s·ªë document trong collection 'samsung_financials': 9271


In [None]:
# 2. Get the vector collection from vectorstore
# Note: _collection is an internal attribute and may change by version
collection = vectorstore._collection

# 3. Get 1 embedding from database
# include=["embeddings"] to only get the embeddings part, limit=1 to only get 1 vector
sample_data = collection.get(limit=1, include=["embeddings"])
sample_embedding = sample_data["embeddings"][0]

# 4. Check the dimensionality of the embedding
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

The vectors have 384 dimensions


In [None]:
import os
from dotenv import load_dotenv
from langchain_groq import ChatGroq

# 1. T·∫£i c√°c bi·∫øn t·ª´ file .env v√†o m√¥i tr∆∞·ªùng
load_dotenv()

# 2. L·∫•y key t·ª´ m√¥i tr∆∞·ªùng (kh√¥ng bao gi·ªù vi·∫øt tr·ª±c ti·∫øp v√†o code)
# Bi·∫øn groq_api_key ph·∫£i tr√πng v·ªõi t√™n b·∫°n ƒë·∫∑t trong file .env
api_key = os.getenv("groq_api_key")

# 3. Kh·ªüi t·∫°o LLM
llm = ChatGroq(
    temperature=0, 
    model_name="llama-3.3-70b-versatile", 
    groq_api_key=api_key
)

In [None]:
#  retriever from vectorstore (Chroma)
retriever = vectorstore.as_retriever(search_kwargs={"k": 6})

In [96]:
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Function to format documents (Ensure docs is a list of Document objects)
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Prompt
template = """You are a professional financial analysis assistant.
Based on the provided Context, answer the question in detail.

Context:
{context}

Question: {question}

Requirements:
- If the information exists in the context, list it completely.
- Answer in professional English.
- If the information is not available, say 'I could not find specific data for this request.'

Answer:"""

prompt = ChatPromptTemplate.from_template(template)

# RAG Chain using LCEL (Flexible structure)
rag_chain = (
    {
        # Extract question from input -> pass to retriever -> format retrieved documents
        "context": (lambda x: x["question"]) | retriever | format_docs_runnable,
        "question": lambda x: x["question"]
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [98]:
# 7. Test run
query = "What core products is Samsung currently focusing on developing? And provide specific examples, including which file the information is found in."

try:
    response = rag_chain.invoke({"question": query})
    print("--- RESULT ---")
    print(response)

except Exception as e:
    print(f"Error while running the chain: {e}")

--- RESULT ---
Based on the provided context, Samsung Electronics Co., Ltd. and its subsidiaries appear to be focusing on developing a wide range of products, including electronic devices, network solutions, software, and TVs. The specific examples of products and services can be found in the "NOTES TO THE CONSOLIDATED FINANCIAL STATEMENTS" section, which lists the various subsidiaries and their respective areas of focus.

Some specific examples of products and services that Samsung is currently developing include:

1. Electronic devices, as mentioned in the context of Samsung Electronics Rus Company LLC (SERC), Samsung Electronics Ukraine Company LLC (SEUC), and Samsung Electronics Central Eurasia LLP (SECE).
2. Network solutions, as mentioned in the context of SAMSUNG Zhilabs, S.L.
3. Software, as mentioned in the context of Sonio SAS.
4. TVs, as mentioned in the context of Samsung Electronics Rus Kaluga LLC (SERK).
5. R&D activities, as mentioned in the context of Samsung Nanoradio 

In [83]:
import time

def test_query_performance():
    """Test query with timing"""
    query = "What core products is Samsung currently focusing on developing?"
    
    start_time = time.time()
    
    # N·∫øu chain d√πng StrOutputParser() cu·ªëi c√πng, k·∫øt qu·∫£ l√† string
    result = rag_chain.invoke({"question": query})
    
    end_time = time.time()
    
    print(f"Query processed in {end_time - start_time:.2f} seconds")
    print("Answer:", result)  # result l√† string
    
    # N·∫øu mu·ªën d√πng source_documents, c·∫ßn chain tr·∫£ v·ªÅ dict
    if hasattr(rag_chain, "source_documents") and rag_chain.source_documents:
        print(f"Used {len(rag_chain.source_documents)} source documents")


In [111]:

test_query_performance()

Query processed in 5.11 seconds
Answer: Based on the provided context, Samsung Electronics is currently focusing on developing a wide range of electronic devices, including:

1. Smartphones: As the smartphone market shows high saturation, Samsung is emphasizing the importance of competitiveness in the overall experience based on software for applications, UX, games, media, digital wallets, AI, security, etc.
2. Home appliances: Suzhou Samsung Electronics Co., Ltd. (SSEC) and Samsung Suzhou Electronics Export Co., Ltd. (SSEC-E) are involved in the manufacture of home appliances, with a focus on products such as air conditioning units, as seen in Samsung Electronics Air Conditioner Europe B.V. (SEACE).
3. Communication equipment: Tianjin Samsung Telecom Technology Co., Ltd. (TSTC) is engaged in the manufacture of communication equipment, highlighting Samsung's commitment to developing innovative communication solutions.
4. TVs: Samsung Electronics Rus Kaluga LLC (SERK) is involved in the

# Ranker

In [None]:
query = "What core products is Samsung currently focusing on developing?"
docs = retriever.invoke(query)
from sentence_transformers import CrossEncoder

# Load reranker model
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
pairs = [(query, doc.page_content) for doc in docs]
scores = reranker.predict(pairs)
doc_scores = list(zip(docs, scores))

reranked_docs = sorted(doc_scores, key=lambda x: x[1], reverse=True)

print("\n===== AFTER RERANK =====")
for i, (doc, score) in enumerate(reranked_docs[:5]):
    print(f"\nRank {i+1} | Score: {score:.4f}")
    print("Source:", doc.metadata.get("source"))
    print("Preview:", doc.page_content[:200])


===== AFTER RERANK =====

Rank 1 | Score: -3.1682
Source: 2025_3Q_Interim_Report.pdf
Preview: Samsung Nanoradio Design Center (SNDC) R&D 100.0


Samsung Denmark Research Center ApS (SDRC) R&D 100.0


Samsung Cambridge Solution Centre Limited (SCSC) R&D 100.0


SAMSUNG Zhilabs, S.L. Development

Rank 2 | Score: -3.4110
Source: 2025_con_quarter03_all.pdf
Preview: Samsung Electronics Czech and Slovak s.r.o. (SECZ) Sale of electronic devices 100.0

Samsung Electronics Baltics SIA (SEB) Sale of electronic devices 100.0

Samsung Electronics Greece S.M.S.A (SEGR) S

Rank 3 | Score: -3.4110
Source: 2025_con_quarter03_note.pdf
Preview: Samsung Electronics Czech and Slovak s.r.o. (SECZ) Sale of electronic devices 100.0

Samsung Electronics Baltics SIA (SEB) Sale of electronic devices 100.0

Samsung Electronics Greece S.M.S.A (SEGR) S

Rank 4 | Score: -3.8782
Source: 2025_3Q_Interim_Report.pdf
Preview: The smartphone industry has grown significantly since 2007. In 2025, the smartphone portion of

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_groq import ChatGroq

# take top 3 after rerank
top_docs = [doc for doc, score in reranked_docs[:3]]

context = "\n\n".join([doc.page_content for doc in top_docs])

template = """You are a professional financial analysis assistant.

Context:
{context}

Question: {question}

Answer:"""

prompt = ChatPromptTemplate.from_template(template)

llm = ChatGroq(
    temperature=0,
    model_name="llama-3.3-70b-versatile",
    groq_api_key=api_key
)

chain = prompt | llm | StrOutputParser()

answer = chain.invoke({
    "context": context,
    "question": query
})

print("\n===== FINAL ANSWER =====")
print(answer)


===== FINAL ANSWER =====
Based on the provided list of Samsung's subsidiaries and their activities, it appears that Samsung is currently focusing on developing a wide range of products, including:

1. Electronic devices: With multiple subsidiaries such as Samsung Electronics Rus Company LLC, Samsung Electronics Ukraine Company LLC, Samsung Electronics Central Eurasia LLP, Samsung Electronics Czech and Slovak s.r.o., Samsung Electronics Baltics SIA, and Samsung Electronics Greece S.M.S.A, all involved in the sale of electronic devices, it's clear that Samsung is heavily invested in this area.

2. R&D: Many subsidiaries, including Samsung Nanoradio Design Center, Samsung Denmark Research Center ApS, Samsung Cambridge Solution Centre Limited, FOODIENT LTD, Oxford Semantic Technologies Limited, and Samsung R&D Institute Ukraine, are focused on research and development, indicating a strong emphasis on innovation and technological advancement.

3. Network solutions: SAMSUNG Zhilabs, S.L. is

# RAGAS Evaluation 

In [None]:

# RAGAS Evaluation 

import nest_asyncio
import pandas as pd
from datasets import Dataset

from ragas import evaluate
from ragas.metrics import Faithfulness, AnswerRelevancy, ContextPrecision
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

# Fix async loop issue in Jupyter
nest_asyncio.apply()


def build_ragas_wrappers(llm, embedding_model):
    """
    Wrap LangChain models so RAGAS can use them.
    """
    ragas_llm = LangchainLLMWrapper(llm)
    ragas_emb = LangchainEmbeddingsWrapper(embedding_model)
    return ragas_llm, ragas_emb

# Build Metrics

def build_metrics(ragas_llm, ragas_emb):
    """
    Initialize evaluation metrics (new class-based API).
    """
    faithfulness = Faithfulness(llm=ragas_llm)
    answer_relevancy = AnswerRelevancy(
        llm=ragas_llm,
        embeddings=ragas_emb
    )
    context_precision = ContextPrecision(llm=ragas_llm)

    return [faithfulness, answer_relevancy, context_precision]

# Create Evaluation Dataset

def build_dataset(question, contexts, answer, reference):
    """
    contexts MUST be list of list.
    """
    data = {
        "question": [question],
        "contexts": [contexts],  # list of list
        "answer": [answer],
        "reference": [reference],
    }
    return Dataset.from_dict(data)
# Run Evaluation

def run_evaluation(dataset, metrics):
    """
    Run RAGAS evaluation.
    """
    score = evaluate(
        dataset=dataset,
        metrics=metrics
    )
    return score.to_pandas()


# ---- Build wrappers
ragas_llm, ragas_emb = build_ragas_wrappers(llm, embed_model)

# ---- Build metrics
metrics = build_metrics(ragas_llm, ragas_emb)

# ---- Sample test case
question = "Total Revenue for the nine-month period ended September 30, 2024 are ?"

contexts = [
    "For the nine-month period ended September 30, 2024, total revenue was 225,082,634 million KRW."
]

answer = "The Total Revenue is 225,082,634 million KRW."
reference = "225,082,634 million KRW"

# ---- Build dataset
dataset = build_dataset(
    question=question,
    contexts=contexts,
    answer=answer,
    reference=reference
)

# ---- Evaluate
df_result = run_evaluation(dataset, metrics)

print("Available columns:", df_result.columns.tolist())

display_cols = [
    col for col in [
        "question",
        "faithfulness",
        "answer_relevancy",
        "context_precision"
    ]
    if col in df_result.columns
]

print(df_result[display_cols])


  from ragas.metrics import Faithfulness, AnswerRelevancy, ContextPrecision
  from ragas.metrics import Faithfulness, AnswerRelevancy, ContextPrecision
  from ragas.metrics import Faithfulness, AnswerRelevancy, ContextPrecision
  ragas_llm = LangchainLLMWrapper(llm)
  ragas_emb = LangchainEmbeddingsWrapper(embedding_model)
Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:05<00:00,  1.92s/it]


Available columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy', 'context_precision']
   faithfulness  answer_relevancy  context_precision
0           1.0          0.586604                1.0
