In [3]:
#working directory
%pwd

'd:\\DS\\Chat_bot\\Finance_Chatbot\\research'

In [4]:
import os
os.chdir("../") #Go back one directory back

In [5]:
%pwd

'd:\\DS\\Chat_bot\\Finance_Chatbot'

In [26]:
# import libraries
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [12]:
# Extract text from pdf files
def load_pdf_files(data):
    loader = DirectoryLoader(
        data,
        glob="./*pdf",
        loader_cls=PyPDFLoader
    )
    documents = loader.load()
    return documents


In [13]:
extracted_data = load_pdf_files("data")

In [14]:
extracted_data

[Document(metadata={'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2025-01-30T06:01:44-05:00', 'title': '0001326801-25-000017', 'author': 'EDGAR® Online LLC, a subsidiary of OTC Markets Group', 'subject': 'Form 10-K filed on 2025-01-30 for the period ending 2024-12-31', 'keywords': '0001326801-25-000017; ; 10-K', 'moddate': '2025-01-30T06:01:56-05:00', 'source': 'data\\NASDAQ_META_2024.pdf', 'total_pages': 150, 'page': 0, 'page_label': '1'}, page_content='UNITED STATES\nSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\n__________________________\nFORM 10-K\n__________________________\n(Mark One)\n☒     ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the fiscal year ended December 31, 2024or\n☐     TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the transition period from            to            Commission File Number: 001-3555

In [15]:
len(extracted_data)

150

In [16]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Given a list of Document objects, return a new list of Document objects
    containing only 'source'in metadata and original content
    """
    minimal_docs: List[Document] = []
    for doc in docs:
        src = doc.metadata.get("source")
        minimal_docs.append(
            Document(
                page_content=doc.page_content,
                metadata={"source":src}
            )
        )
    return minimal_docs

In [17]:
filtered_docs = filter_to_minimal_docs(extracted_data)

In [31]:
#split the docs into smaller chuck
def text_split(filtered_docs):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = 1000,
        chunk_overlap = 200
    )
    texts = text_splitter.split_documents(filtered_docs)
    return texts



In [32]:
text_chunks = text_split(filtered_docs)
text_chunks

[Document(metadata={'source': 'data\\NASDAQ_META_2024.pdf'}, page_content="UNITED STATES\nSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\n__________________________\nFORM 10-K\n__________________________\n(Mark One)\n☒     ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the fiscal year ended December 31, 2024or\n☐     TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the transition period from            to            Commission File Number: 001-35551__________________________\nMeta Platforms, Inc.\n(Exact name of registrant as specified in its charter)__________________________\nDelaware 20-1665019\n(State or other jurisdiction of incorporation or organization) (I.R.S. Employer Identification Number)\n1 Meta Way, Menlo Park, California 94025\n(Address of principal executive offices and Zip Code)\n(650) 543-4800(Registrant's telephone number, including area code)\n_________________________

In [33]:
len(text_chunks)

741

In [34]:
#Embeddings
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """
    Donwload and return the HuggingFace embeddings model.
    """
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name
    )
    return embeddings


In [35]:
embeddings = download_embeddings()

  embeddings = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


In [41]:
vector = embeddings.embed_query("Hello")
print(f"Vector Length: {len(vector)}")

Vector Length: 384


In [90]:
from dotenv import load_dotenv
load_dotenv()

True

In [91]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [44]:
from pinecone import Pinecone
pinecone_api_key = PINECONE_API_KEY

pc = Pinecone(
    api_key=pinecone_api_key
)

In [45]:
pc

<pinecone.pinecone.Pinecone at 0x17d20866660>

In [52]:
from pinecone import ServerlessSpec

index_name = "finance-chatbot"

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384, #Equal to the dimension of the HuggingFace embedding model
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws", #Cosine similarity
            region="us-east-1"
        )
    )

index = pc.Index(index_name)

In [54]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents = text_chunks,
    embedding = embeddings,
    index_name = index_name
)

### Add more data into existing Pinecone index

In [55]:
new_doc_add = Document(
    page_content="This is a new docuement to add",
    metadata = {"source": "NewDoc"}
)

In [56]:
docsearch.add_documents(
    documents=[new_doc_add]
)

['e211c430-0892-4541-a6f9-2bcfc116f87a']

In [57]:
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k":3}
)

In [None]:
retrieved_docs = retriever.invoke("What is the cost of revenue on 2024")
retrieved_docs

[Document(id='c0cd235f-2dbf-47d5-b60c-20ad32f42c03', metadata={'source': 'data\\NASDAQ_META_2024.pdf'}, page_content='2024 2023 2022 2024 vs 2023 %change 2023 vs 2022 %change\n(in millions, except percentages)\nCost of revenue $ 30,161 $ 25,959 $ 25,249 16 % 3 %\nPercentage of revenue 18 % 19 % 22 %\nCost of revenue in 2024 increased $4.20 billion, or 16%, compared to 2023. The increase was primarily due to higher operational expenses related to our\ndata centers and technical infrastructure, mostly from higher depreciation expense.\nSee Note 7 — Property and Equipment in the notes to the consolidated financial statements included in Part II, Item 8, "Financial Statements and\nSupplementary Data" of this Annual Report on Form 10-K for additional information regarding depreciation expense.\nResearch and development\nYear Ended December 31,\n2024 2023 2022 2024 vs 2023 %change 2023 vs 2022 %change\n(in millions, except percentages)\nResearch and development $ 43,873 $ 38,483 $ 35,338 14 

In [None]:
retrieved_docs = retriever.invoke("What are the products of Meta")
retrieved_docs

[Document(id='c9b31116-3f1c-433b-a501-c5fe21a424ec', metadata={'source': 'data\\NASDAQ_META_2024.pdf'}, page_content='For our RL products, our sales and operations efforts utilize third-party sales channels such as retailers, resellers, and our direct-to-consumer channel,\nMeta.com. These efforts are focused on driving consumer and enterprise sales and adoption of our Meta Quest portfolio of products and Ray-Ban Meta AI\nglasses.\nMarketing\nHistorically, our communities have generally grown organically with people inviting their friends to connect with them, supported by internal efforts to\nstimulate awareness and interest. In addition, we have invested and will continue to invest in marketing our products and services to grow our brand and help\nbuild community around the world.\n8'),
 Document(id='8d0bbb29-3680-4b6e-bb4e-d02cdcd632ed', metadata={'source': 'data\\NASDAQ_META_2024.pdf'}, page_content='on the Nasdaq Global Select Market under the symbol "META." Our principal executive

In [60]:
retrieved_docs = retriever.invoke("List down top 3 products of Meta")
retrieved_docs

[Document(id='c9b31116-3f1c-433b-a501-c5fe21a424ec', metadata={'source': 'data\\NASDAQ_META_2024.pdf'}, page_content='For our RL products, our sales and operations efforts utilize third-party sales channels such as retailers, resellers, and our direct-to-consumer channel,\nMeta.com. These efforts are focused on driving consumer and enterprise sales and adoption of our Meta Quest portfolio of products and Ray-Ban Meta AI\nglasses.\nMarketing\nHistorically, our communities have generally grown organically with people inviting their friends to connect with them, supported by internal efforts to\nstimulate awareness and interest. In addition, we have invested and will continue to invest in marketing our products and services to grow our brand and help\nbuild community around the world.\n8'),
 Document(id='24a949a8-9e39-48f7-a6a1-9378c8d2d5fe', metadata={'source': 'data\\NASDAQ_META_2024.pdf'}, page_content='Table of Contents\nWe face inventory risk with respect to our consumer hardware pro

In [86]:
from dotenv import load_dotenv
import os

load_dotenv()

True

In [94]:
# Let's refine the response
from langchain_openai import ChatOpenAI
llm_chatmodel = ChatOpenAI(model="gpt-4o")

In [95]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [96]:
system_prompt = (
    "You are an financial assistant for question-answering tasks."
    "Use the following prices of retrieved context to answer"
    "the question. if you don't know the answer, say that you"
    "don't kow. Use three sentence maximum and keep the"
    "answer concise"
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]    
)

In [97]:
question_answer_chain = create_stuff_documents_chain(llm_chatmodel,prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [102]:
response = rag_chain.invoke({"input": "What is the number of employees of this organization"})
print(response["answer"])

The organization had a global workforce of 74,067 employees as of December 31, 2024.
