In [1]:
# %pip install langchain langchain-community langchain-ollama
# %pip install pinecone
# %pip install "unstructured[pdf]"
# %pip install libmagic
# %pip install python-dotenv sentence-transformers

## Data Loading and Chunking

In [2]:
!python -V

Python 3.11.11


In [3]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from langchain.embeddings import HuggingFaceEmbeddings
import os
import langchain_community.vectorstores

  from tqdm.autonotebook import tqdm


In [4]:
# Read every PDF found directly under knowledge_base/; the loader shows a
# progress bar and is asked to load files on multiple threads.
loader = DirectoryLoader(
    'knowledge_base',
    glob="*.pdf",
    show_progress=True,
    use_multithreading=True,
)
data = loader.load()

# Guarantee a 'source' entry on every document so retrieved chunks can always
# be attributed; an existing value is left untouched.
for doc in data:
    doc.metadata.setdefault('source', 'Unknown')

# Chunking configuration: chunk_size 1000 with an overlap of 600 between
# neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=600)
docs = text_splitter.split_documents(data)

# Plain-text view of the chunks, in the same order as `docs`.
document_texts = [chunk.page_content for chunk in docs]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:35<00:00, 11.79s/it]


## VectorStore and Query Engine Setup

In [5]:
# Load variables from a local .env file into the process environment; the
# Pinecone API key is read from os.environ further down.
load_dotenv()

True

In [6]:
import os
import pinecone

# Embedding model used both for indexing the chunks and for embedding queries.
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

# The API key comes from the environment (populated by load_dotenv()).
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
# pc = Pinecone(api_key = secret_value_0)
index_name = "internship-assignment"

if index_name in pc.list_indexes().names():
    # Index already exists: attach to it without re-uploading any documents.
    docsearch = langchain_community.vectorstores.Pinecone.from_existing_index(
        embedding=embeddings,
        index_name=index_name,
    )
else:
    # First run: create a serverless cosine index, then embed and upload the chunks.
    # NOTE(review): dimension=768 presumably matches the embedding width of the
    # model named above -- confirm before switching models.
    pc.create_index(
        name=index_name,
        metric="cosine",
        dimension=768,
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    docsearch = langchain_community.vectorstores.Pinecone.from_documents(
        docs,
        embeddings,
        index_name=index_name,
    )

print("Pipeline setup complete!")

  embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)


Pipeline setup complete!


In [7]:
# Display the vector store object as a quick check that the setup cell produced one.
docsearch

<langchain_community.vectorstores.pinecone.Pinecone at 0x7ed26a847d90>

## Chatbot and LLM Initialization

In [8]:
from langchain.memory import ConversationBufferMemory
from langchain_core.prompts import PromptTemplate
from langchain.chains.question_answering.chain import load_qa_chain
from langchain_ollama.llms import OllamaLLM
from langchain_core.output_parsers import StrOutputParser

In [9]:
class LLM():
    """Retrieval-augmented chatbot over the indexed Form 10-K documents.

    The Ollama model, prompt template and conversation memory are created
    lazily on the first call to :meth:`Ollama`, and re-created whenever
    ``clear_memory`` is set back to ``True`` (which starts a new conversation).

    Attributes:
        clear_memory: When True, the next ``Ollama`` call rebuilds the model,
            prompt and memory before answering.
        model: The ``OllamaLLM`` instance, or None until the first call.
        chat_history: List of ``[question, answer]`` pairs maintained by callers.
        prompt: The ``PromptTemplate`` built from ``template``, or None.
        memory: The ``ConversationBufferMemory`` for the current conversation, or None.
        template: Prompt text with ``{context}``, ``{chat_history}`` and
            ``{question}`` placeholders.
        model_name: Name of the Ollama model to load.
        temperature: Sampling temperature passed to the model.
    """

    def __init__(self, prompt="Hi", model_name="llama3.2:3b", temperature=0.7):
        """Set up an empty conversation.

        Args:
            prompt: Accepted for backward compatibility; not used.
            model_name: Ollama model identifier used when the model is created.
            temperature: Sampling temperature used when the model is created.
        """
        self.model_name = model_name
        self.temperature = temperature
        self.clear_memory = True
        self.model = None
        self.chat_history = []
        self.prompt = None
        self.memory = None
        self.template = """ 
     You are a helpful chatbot.
     You are provided with information from these 3 documents
    1. Alphabet Inc. Form 10-K
    2. Tesla, Inc. Form 10-K
    3. Uber Technologies, Inc. Form 10-K
    Your task is to retrieve the content from these PDFs, compare
    them, and answer queries highlighting the information across all
    documents.
     Use the provided context to answer the user's question accurately. Always consider the user's chat history for better understanding and personalized responses.
                            Here is the information you have:
    
                            Context: 
                            {context}
    
                            Chat History: 
                            {chat_history}
    
                            User's Question: 
                            {question}
    
                            Based on the above information, provide a detailed and accurate answer to the user's question. Remember to stay relevant to the context and maintain professionalism. Your response should be clear, concise, and helpful:
    """


    def Ollama(self, given_prompt):
        """Answer ``given_prompt`` using retrieved context and the chat memory.

        Reads the module-level ``docsearch`` vector store at call time to fetch
        the documents most similar to the question.

        Args:
            given_prompt: The user's question.

        Returns:
            The model's answer text (the chain's ``"output_text"`` value).
        """
        if self.clear_memory:
            # The keyword must be spelled `temperature`; the previous misspelling
            # was silently dropped, so the configured value never reached the model.
            self.model = OllamaLLM(model=self.model_name, temperature=self.temperature)
            self.prompt = PromptTemplate(
                template=self.template,
                input_variables=["chat_history", "context", "question"]
            )
            # A new memory object means a new conversation.
            self.memory = ConversationBufferMemory(memory_key="chat_history", input_key="question")
            self.clear_memory = False
        chain = load_qa_chain(prompt=self.prompt, llm=self.model, memory=self.memory, chain_type="stuff")
        # Show the conversation so far (before this turn is added).
        print(chain.memory.buffer)
        # invoke() replaces the deprecated direct call `chain({...})`.
        result = chain.invoke(
            {
                "input_documents": docsearch.similarity_search(given_prompt),
                "question": given_prompt,
            },
            return_only_outputs=True,
        )
        return result["output_text"]

In [10]:
# class Ollama():
#     def __init__(self):
#         self.clear_memory = True
    
#     def ollamachat(self,given_prompt):
#         user_input = given_prompt
#         if self.clear_memory:
#             model = OllamaLLM(model="llama3.2:3b", temprature = 0.7)
#             template = """ 
#      You are a helpful chatbot.
#      You are provided with information from these 3 documents
#     1. Alphabet Inc. Form 10-K
#     2. Tesla, Inc. Form 10-K
#     3. Uber Technologies, Inc. Form 10-K
#     Your task is to retrieve the content from these PDFs, compare
#     them, and answer queries highlighting the information across all
#     documents.
#      Use the provided context to answer the user's question accurately. Always consider the user's chat history for better understanding and personalized responses.
#                             Here is the information you have:
    
#                             Context: 
#                             {context}
    
#                             Chat History: 
#                             {chat_history}
    
#                             User's Question: 
#                             {question}
    
#                             Based on the above information, provide a detailed and accurate answer to the user's question. Remember to stay relevant to the context and maintain professionalism. Your response should be clear, concise, and helpful:
#     """
#             prompt = PromptTemplate(
#                 template=template,
#                 input_variables=["chat_history", "context", "question"]
#             )
#             memory = ConversationBufferMemory(memory_key="chat_history", input_key="question")
#             self.clear_memory = False
#         chain = load_qa_chain(prompt = prompt, llm = model, memory = memory, chain_type = "stuff")
#         print(chain.memory.buffer)
#         return chain({
#                         "input_documents": docsearch.similarity_search(user_input),
#                         "question": user_input
#                     }, 
#                         return_only_outputs=True
#                     )["output_text"]

In [11]:
# Sample question used to try the chatbot from a cell before the UI is built.
given_prompt = "What is the total revenue for Google Search?"

In [12]:
# New chatbot instance; its model, prompt and memory are only created on the
# first Ollama() call.
llm  =  LLM()

In [13]:
# Run one question through retrieval + the LLM chain; the result is the answer text.
output = llm.Ollama(given_prompt)

  self.memory = ConversationBufferMemory(memory_key="chat_history", input_key="question")
stuff: https://python.langchain.com/docs/versions/migrating_chains/stuff_docs_chain
map_reduce: https://python.langchain.com/docs/versions/migrating_chains/map_reduce_chain
refine: https://python.langchain.com/docs/versions/migrating_chains/refine_chain
map_rerank: https://python.langchain.com/docs/versions/migrating_chains/map_rerank_docs_chain

See also guides on retrieval and question-answering here: https://python.langchain.com/docs/how_to/#qa-with-rag
  chain = load_qa_chain(prompt = self.prompt, llm = self.model, memory = self.memory, chain_type = "stuff")





  return chain({


In [14]:
# print() so the answer's line breaks are rendered rather than shown as escapes.
print(output)

Based on the provided information from Alphabet Inc., Form 10-K, I can answer your question about total revenue for Google Search.

According to the table presented in the financial results section of Alphabet Inc.'s Form 10-K, the total revenue for Google Search & other is:

- $148,951 (2021)
- $162,450 (2022)
- $175,033 (2023)

However, if we are looking for the specific number representing the total revenue for Google Search itself, it seems that there isn't a separate line item labeled as such in the provided financial reports.

But since the question pertains to the revenues from "Google Search & other", which includes both search-related and other services (e.g., YouTube ads), we can infer its impact on the overall revenue. The total increase in this segment from 2021 to 2023 is $12.6 billion, indicating significant growth.

If you're looking for a more precise figure related specifically to Google Search or want to explore other aspects of Alphabet's financials, I recommend revi

## Gradio UI

In [15]:
# !pip install gradio

In [16]:
import gradio as gr

In [17]:
# Rebind `llm` to a new instance so the UI starts with an empty chat history
# and builds its own conversation memory on first use.
llm = LLM()

In [18]:
def gen_output(prompt):
    """Answer one user message and return the updated chat history.

    Uses the module-level ``llm`` chatbot: the question and the model's answer
    are appended to ``llm.chat_history`` as a ``[question, answer]`` pair.

    Args:
        prompt: Text submitted by the user.

    Returns:
        ``llm.chat_history`` (list of ``[question, answer]`` pairs). For an
        empty or whitespace-only prompt the history is returned unchanged and
        the model is not called.
    """
    # A blank submission would otherwise trigger a retrieval + LLM round trip
    # and add an empty question to the transcript.
    if not prompt or not prompt.strip():
        return llm.chat_history
    answer = llm.Ollama(prompt)
    llm.chat_history.append([prompt, answer])
    return llm.chat_history

def clear_fn():
    """Reset the module-level ``llm`` so the next question starts a new chat.

    Flags the chatbot to rebuild its model/prompt/memory on the next call and
    replaces the stored history with a new empty list.
    """
    llm.clear_memory, llm.chat_history = True, []

In [19]:
# Build the chat UI: an intro blurb, the chat panel with its input box and
# Submit button, and a button that wipes both the widgets and the bot's memory.
with gr.Blocks(fill_height=True) as app:
    with gr.Tab("📃 Comparative Analysis of Financial Reports"):
        gr.Markdown('''## 🤖 This application is developed as part of an assignment for an internship at Alemeno. 
                          It allows you to analyze and compare Form 10-K filings of multinational companies, including 
                          - Alphabet Inc.,
                          - Tesla Inc., and
                          - Uber Technologies Inc.
                        ''')
        with gr.Column():
            with gr.Column(scale=25):
                with gr.Group():
                    chatbox = gr.Chatbot(
                        label="🔭 ChatBot Panel",
                        show_copy_button=True,
                        height=480,
                    )
                    textbox = gr.Textbox(
                        show_label=False,
                        placeholder="👉 Enter your query",
                    )
                    submit_button = gr.Button("Submit")

                    # Pressing Enter in the textbox and clicking Submit both
                    # run the same handler and refresh the chat panel.
                    for _register in (textbox.submit, submit_button.click):
                        _register(fn=gen_output, inputs=textbox, outputs=chatbox)

                # ClearButton empties the listed widgets; the extra click
                # handler also resets the chatbot's conversation state.
                clear = gr.ClearButton(
                    [textbox, chatbox],
                    value="Clear Memory and Start New Chat",
                )
                clear.click(fn=clear_fn)

app.launch()



* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


