In [40]:
!pip install langchain faiss-cpu PyPDF2 tabula-py pandas langchain-community langchain-groq




In [41]:
import os
import tabula
import PyPDF2
import pandas as pd
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain_groq import ChatGroq
from langchain.schema import SystemMessage, HumanMessage

In [42]:
# Read the Groq API key from the environment instead of embedding it in the
# notebook. A key that was ever saved in a notebook must be treated as leaked
# and revoked. Export GROQ_API_KEY before starting the kernel.
api_key = os.environ.get("GROQ_API_KEY")
if not api_key:
    print("WARNING: GROQ_API_KEY is not set; requests to the Groq API will fail.")

In [66]:
# Groq-hosted chat model used to answer questions over the retrieved context.
# temperature=0 asks for the least random decoding so that re-running a query
# gives answers that are as repeatable as the service allows.
llm = ChatGroq(
    model_name="llama-3.2-1b-preview",
    groq_api_key=api_key,
    temperature=0,
)

In [44]:
# Location of the PDF to index. Defaults to the original Colab path; set the
# PDF_PATH environment variable to point at a different file without editing
# the notebook.
pdf_path = os.environ.get("PDF_PATH", "/content/combined_document_10.pdf")

In [45]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at ``pdf_path``.

    Pages are read with PyPDF2. A page whose extraction yields nothing
    (``None`` or an empty string) is skipped; every kept page is followed by a
    single newline.
    """
    with open(pdf_path, "rb") as pdf_file:
        extracted = (page.extract_text() for page in PyPDF2.PdfReader(pdf_file).pages)
        # Consume the generator while the file is still open.
        return "".join(content + "\n" for content in extracted if content)

In [46]:
def extract_tables_from_pdf(pdf_path):
    """Ask tabula for the tables on all pages of the PDF and return its result.

    The return value is whatever ``tabula.read_pdf`` produces with
    ``multiple_tables=True`` (presumably a list of DataFrames, which is how
    ``preprocess_tables`` consumes it).
    """
    read_options = {"pages": "all", "multiple_tables": True}
    return tabula.read_pdf(pdf_path, **read_options)

def preprocess_text(text, chunk_size=500, chunk_overlap=100):
    """Split ``text`` into overlapping chunks for indexing.

    Args:
        text: The raw document text.
        chunk_size: Target maximum number of characters per chunk.
        chunk_overlap: Number of characters shared between adjacent chunks.

    Returns:
        The list of chunk strings produced by the splitter.
    """
    # CharacterTextSplitter only cuts at its separator, and its default
    # separator is a blank line ("\n\n"). The text built by
    # extract_text_from_pdf is joined with single newlines, so with the default
    # most chunks would presumably stay far larger than chunk_size. Splitting
    # on "\n" lets the splitter pack lines up to the requested size.
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)

def preprocess_tables(tables):
    """Serialise each table to CSV text (without the index) for vectorization.

    Returns one CSV string per entry of ``tables``, in the same order.
    """
    return [frame.to_csv(index=False) for frame in tables]

In [47]:
print("Extracting text and tables from PDF...")

# Pull both kinds of content out of the same PDF.
raw_text = extract_text_from_pdf(pdf_path)
tables = extract_tables_from_pdf(pdf_path)

# Turn each into a list of strings ready for embedding.
text_chunks = preprocess_text(raw_text)
table_chunks = preprocess_tables(tables)

# Single corpus: text chunks first, followed by the table chunks.
data_chunks = [*text_chunks, *table_chunks]


Extracting text and tables from PDF...


Mar 11, 2025 6:57:38 AM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Mar 11, 2025 6:57:45 AM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Mar 11, 2025 6:57:46 AM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Mar 11, 2025 6:57:46 AM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Mar 11, 2025 6:57:47 AM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Mar 11, 2025 6:57:47 AM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Mar 11, 2025 6:57:47 AM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Mar 11, 2025 6:57:48 AM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Mar 11, 2025 6:57:48 AM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>



In [48]:
# Embedding model used to turn every chunk into a vector.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Embed all chunks into a FAISS vector store, then persist it to disk.
vector_db = FAISS.from_texts(texts=data_chunks, embedding=embedding_model)
vector_db.save_local("faiss_index")

print("Vector database created successfully!")

Vector database created successfully!


In [71]:
def retrieve_and_generate(query, k=1):
    """Retrieve relevant document chunks and generate a response with the Groq LLM.

    Args:
        query: The user's question.
        k: Number of chunks to fetch from the vector store. Defaults to 1 to
            keep the original behavior; multi-part questions may need more
            context, so callers can raise it.

    Returns:
        The ``content`` of the model's reply.
    """
    # Search the FAISS vector database for the chunks closest to the query.
    search_results = vector_db.similarity_search(query, k=k)
    context = "\n".join(doc.page_content for doc in search_results)

    # Prepare messages for the LLM: fixed system role plus context and query.
    messages = [
        SystemMessage(content="You are a helpful AI assistant that answers financial document queries accurately."),
        HumanMessage(content=f"Using the following document context, answer the query:\n\n{context}\n\nQuery: {query}")
    ]

    # Use .invoke(): calling a chat model object directly is the deprecated
    # entry point in LangChain.
    response = llm.invoke(messages)
    return response.content

In [56]:
# Single-question check of the retrieval + generation pipeline.
query = "How much did Apple's Services segment contribute to total net sales in 2018, and what was the year-over-year growth percentage? "
response = retrieve_and_generate(query)
print(f"\nGenerated Response:\n {response}")


Generated Response:
 According to the document, the Services segment for 2018 had the following values:

- Net sales: $37,190
- Year-over-year growth percentage: 24%

To find the contribution to total net sales, we need to look at the total net sales for 2018. However, the document does not explicitly provide the total net sales for 2018, but it does show the net sales for 2018 under 'Unnamed: 0', which is $20,453, $22,090, $23,317, and $89,950.

Since the years 2018 and the corresponding years are not explicitly mentioned in the same row, we can use the year 2018 corresponding to each of the mentioned values:

- Year 2017: $20,453
- Year 2017: $22,090
- Year 2017: $23,317
- Year 2018: $89,950

The contribution of the Services segment to total net sales in 2018 can be calculated as follows:

Total net sales for 2018 = $89,950
Contribution of Services segment = $37,190
Year-over-year growth percentage = ( ($37,190 / $89,950) * 100 ) - 1 ≈ 41.8%

So, the Services segment contributed app

In [67]:
# Second single-question check, this time against the Microsoft content.
query2 = "How many shares did Microsoft repurchase in fiscal year 2016,and what was the total amount spent?  "
response2 = retrieve_and_generate(query2)
print(f"\nGenerated Response:\n {response2}")


Generated Response:
 According to the document, Microsoft repurchased the following shares of common stock under the share repurchase plan in fiscal year 2016:

(In millions) 
Shares  Amount  Shares  Amount  Shares  Amount
October  1, 2016  $  5,000  $  4,500  $  4,500  $  4,500  $  4,500
November  19, 2016  $  2,000  $  1,900  $  1,900  $  1,900  $  1,900
December  10, 2016  $  2,842  $  2,700  $  2,700  $  2,700  $  2,700
March  10, 2016  $  2,821  $  2,700  $  2,700  $  2,700  $  2,700
June  9, 2016  $  2,821  $  2,700  $  2,700  $  2,700  $  2,700
June 14, 2016  $  2,811  $  2,700  $  2,700  $  2,700  $  2,700

Total amount spent: $20,380 million


In [68]:
# Third single-question check: a specific dividend declaration.
q3 = "What was the dividend per share declared by Microsoft in September 2015?"
r3 = retrieve_and_generate(q3)
print(f"\nGenerated Response:\n {r3}")


Generated Response:
 To find the dividend declared by Microsoft in September 2015, we need to look for the "Declaration Date" and "Per Share" columns in the provided date range.

From the provided information, we can find the following dates:

* September 16, 2014
* November 20, 2014
* March 10, 2015

We can see that the "Per Share" column for September 16, 2014 is $0.31.

Therefore, the dividend declared by Microsoft in September 2015 was $0.31 per share.


In [72]:
# Evaluation questions covering both the Apple and the Microsoft material.
queries = [
    "How much did Apple spend on Research and Development in fiscal year 2018, and how did it change compared to 2017?",
    "What were the total iPhone sales figures (in units) for Apple in 2018, and how did this compare to the previous year?",
    "How many shares did Microsoft repurchase in fiscal year 2016, and what was the total amount spent?",
    "What was Apple's net sales figure for the Americas region in 2018, and what percentage of total net sales did this represent?",
    "When did Microsoft acquire LinkedIn Corporation according to the quarterly information?",
    "What was the dividend per share declared by Microsoft in September 2015?",
    "What factors contributed to the increase in iPad net sales during 2018 compared to 2017?",
    "How much did Apple's Services segment contribute to total net sales in 2018, and what was the year-over-year growth percentage?",
    "What were the main components of Microsoft's 'Other Income (Expense), Net' for fiscal year 2018?",
    "What was Apple's gross margin percentage range anticipated for the first quarter of 2019?",
]

# Ask each question in turn, printing it before its answer (numbered from 1).
for i, query in enumerate(queries, start=1):
    print(f"Q{i}: {query}")
    answer = retrieve_and_generate(query)
    print(f"A{i}: {answer}\n")

Q1: How much did Apple spend on Research and Development in fiscal year 2018, and how did it change compared to 2017?
A1: To answer your query, I need to extract the relevant data from the given document.

According to the document, the expenditure for Research and Development (R&D) for Apple in fiscal years 2016, 2015, and 2014 is as follows:

1. 2016:
   * Year Ended June 30: 2016
   * R&D Expenditure: $89.0

2. 2015:
   * Year Ended June 30: 2015
   * R&D Expenditure: $66.0

3. 2014:
   * Year Ended June 30: 2014
   * R&D Expenditure: $66.0

Now, let's compare the R&D expenditure for 2016 and 2017:

2017: $89.0
Difference: $89.0 - $66.0 = $23.0

So, Apple spent $23.0 less on Research and Development in 2017 compared to 2016.

Q2: What were the total iPhone sales figures (in units) for Apple in 2018, and how did this compare to the previous year?
A2: To find the total iPhone sales figures for Apple in 2018, we need to sum up the sales figures for iPhone.

The iPhone sales figures for