## Installations

In [None]:
!pip install transformers==4.33.0 accelerate==0.22.0 einops==0.6.1 langchain==0.0.300 xformers==0.0.21 \
bitsandbytes==0.41.1 sentence_transformers==2.2.2 chromadb==0.4.12
!pip install PyPDF2
!pip install evaluate rouge_score
!pip install openai==0.28
!pip install pdfplumber

Collecting transformers==4.33.0
  Downloading transformers-4.33.0-py3-none-any.whl.metadata (119 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/119.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m112.6/119.9 kB[0m [31m4.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.9/119.9 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate==0.22.0
  Downloading accelerate-0.22.0-py3-none-any.whl.metadata (17 kB)
Collecting einops==0.6.1
  Downloading einops-0.6.1-py3-none-any.whl.metadata (12 kB)
Collecting langchain==0.0.300
  Downloading langchain-0.0.300-py3-none-any.whl.metadata (15 kB)
Collecting xformers==0.0.21
  Downloading xformers-0.0.21-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes==0.41.1
  Downloading bitsandbytes-0.41.1-py3-none-any.whl.metadata (9.8 kB)
Collecting sentence_tran

## Imports of Required Libraries

In [None]:
from torch import cuda, bfloat16
import torch
import transformers
from transformers import AutoTokenizer
from time import time
#import chromadb
#from chromadb.config import Settings
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma

In [None]:
import kagglehub

# Download latest version
path = kagglehub.model_download("metaresearch/llama-2/pyTorch/7b-chat-hf")

print("Path to model files:", path)

Path to model files: /kaggle/input/llama-2/pytorch/7b-chat-hf/1


## Definig Model

In [None]:
model_id = '/kaggle/input/llama-2/pytorch/7b-chat-hf/1'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

## Model and Tokenizer Preparation

In [None]:
time_1 = time()
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
time_2 = time()
print(f"Prepare model, tokenizer: {round(time_2-time_1, 3)} sec.")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Prepare model, tokenizer: 153.182 sec.




## Pipeline Initialization

In [None]:
time_1 = time()
query_pipeline = transformers.pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=torch.float16,
        device_map="auto",)
time_2 = time()
print(f"Prepare pipeline: {round(time_2-time_1, 3)} sec.")

Prepare pipeline: 3.068 sec.


## Model Testing and Inference

In [None]:
def test_model(tokenizer, pipeline, prompt_to_test):
    """
    Perform a query
    print the result
    Args:
        tokenizer: the tokenizer
        pipeline: the pipeline
        prompt_to_test: the prompt
    Returns
        None
    """
    # adapted from https://huggingface.co/blog/llama2#using-transformers
    time_1 = time()
    sequences = pipeline(
        prompt_to_test,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=200,)
    time_2 = time()
    print(f"Test inference: {round(time_2-time_1, 3)} sec.")
    for seq in sequences:
        print(f"Result: {seq['generated_text']}")

## PDF Text Extraction for Knowledge Base

In [None]:
import re
import pandas as pd
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
import PyPDF2

def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)  # Updated to PdfReader
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text

pdf_path = '/kaggle/input/knowledge-base/Part3_knowledge base.pdf'
raw_text = extract_text_from_pdf(pdf_path)


## Text Cleaning, Standardizing and Chunking for Preprocessing.

In [None]:
def clean_text(text):
    # Remove headers, footers
    text = re.sub(r'(\n)+', ' ', text)  # Replace line breaks with space
    text = re.sub(r'Page \d+', '', text)  # Remove page numbers

    # Remove irrelevant characters or symbols
    text = text.replace('•', '').replace('—', '-')
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove non-ASCII chars

    return text

cleaned_text = clean_text(raw_text)
def standardize_financial_data(text):
    # Convert billions to millions for consistency
    text = re.sub(r'\$([\d.]+) billion', lambda x: f"{float(x.group(1)) * 1000} million", text)
    text = re.sub(r'\$([\d,]+)', lambda x: x.group(1).replace(",", ""), text)  # Remove commas in numbers

    # Standardize date format
    text = re.sub(r'(\w{3,9} \d{1,2}, \d{4})', lambda x: pd.to_datetime(x.group(1)).strftime('%Y-%m-%d'), text)

    return text

standardized_text = standardize_financial_data(cleaned_text)
def chunk_text(text, chunk_size=500):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=100)
    chunks = text_splitter.split_text(text)
    return [{"text": chunk, "metadata": {"section": "Unknown"}} for chunk in chunks]

text_chunks = chunk_text(standardized_text)


In [None]:
print(standardized_text[:1000])

NEWS RELEASE Meta Reports Fourth Quarter and Full Year 2023 Results; Initiates Quarterly Dividend MENLO PARK, Calif. , Feb. 1, 2024  /PRNewswire/ -- Meta Platforms, Inc. (Nasdaq: META) today reported  nancial results for the quarter and full year ended 2023-12-31. "We had a good quarter as our community and business continue to grow," said Mark Zuckerberg, Meta founder and CEO. "We've made a lot of progress on our vision for advancing AI and the metaverse."    Fourth Quarter and Full Year 2023 Financial Highlights  Three Months Ended December 31,    % Change Twelve Months Ended December 31,   % Change In millions, except percentages and per share amounts     2023  2022   2023  2022   Revenue $ 40,111  $ 32,165 25 % $ 134,902  $ 116,609  16 % Costs and expenses  23,727   25,766 (8) %  88,151   87,665 1 % Income from operations $ 16,384  $ 6,399 156 % $ 46,751  $ 28,944 62 % Operating margin  41 %  20 %    35 %  25 %   Provision for income taxes $ 2,791 $ 1,497 86 % $ 8,330 $ 5,619 48 % 

In [None]:
# Define end keywords (example: next section or known delimiter)
end_keywords = ["management commentary", "business updates", "key metrics"]

# Find the end index using the first matching end keyword
end_index = len(standardized_text)  # Default to the end of the text
for keyword in end_keywords:
    idx = standardized_text.lower().find(keyword)
    if idx > start_index:  # Ensure the end index is after the start index
        end_index = min(end_index, idx)

# Extract the Financial Highlights section
financial_highlights_section = standardized_text[start_index:end_index]



In [None]:
# Clean extracted text for readability
def refine_extracted_text(text):
    text = re.sub(r'\s+', ' ', text)  # Remove excessive whitespace
    return text.strip()

financial_highlights_cleaned = refine_extracted_text(financial_highlights_section)



## Testing Retrieval-Augmented Generation with Chroma DB Vector Store

In [None]:
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# Convert each chunk into a Document object
documents = [Document(page_content=chunk["text"], metadata=chunk["metadata"]) for chunk in text_chunks]

# Initialize embeddings with the specified model
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# Create the Chroma vector store from documents
vectordb = Chroma.from_documents(documents=documents, embedding=embeddings, persist_directory="chroma_db")


### Retriever and Retrieval-based Question Answering Setup
Initializing a retriever from a vector database and sets up a retrieval-based QA pipeline using a specified language model (llm) and retriever for answering queries based on retrieved documents.

In [None]:
from langchain.llms import HuggingFacePipeline

# Wrap the query_pipeline in HuggingFacePipeline to use it as an LLM in LangChain
llm = HuggingFacePipeline(pipeline=query_pipeline)


retriever = vectordb.as_retriever()

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    verbose=True
)

### Testing Retrieval-Augmented Generation (RAG)
This function tests the RAG system by running a query through the QA pipeline, measuring inference time, and printing the query and its result.

In [None]:
def test_rag(qa, query):
    print(f"Query: {query}\n")
    time_1 = time()
    result = qa.run(query)
    time_2 = time()
    print(f"Inference time: {round(time_2-time_1, 3)} sec.")
    print("\nResult: ", result)

### Testing with a query from PDF

In [None]:
query = "How did Meta's workforce grow or change by the end of 2023? Provide specific numbers and any relevant financial data."

test_rag(qa, query)

## Rag Pipeline using Llama-2 and Chromadb

In [None]:
import evaluate

# Load ROUGE evaluation metric
rouge = evaluate.load("rouge")

# Load questions and reference answers from the PDF file
from PyPDF2 import PdfReader

def load_questions_answers(pdf_path):
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text()

    questions = []
    reference_answers = []

    # Extract questions and answers
    for line in text.split("\n"):
        if line.startswith("Question"):
            questions.append(line.split(": ", 1)[1].strip())
        elif line.startswith("Answer"):
            reference_answers.append(line.split(": ", 1)[1].strip())

    return questions, reference_answers

# Load from the provided PDF
pdf_path = '/kaggle/input/manual-questions/ManualQuestions.pdf'
questions, reference_answers = load_questions_answers(pdf_path)

# Generate answers using the model
generated_answers = []
for question in questions:
    generated_answer = qa.run(question)  # Assuming `qa` is the setup RetrievalQA chain
    generated_answers.append(generated_answer)

# Calculate ROUGE scores
rouge_scores = rouge.compute(predictions=generated_answers, references=reference_answers)
print("ROUGE scores:", rouge_scores)




[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
ROUGE scores: {'rouge1': 0.2826429437193082, 'rouge2': 0.1208655020561837, 'rougeL': 0.23514646750177573, 'rougeLsum': 0.23810683702911686}


## Rag pipeline using prompt

In [None]:
import pdfplumber
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from time import time
import evaluate

# Define function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    questions = []
    reference_answers = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            lines = text.split('\n')
            # Assuming questions and reference answers are structured as Q1, A1, Q2, A2...
            for i, line in enumerate(lines):
                if line.startswith("Q"):  # Identifying question lines
                    questions.append(line[3:].strip())
                elif line.startswith("A"):  # Identifying answer lines
                    reference_answers.append(line[3:].strip())
    return questions, reference_answers

# Path to the PDF file
pdf_path = "/kaggle/input/manual-questions/ManualQuestions.pdf"

# Extract questions and reference answers from the PDF
questions, reference_answers = extract_text_from_pdf(pdf_path)

# Assuming standardized_text is already defined
def chunk_text(text, chunk_size=1000):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=20)
    chunks = text_splitter.split_text(text)
    return [{"text": chunk, "metadata": {"section": "Unknown"}} for chunk in chunks]

text_chunks = chunk_text(standardized_text)

# Convert each chunk into a Document object
documents = [Document(page_content=chunk["text"], metadata=chunk["metadata"]) for chunk in text_chunks]

# Initialize embeddings with the specified model
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# Create the Chroma vector store from documents
vectordb = Chroma.from_documents(documents=documents, embedding=embeddings, persist_directory="chroma_db")

retriever = vectordb.as_retriever()

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    verbose=True
)

# Generate answers using the model
generated_answers = []
for question in questions:
    preamble = (
        "Based on Meta's Q4 2023 financial report and related highlights, "
        "please provide a detailed answer to the following question: "
    )
    refined_prompt = f"{preamble}{question}"

    # Generate answer
    print(f"Prompt: {refined_prompt}\n")
    generated_answer = qa.run(refined_prompt)
    generated_answers.append(generated_answer)

# Load ROUGE evaluation metric
rouge = evaluate.load("rouge")

# Calculate ROUGE scores
rouge_scores = rouge.compute(predictions=generated_answers, references=reference_answers)

# Print results
print("\nGenerated Answers:")
for i, (q, ans) in enumerate(zip(questions, generated_answers)):
    print(f"Q{i + 1}: {q}\nA: {ans}\n")
print("Improved ROUGE scores:", rouge_scores)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Prompt: Based on Meta's Q4 2023 financial report and related highlights, please provide a detailed answer to the following question: stion 1: How did Meta’s workforce change by the end of 2023?



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Prompt: Based on Meta's Q4 2023 financial report and related highlights, please provide a detailed answer to the following question: stion 2: What is the report quarter, and when did it end?



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Prompt: Based on Meta's Q4 2023 financial report and related highlights, please provide a detailed answer to the following question: stion 3: What were the key financial highlights this quarter (revenue, gross margin, operating



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Prompt: Based on Meta's Q4 2023 financial report and related highlights, please provide a detailed answer to the following question: stion 4: How much did Meta spend on restructuring for the whole year and Q4?



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Prompt: Based on Meta's Q4 2023 financial report and related highlights, please provide a detailed answer to the following question: stion 5: What happened with Meta’s ad impressions and average price per ad in Q4 and for the



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Prompt: Based on Meta's Q4 2023 financial report and related highlights, please provide a detailed answer to the following question: stion 6: What’s the revenue outlook for Q1 2024?



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Prompt: Based on Meta's Q4 2023 financial report and related highlights, please provide a detailed answer to the following question: stion 7: What were Meta’s total costs and expenses for Q4 and the full year 2023?



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Prompt: Based on Meta's Q4 2023 financial report and related highlights, please provide a detailed answer to the following question: stion 8: How much cash and marketable securities did Meta have on hand as of December 31,



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Prompt: Based on Meta's Q4 2023 financial report and related highlights, please provide a detailed answer to the following question: stion 9: What were the main areas Meta invested in during 2023?



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Prompt: Based on Meta's Q4 2023 financial report and related highlights, please provide a detailed answer to the following question: stion 10: How did the Family of Apps and Reality Labs perform in Q4 2023?



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Prompt: Based on Meta's Q4 2023 financial report and related highlights, please provide a detailed answer to the following question: stion 11: How much free cash flow did Meta generate in Q4 and the full year 2023?



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Prompt: Based on Meta's Q4 2023 financial report and related highlights, please provide a detailed answer to the following question: stion 12: Did Meta make any changes to its stock repurchase program or dividends for 2024?



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Prompt: Based on Meta's Q4 2023 financial report and related highlights, please provide a detailed answer to the following question: stion 13: What risks did Meta highlight for 2024?



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Prompt: Based on Meta's Q4 2023 financial report and related highlights, please provide a detailed answer to the following question: stion 14: What drove Meta’s revenue growth in Q4 2023?



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Prompt: Based on Meta's Q4 2023 financial report and related highlights, please provide a detailed answer to the following question: stion 15: How did Reality Labs perform throughout 2023, and what’s Meta’s plan for 2024?



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m

Generated Answers:
Q1: stion 1: How did Meta’s workforce change by the end of 2023?
A: Meta's workforce decreased by 22% year-over-year, with a headcount of 67,317 as of the end of 2023.

Q2: stion 2: What is the report quarter, and when did it end?
A: The reported quarter is the fourth quarter of 2023, which ended on December 31, 2023.

Q3: stion 3: What were the key financial highlights this quarter (revenue, gross margin, operating
A: The key financial highlights for Meta in Q4 2023 were as follows:
- Revenue: $40,111 million, a 25% increase compared to the same quarter in 2022.
- Gross Margin: Operating margin was 41%, up from 20% in the same quarter last year.
- Operating Income: $16,384 million, a significant increase of 156% compared to Q4 2022.
- Long-term debt: $18,390 million as of December 31, 2023.
- Headcount: 67,317 employees as of December 31, 2023, a 22% decrease year-over-year.
- Quarterly Dividend: Meta initiated a quarterly cash dividend o

## Rag pipeline using openai

In [None]:
import pdfplumber
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
import evaluate
import time

# Function to extract questions and answers from the PDF
def extract_text_from_pdf(pdf_path):
    questions = []
    reference_answers = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            lines = text.split('\n')
            # Assuming questions start with "Q" and answers start with "A"
            for i, line in enumerate(lines):
                if line.startswith("Q"):  # Identifying question lines
                    questions.append(line[3:].strip())
                elif line.startswith("A"):  # Identifying answer lines
                    reference_answers.append(line[3:].strip())
    return questions, reference_answers

# Path to the PDF file
pdf_path = "/kaggle/input/manual-questions/ManualQuestions.pdf"

# Extract questions and reference answers from the PDF
enhanced_prompts, reference_answers = extract_text_from_pdf(pdf_path)

# Load ROUGE evaluation metric
rouge = evaluate.load("rouge")

# Chunking function
def chunk_text(text, chunk_size=1000):
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=20)
    chunks = text_splitter.split_text(text)
    return [{"text": chunk, "metadata": {"section": "Unknown"}} for chunk in chunks]

# Process text into chunks (Assuming standardized_text is defined elsewhere)
text_chunks = chunk_text(standardized_text)

# Convert chunks into Document objects
documents = [Document(page_content=chunk["text"], metadata=chunk["metadata"]) for chunk in text_chunks]

# Initialize embeddings and vector store
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

vectordb = Chroma.from_documents(documents=documents, embedding=embeddings, persist_directory="chroma_db")
retriever = vectordb.as_retriever()

# Initialize the LLM with gpt-3.5-turbo
llm = OpenAI(model_name="gpt-3.5-turbo", temperature=0, openai_api_key="****************************************************")

# Define the RetrievalQA pipeline with the updated model
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    verbose=True
)

# Generate answers using the model with enhanced prompts
generated_answers = []
for question in enhanced_prompts:
    print(f"Processing Question: {question}\n")
    start_time = time.time()
    result = qa.run(question)
    end_time = time.time()
    generated_answers.append(result)
    print(f"Processed in {round(end_time - start_time, 3)} seconds.\nResult: {result}\n")

# Calculate ROUGE scores
rouge_scores = rouge.compute(predictions=generated_answers, references=reference_answers)

# Print results
print("\nGenerated Answers:")
for i, (q, ans) in enumerate(zip(enhanced_prompts, generated_answers)):
    print(f"Q{i + 1}: {q}\nA: {ans}\n")
print("Improved ROUGE scores:", rouge_scores)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing Question: stion 1: How did Meta’s workforce change by the end of 2023?



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 0.433 seconds.
Result: Meta's workforce decreased by 22% by the end of 2023.

Processing Question: stion 2: What is the report quarter, and when did it end?



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 0.49 seconds.
Result: The report is for the fourth quarter of 2023, which ended on December 31, 2023.

Processing Question: stion 3: What were the key financial highlights this quarter (revenue, gross margin, operating



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 0.755 seconds.
Result: Revenue for the fourth quarter of 2023 was $40,110.0 million, an increase of 25% year-over-year. Total costs and expenses were $23,730.0 million, a decrease of 8% year-over-year. Share repurchases totaled $6,320.0 million for the quarter.

Processing Question: stion 4: How much did Meta spend on restructuring for the whole year and Q4?



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 0.487 seconds.
Result: Meta spent $610 million on restructuring for the whole year and $200 million in Q4.

Processing Question: stion 5: What happened with Meta’s ad impressions and average price per ad in Q4 and for the



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 0.739 seconds.
Result: In Q4 of 2023, Meta's ad impressions increased by 21% year-over-year, and the average price per ad increased by 2% year-over-year. For the full year 2023, ad impressions increased by 28% year-over-year, and the average price per ad decreased by 9% year-over-year.

Processing Question: stion 6: What’s the revenue outlook for Q1 2024?



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 0.426 seconds.
Result: The revenue outlook for Q1 2024 is expected to be in the range of 34.5-37 billion.

Processing Question: stion 7: What were Meta’s total costs and expenses for Q4 and the full year 2023?



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 0.556 seconds.
Result: Meta's total costs and expenses for Q4 2023 were $3,452 million, and for the full year 2023, they were $11,480 million.

Processing Question: stion 8: How much cash and marketable securities did Meta have on hand as of December 31,



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 0.423 seconds.
Result: Meta had $65.4 billion in cash, cash equivalents, and marketable securities as of December 31.

Processing Question: stion 9: What were the main areas Meta invested in during 2023?



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 0.387 seconds.
Result: Meta invested in advancing AI and the metaverse during 2023.

Processing Question: stion 10: How did the Family of Apps and Reality Labs perform in Q4 2023?



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 0.525 seconds.
Result: I'm sorry, the provided context does not include information about how the Family of Apps and Reality Labs performed in Q4 2023.

Processing Question: stion 11: How much free cash flow did Meta generate in Q4 and the full year 2023?



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 0.438 seconds.
Result: Meta generated free cash flow of 11500.0 million in Q4 and 43010.0 million for the full year 2023.

Processing Question: stion 12: Did Meta make any changes to its stock repurchase program or dividends for 2024?



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 0.538 seconds.
Result: Yes, Meta announced an increase in its share repurchase authorization and initiated a quarterly cash dividend for its outstanding common stock in 2024.

Processing Question: stion 13: What risks did Meta highlight for 2024?



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 0.612 seconds.
Result: Meta highlighted risks such as new product developments, emphasis on community growth, privacy concerns, competition, government actions, litigation, security breaches, and market conditions for 2024.

Processing Question: stion 14: What drove Meta’s revenue growth in Q4 2023?



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 0.468 seconds.
Result: Meta's revenue growth in Q4 2023 was primarily driven by a 25% increase in revenue compared to the same period in 2022.

Processing Question: stion 15: How did Reality Labs perform throughout 2023, and what’s Meta’s plan for 2024?



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 0.611 seconds.
Result: Reality Labs experienced operating losses in 2023 due to ongoing product development efforts in augmented reality/virtual reality. For 2024, Meta plans to increase investments in Reality Labs to further scale the ecosystem, which is expected to lead to a meaningful increase in operating losses year-over-year.


Generated Answers:
Q1: stion 1: How did Meta’s workforce change by the end of 2023?
A: Meta's workforce decreased by 22% by the end of 2023.

Q2: stion 2: What is the report quarter, and when did it end?
A: The report is for the fourth quarter of 2023, which ended on December 31, 2023.

Q3: stion 3: What were the key financial highlights this quarter (revenue, gross margin, operating
A: Revenue for the fourth quarter of 2023 was $40,110.0 million, an increase of 25% year-over-year. Total costs and expenses were $23,730.0 million, a decrease of 8% year-over-year. Share repurchases totaled $6,320.0 million for the quar

## Rag pipline using openai and enhanced prompts

In [None]:
from langchain.chains import RetrievalQA
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from transformers import pipeline
import time
import torch
import evaluate

# Load ROUGE evaluation metric
rouge = evaluate.load("rouge")

# Chunking function
def chunk_text(text, chunk_size=1000, chunk_overlap=20):
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = text_splitter.split_text(text)
    return [{"text": chunk, "metadata": {"section": "Unknown"}} for chunk in chunks]

# Summarize chunks
def summarize_chunks(retrieved_docs, summarization_threshold=512):
    device = 0 if torch.cuda.is_available() else -1
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)
    context = " ".join([doc.page_content for doc in retrieved_docs])
    if len(context.split()) < summarization_threshold:
        return context  # Skip summarization for short texts
    summaries = []
    for doc in retrieved_docs:
        input_length = len(doc.page_content.split())
        max_length = min(300, input_length // 2)
        min_length = max(50, input_length // 4)
        summary = summarizer(doc.page_content, max_length=max_length, min_length=min_length, do_sample=False)
        summaries.append(summary[0]['summary_text'])
    return " ".join(summaries)

# Process text into chunks
text_chunks = chunk_text(standardized_text)  # Use your cleaned and preprocessed text
documents = [Document(page_content=chunk["text"], metadata=chunk["metadata"]) for chunk in text_chunks]

# Initialize embeddings and vector store
model_kwargs = {"device": "cuda"} if torch.cuda.is_available() else {"device": "cpu"}
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2", model_kwargs=model_kwargs)

vectordb = Chroma.from_documents(documents=documents, embedding=embeddings, persist_directory="chroma_db")
retriever = vectordb.as_retriever(search_kwargs={"k": 10})

# Initialize LLM
llm = OpenAI(model_name="gpt-3.5-turbo", temperature=0, openai_api_key="************************************")

# Define RetrievalQA pipeline
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    verbose=True
)

# Define question prompts
enhanced_prompts = [
    "What was Meta's total headcount at the end of 2023, and how does it compare to the headcount at the end of 2022?",
    "Which quarter of 2023 is being reported, and on what date did the quarter end?",
    "What were Meta's Q4 2023 financial highlights, including revenue, gross margin, operating expenses, operating margin, net income, and EPS? Provide figures and year-over-year comparisons.",
    "How much did Meta spend on restructuring activities in Q4 2023 and the entire year 2023? Provide figures in millions.",
    "What were the year-over-year trends in ad impressions and the average price per ad for Q4 2023 and the entire year?",
    "What is Meta's revenue forecast for Q1 2024? Provide specific ranges if available.",
    "What were Meta's total costs and expenses for Q4 2023 and the entire FY 2023? Include year-over-year comparisons and specific figures in millions.",
    "As of December 31, 2023, how much cash, cash equivalents, and marketable securities did Meta report? Provide the exact figure in millions.",
    "In 2023, where did Meta allocate its major investments? Highlight specific areas such as technology, infrastructure, or other initiatives.",
    "What was the financial performance of the Family of Apps and Reality Labs segments in Q4 2023, including revenue and operating income figures?",
    "What was Meta's free cash flow for Q4 2023 and the entire year 2023?",
    "What changes did Meta make to its stock repurchase program or dividends in 2024? Include details about any new authorizations, dividend policies, or other announcements.",
    "What risks did Meta highlight for 2024? Provide details on regulatory, operational, and market-related risks.",
    "What factors contributed to Meta's revenue growth in Q4 2023? Include key metrics or events driving this growth.",
    "How did Reality Labs perform financially in 2023, and what are Meta's plans for this segment in 2024?"
]

# Generate answers
generated_answers = []
for question in enhanced_prompts:
    print(f"Processing Question: {question}\n")
    start_time = time.time()
    retrieved_docs = retriever.get_relevant_documents(question)
    if not retrieved_docs:
        print(f"No relevant documents found for question: {question}")
        continue

    context = summarize_chunks(retrieved_docs)
    refined_prompt = f"""
    Answer the question concisely based on the context:
    Context:
    {context}
    Question:
    {question}
    """
    try:
        result = qa.run(refined_prompt)
    except Exception as e:
        result = f"Error during processing: {e}"
    generated_answers.append(result)
    print(f"Processed in {round(time.time() - start_time, 3)} seconds.\nResult: {result}\n")

# Calculate ROUGE scores (if you have reference answers)
reference_answers = [
    "By the end of 2023, Meta had 67,317 employees. In 2022, Meta had 87,314 employees, so Meta had a 22.9% decrease in 2023 compared to 2022.",
    "The reported quarter is the fourth quarter of 2023. The quarter ended on December 31, 2023.",
    "In Q4 2023, total revenue was $40,111 million, showing a 25% increase year-over-year. Operating expenses were $23,727 million, reflecting an 8% decrease year-over-year. The operating margin was 41%, up 21 percentage points from the previous year. Net income for Q4 2023 was $14,017 million, marking a 201% year-over-year increase. Earnings per share (EPS) for Q4 2023 were $5.33, representing a 203% increase from the previous year.",
    "Meta spent $461 million on restructuring activities in Q4 2023 and $1,610 million for the entire year 2023.",
    "Ad impressions increased by 28% year-over-year, and the average price per ad decreased by 9% for Q4 2023 and the entire year.",
    "Meta expects revenue for Q1 2024 to range between $34.5 billion and $37 billion.",
    "Total costs and expenses were $23.73 billion for Q4 2023 and $88.15 billion for the full year 2023.",
    "Meta had $65.40 billion in cash, cash equivalents, and marketable securities as of December 31, 2023.",
    "In 2023, Meta allocated major investments towards advancing AI and the metaverse.",
    "Family of Apps segment had a revenue of $39,040 million and Reality Labs segment had a revenue of $1,071 million in Q4 2023. Operating income figures are not provided in the context.",
    "Meta's free cash flow was $11,500.0 million for Q4 2023 and $43,010.0 million for the entire year 2023.",
    "Meta announced a $50 billion increase in their share repurchase authorization and initiated a quarterly dividend of $0.50 per share in 2024.",
    "Meta highlighted risks associated with new products and changes to existing products, including metaverse efforts, as well as risks related to community growth, user experience, brand reputation, privacy, safety, security, and content review efforts. Additionally, they mentioned risks related to government actions, litigation, privacy concerns, acquisitions, security breaches, scale management, and market conditions affecting dividend payments.",
    "Meta's revenue growth in Q4 2023 was driven by a 25% increase in revenue compared to the same period in 2022. This growth was attributed to the continued growth of the community and business, as well as progress made in advancing AI and the metaverse.",
    "Reality Labs performed with increasing operating losses in 2023. Meta plans to continue investing in product development efforts and scaling the ecosystem for Reality Labs in 2024, expecting operating losses to increase meaningfully year-over-year."
]

questions = [
    "How did Meta’s workforce change by the end of 2023?",
    "What is the report quarter, and when did it end?",
    "What were the key financial highlights this quarter (revenue, gross margin, operating expenses, operating margin, net income, and EPS)?",
    "How much did Meta spend on restructuring for the whole year and Q4?",
    "What happened with Meta’s ad impressions and average price per ad in Q4 and for the whole year?",
    "What’s the revenue outlook for Q1 2024?",
    "What were Meta’s total costs and expenses for Q4 and the full year 2023?",
    "How much cash and marketable securities did Meta have on hand as of December 31, 2023?",
    "What were the main areas Meta invested in during 2023?",
    "How did the Family of Apps and Reality Labs perform in Q4 2023?",
    "How much free cash flow did Meta generate in Q4 and the full year 2023?",
    "Did Meta make any changes to its stock repurchase program or dividends for 2024?",
    "What risks did Meta highlight for 2024?",
    "What drove Meta’s revenue growth in Q4 2023?",
    "How did Reality Labs perform throughout 2023, and what’s Meta’s plan for 2024?"
]
rouge_scores = rouge.compute(predictions=generated_answers, references=reference_answers)

# Print results
print("\nGenerated Answers:")
for i, (q, ans) in enumerate(zip(questions, generated_answers)):
    print(f"Q{i + 1}: {q}\nA: {ans}\n")
print("ROUGE scores:", rouge_scores)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing Question: What was Meta's total headcount at the end of 2023, and how does it compare to the headcount at the end of 2022?



Batches:   0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 17.683 seconds.
Result: Meta's total headcount at the end of 2023 was 67,317, which was a decrease of 22% compared to the headcount at the end of 2022.

Processing Question: Which quarter of 2023 is being reported, and on what date did the quarter end?



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your min_length=50 must be inferior than your max_length=5.
Your min_length=50 must be inferior than your max_length=5.
Your min_length=50 must be inferior than your max_length=5.
Your min_length=50 must be inferior than your max_length=5.
Your min_length=50 must be inferior than your max_length=5.




[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 17.189 seconds.
Result: The fourth quarter of 2023 is being reported, and it ended on December 31, 2023.

Processing Question: What were Meta's Q4 2023 financial highlights, including revenue, gross margin, operating expenses, operating margin, net income, and EPS? Provide figures and year-over-year comparisons.



Batches:   0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 17.792 seconds.
Result: In Q4 2023, Meta reported revenue of $40,111 million, a 25% increase year-over-year. The operating margin was 41%, net income was $16,384 million, a 156% increase year-over-year, and EPS was not provided in the context.

Processing Question: How much did Meta spend on restructuring activities in Q4 2023 and the entire year 2023? Provide figures in millions.



Batches:   0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 18.72 seconds.
Result: Meta spent $738 million on restructuring activities in Q4 2023 and $1,994 million for the entire year 2023.

Processing Question: What were the year-over-year trends in ad impressions and the average price per ad for Q4 2023 and the entire year?



Batches:   0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 22.253 seconds.
Result: Ad impressions increased by 28% year-over-year, while the average price per ad decreased by 9% for both Q4 2023 and the full year.

Processing Question: What is Meta's revenue forecast for Q1 2024? Provide specific ranges if available.



Batches:   0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 17.866 seconds.
Result: Based on the provided context, there is no specific mention of Meta's revenue forecast for Q1 2024.

Processing Question: What were Meta's total costs and expenses for Q4 2023 and the entire FY 2023? Include year-over-year comparisons and specific figures in millions.



Batches:   0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 18.873 seconds.
Result: Total costs and expenses for Q4 2023 were not provided in the context.

Processing Question: As of December 31, 2023, how much cash, cash equivalents, and marketable securities did Meta report? Provide the exact figure in millions.



Batches:   0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 18.66 seconds.
Result: As of December 31, 2023, Meta reported $41,862 million in cash, cash equivalents, and marketable securities.

Processing Question: In 2023, where did Meta allocate its major investments? Highlight specific areas such as technology, infrastructure, or other initiatives.



Batches:   0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 17.163 seconds.
Result: The context does not provide specific information about where Meta allocated its major investments in 2023.

Processing Question: What was the financial performance of the Family of Apps and Reality Labs segments in Q4 2023, including revenue and operating income figures?



Batches:   0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 17.035 seconds.
Result: Family of Apps had a revenue of $39,040 million and an operating income of $21,030 million in Q4 2023. Reality Labs had a revenue of $1,071 million and an operating loss of $4,646 million in Q4 2023.

Processing Question: What was Meta's free cash flow for Q4 2023 and the entire year 2023?



Batches:   0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 19.468 seconds.
Result: For Q4 2023, Meta's free cash flow was 11500.0 million, and for the full year 2023, it was 43010.0 million.

Processing Question: What changes did Meta make to its stock repurchase program or dividends in 2024? Include details about any new authorizations, dividend policies, or other announcements.



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your min_length=50 must be inferior than your max_length=5.
Your min_length=50 must be inferior than your max_length=5.
Your min_length=50 must be inferior than your max_length=5.
Your min_length=50 must be inferior than your max_length=5.
Your min_length=50 must be inferior than your max_length=5.




[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 13.283 seconds.
Result: In 2024, Meta announced a $50 billion increase in its share repurchase authorization.

Processing Question: What risks did Meta highlight for 2024? Provide details on regulatory, operational, and market-related risks.



Batches:   0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 18.797 seconds.
Result: Meta highlighted risks associated with new products and changes to existing products, including metaverse efforts, as well as risks related to maintaining brand reputation, privacy, safety, security, and content review efforts. Regulatory risks included government actions restricting access to products or advertising sales. Operational risks included security breaches, managing scale, and geographically-dispersed operations. Market-related risks included competition, market conditions affecting dividend payments, and potential impact of macroeconomic conditions.

Processing Question: What factors contributed to Meta's revenue growth in Q4 2023? Include key metrics or events driving this growth.



Batches:   0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 18.052 seconds.
Result: Meta's revenue growth in Q4 2023 was driven by a 25% increase in revenue compared to the same period in 2022. This growth was attributed to the company's expanding community and business, as mentioned by Mark Zuckerberg, Meta's founder and CEO.

Processing Question: How did Reality Labs perform financially in 2023, and what are Meta's plans for this segment in 2024?



Batches:   0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 20.359 seconds.
Result: Reality Labs had an operating loss in 2023. In 2024, Meta expects operating losses for Reality Labs to increase meaningfully due to ongoing product development efforts in augmented reality/virtual reality and investments to further scale the ecosystem.


Generated Answers:
Q1: How did Meta’s workforce change by the end of 2023?
A: Meta's total headcount at the end of 2023 was 67,317, which was a decrease of 22% compared to the headcount at the end of 2022.

Q2: What is the report quarter, and when did it end?
A: The fourth quarter of 2023 is being reported, and it ended on December 31, 2023.

Q3: What were the key financial highlights this quarter (revenue, gross margin, operating expenses, operating margin, net income, and EPS)?
A: In Q4 2023, Meta reported revenue of $40,111 million, a 25% increase year-over-year. The operating margin was 41%, net income was $16,384 million, a 156% increase year-over-year, and EPS was no

## Rag Pipeline with Best Rouge Score

In [None]:
from langchain.chains import RetrievalQA
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from transformers import pipeline
import time
import torch
import evaluate

# Load ROUGE evaluation metric
rouge = evaluate.load("rouge")

# Chunking function
def chunk_text(text, chunk_size=1500, chunk_overlap=100):
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = text_splitter.split_text(text)
    return [{"text": chunk, "metadata": {"section": "Unknown"}} for chunk in chunks]

# Summarize chunks
def summarize_chunks(retrieved_docs, summarization_threshold=512):
    device = 0 if torch.cuda.is_available() else -1
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)
    context = " ".join([doc.page_content for doc in retrieved_docs])
    if len(context.split()) < summarization_threshold:
        return context  # Skip summarization for short texts
    summaries = []
    for doc in retrieved_docs:
        input_length = len(doc.page_content.split())
        max_length = min(300, input_length // 2)
        min_length = max(50, input_length // 4)
        summary = summarizer(doc.page_content, max_length=max_length, min_length=min_length, do_sample=False)
        summaries.append(summary[0]['summary_text'])
    return " ".join(summaries)

# Process text into chunks
text_chunks = chunk_text(financial_highlights_cleaned)  # Use your cleaned and preprocessed text
documents = [Document(page_content=chunk["text"], metadata=chunk["metadata"]) for chunk in text_chunks]

# Initialize embeddings and vector store
model_kwargs = {"device": "cuda"} if torch.cuda.is_available() else {"device": "cpu"}
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2", model_kwargs=model_kwargs)

vectordb = Chroma.from_documents(documents=documents, embedding=embeddings, persist_directory="chroma_db")
retriever = vectordb.as_retriever(search_kwargs={"k": 10})

# Initialize LLM
llm = OpenAI(model_name="gpt-3.5-turbo", temperature=0, openai_api_key="*******************************************************")

# Define RetrievalQA pipeline
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    verbose=True
)

# Define question prompts
enhanced_prompts = [
    "What was Meta's total headcount at the end of 2023, and how does it compare to the headcount at the end of 2022?",
    "Which quarter of 2023 is being reported, and on what date did the quarter end?",
    "What were Meta's Q4 2023 financial highlights, including revenue, gross margin, operating expenses, operating margin, net income, and EPS? Provide figures and year-over-year comparisons.",
    "How much did Meta spend on restructuring activities in Q4 2023 and the entire year 2023? Provide figures in millions.",
    "What were the year-over-year trends in ad impressions and the average price per ad for Q4 2023 and the entire year?",
    "What is Meta's revenue forecast for Q1 2024? Provide specific ranges or figures mentioned in the financial report.",
    "What were Meta's total costs and expenses for Q4 2023 and the entire FY 2023? Provide figures and year-over-year comparisons.",
    "As of December 31, 2023, how much cash, cash equivalents, and marketable securities did Meta report? Provide the exact figure in millions.",
    "In 2023, where did Meta allocate its major investments? Highlight specific areas such as technology, infrastructure, or other initiatives.",
    "What was the financial performance of the Family of Apps and Reality Labs segments in Q4 2023, including revenue and operating income figures?",
    "What was Meta's free cash flow for Q4 2023 and the entire year 2023?",
    "What changes did Meta make to its stock repurchase program or dividends in 2024? Include details about any new authorizations, dividend policies, or other announcements.",
    "What risks did Meta highlight for 2024? Provide details on regulatory, operational, and market-related risks.",
    "What factors contributed to Meta's revenue growth in Q4 2023? Include key metrics or events driving this growth.",
    "How did Reality Labs perform financially in 2023, and what are Meta's plans for this segment in 2024?"
]

# Generate answers
generated_answers = []
for question in enhanced_prompts:
    print(f"Processing Question: {question}\n")
    start_time = time.time()
    retrieved_docs = retriever.get_relevant_documents(question)
    if not retrieved_docs:
        print(f"No relevant documents found for question: {question}")
        continue

    context = summarize_chunks(retrieved_docs)
    refined_prompt = f"""
    Answer the question concisely based on the context:
    Context:
    {context}
    Question:
    {question}
    """
    try:
        result = qa.run(refined_prompt)
    except Exception as e:
        result = f"Error during processing: {e}"
    generated_answers.append(result)
    print(f"Processed in {round(time.time() - start_time, 3)} seconds.\nResult: {result}\n")

# Calculate ROUGE scores (if you have reference answers)
reference_answers = [
    "By the end of 2023, Meta had 67,317 employees. In 2022, Meta had 87,314 employees, so Meta had a 22.9% decrease in 2023 compared to 2022.",
    "The reported quarter is the fourth quarter of 2023. The quarter ended on December 31, 2023.",
    "In Q4 2023, Meta's revenue was $40,111 million, a 25% increase year-over-year. The operating margin was 41%, net income was $14,017 million, a 201% increase year-over-year, and EPS was $5.33, a 203% increase year-over-year",
    "Meta spent 1150.0 million on restructuring activities in Q4 2023 and 3450.0 million for the entire year 2023.",
    "For Q4 2023, ad impressions increased by 21% year-over-year, and the average price per ad increased by 2% year-over-year. For the full year 2023, ad impressions increased by 28% year-over-year, and the average price per ad decreased by 9%.",
    "Meta's revenue forecast for Q1 2024 is in the range of $34.5–37 billion.",
    "Total costs and expenses for Q4 2023 were $23,727.0 million, an 8% decrease year-over-year. For the full year 2023, total costs and expenses were $88,151.0 million, a 1% increase year-over-year.",
    "Meta reported $65,400.0 million in cash, cash equivalents, and marketable securities as of December 31, 2023.",
    "In 2023, Meta allocated major investments in servers, including both AI and non-AI hardware, and data centers as part of their new data center architecture.",
    "Family of Apps segment had a revenue of $39,040 million and Reality Labs segment had a revenue of $1,071 million in Q4 2023. Operating income figures are not provided in the context.",
    "Meta's free cash flow was $11,500.0 million for Q4 2023 and $43,010.0 million for the full year 2023.",
    "Meta initiated a quarterly dividend of $0.50 per share of outstanding common stock and announced a $50 billion increase in its share repurchase authorization in 2024.",
    "Meta highlighted risks associated with new products and changes to existing products, their metaverse efforts, regulatory risks, changes to third-party policies, acquisitions, security breaches, and scale and geographical operations.",
    "Meta's revenue growth in Q4 2023 was driven by a 25% increase in revenue compared to the same period in 2022. This growth was attributed to the continued growth of the community and business, as well as progress made in advancing AI and the metaverse.",
    "Reality Labs experienced an increase in operating losses in 2023 due to ongoing product development efforts. In 2024, Meta expects these losses to increase significantly."
]
questions = [
    "How did Meta’s workforce change by the end of 2023?",
    "What is the report quarter, and when did it end?",
    "What were the key financial highlights this quarter (revenue, gross margin, operating expenses, operating margin, net income, and EPS)?",
    "How much did Meta spend on restructuring for the whole year and Q4?",
    "What happened with Meta’s ad impressions and average price per ad in Q4 and for the whole year?",
    "What’s the revenue outlook for Q1 2024?",
    "What were Meta’s total costs and expenses for Q4 and the full year 2023?",
    "How much cash and marketable securities did Meta have on hand as of December 31, 2023?",
    "What were the main areas Meta invested in during 2023?",
    "How did the Family of Apps and Reality Labs perform in Q4 2023?",
    "How much free cash flow did Meta generate in Q4 and the full year 2023?",
    "Did Meta make any changes to its stock repurchase program or dividends for 2024?",
    "What risks did Meta highlight for 2024?",
    "What drove Meta’s revenue growth in Q4 2023?",
    "How did Reality Labs perform throughout 2023, and what’s Meta’s plan for 2024?"
]
rouge_scores = rouge.compute(predictions=generated_answers, references=reference_answers)

# Print results
print("\nGenerated Answers:")
for i, (q, ans) in enumerate(zip(questions, generated_answers)):
    print(f"Q{i + 1}: {q}\nA: {ans}\n")
print("ROUGE scores:", rouge_scores)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing Question: What was Meta's total headcount at the end of 2023, and how does it compare to the headcount at the end of 2022?



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your min_length=50 must be inferior than your max_length=42.
Your min_length=50 must be inferior than your max_length=42.
Your min_length=50 must be inferior than your max_length=42.




[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 18.227 seconds.
Result: Meta's total headcount at the end of 2023 was 67,317, which was a decrease of 22% compared to the headcount at the end of 2022.

Processing Question: Which quarter of 2023 is being reported, and on what date did the quarter end?



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your min_length=50 must be inferior than your max_length=42.
Your min_length=50 must be inferior than your max_length=42.
Your min_length=50 must be inferior than your max_length=42.




[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 20.654 seconds.
Result: The financial highlights are for the fourth quarter of 2023, which ended on December 31, 2023.

Processing Question: What were Meta's Q4 2023 financial highlights, including revenue, gross margin, operating expenses, operating margin, net income, and EPS? Provide figures and year-over-year comparisons.



Batches:   0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 22.621 seconds.
Result: In Q4 2023, Meta's revenue was $40,111 million, a 25% increase year-over-year. The operating margin was 41%, net income was $14,017 million, a 201% increase year-over-year, and EPS was $5.33, a 203% increase year-over-year.

Processing Question: How much did Meta spend on restructuring activities in Q4 2023 and the entire year 2023? Provide figures in millions.



Batches:   0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 20.785 seconds.
Result: Meta spent 1150.0 million on restructuring activities in Q4 2023 and 3450.0 million for the entire year 2023.

Processing Question: What were the year-over-year trends in ad impressions and the average price per ad for Q4 2023 and the entire year?



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your min_length=50 must be inferior than your max_length=42.
Your min_length=50 must be inferior than your max_length=42.
Your min_length=50 must be inferior than your max_length=42.




[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 21.214 seconds.
Result: For Q4 2023, ad impressions increased by 21% year-over-year, and the average price per ad increased by 2% year-over-year. For the full year 2023, ad impressions increased by 28% year-over-year, and the average price per ad decreased by 9%.

Processing Question: What is Meta's revenue forecast for Q1 2024? Provide specific ranges or figures mentioned in the financial report.



Batches:   0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 22.555 seconds.
Result: Meta's revenue forecast for Q1 2024 is in the range of 34.5-37 billion.

Processing Question: What were Meta's total costs and expenses for Q4 2023 and the entire FY 2023? Provide figures and year-over-year comparisons.



Batches:   0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 21.24 seconds.
Result: Total costs and expenses for Meta were 23,727.0 million for Q4 2023, which was an 8% decrease year-over-year. For the full year 2023, total costs and expenses were 88,151.0 million, representing a 1% increase year-over-year.

Processing Question: As of December 31, 2023, how much cash, cash equivalents, and marketable securities did Meta report? Provide the exact figure in millions.



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your min_length=50 must be inferior than your max_length=42.
Your min_length=50 must be inferior than your max_length=42.
Your min_length=50 must be inferior than your max_length=42.




[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 18.808 seconds.
Result: Meta reported 65400.0 million in cash, cash equivalents, and marketable securities as of December 31, 2023.

Processing Question: In 2023, where did Meta allocate its major investments? Highlight specific areas such as technology, infrastructure, or other initiatives.



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your min_length=50 must be inferior than your max_length=42.
Your min_length=50 must be inferior than your max_length=42.
Your min_length=50 must be inferior than your max_length=42.




[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 18.007 seconds.
Result: In 2023, Meta allocated major investments in servers, including AI and non-AI hardware, and data centers as part of their capital expenditures.

Processing Question: What was the financial performance of the Family of Apps and Reality Labs segments in Q4 2023, including revenue and operating income figures?



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your min_length=50 must be inferior than your max_length=42.




[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 22.992 seconds.
Result: The financial performance of the Family of Apps and Reality Labs segments in Q4 2023 included revenue of $40,111 million and operating income of $16,384 million.

Processing Question: What was Meta's free cash flow for Q4 2023 and the entire year 2023?



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your min_length=50 must be inferior than your max_length=42.




[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 22.016 seconds.
Result: Meta's free cash flow for Q4 2023 was 11500.0 million, and for the entire year 2023, it was 43010.0 million.

Processing Question: What changes did Meta make to its stock repurchase program or dividends in 2024? Include details about any new authorizations, dividend policies, or other announcements.



Batches:   0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 18.98 seconds.
Result: Meta initiated a quarterly cash dividend of $0.50 per share of outstanding common stock in 2024. Additionally, they announced a $50 billion increase in their share repurchase authorization.

Processing Question: What risks did Meta highlight for 2024? Provide details on regulatory, operational, and market-related risks.



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your min_length=50 must be inferior than your max_length=42.




[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 18.925 seconds.
Result: Meta highlighted risks associated with new products and changes to existing products, including their metaverse efforts, as well as risks related to mobile operating systems, networks, and standards that they do not control. They also mentioned risks from changes to third-party policies impacting advertising practices, acquisitions, security breaches, and managing their scale and geographically-dispersed operations. Additionally, they noted regulatory risks, specifically mentioning the Federal Trade Commission seeking to modify their existing consent order, which could have an adverse impact on their business. Finally, they highlighted the importance of maintaining operating discipline and improving advertising performance in the face of market conditions.

Processing Question: What factors contributed to Meta's revenue growth in Q4 2023? Include key metrics or events driving this growth.



Batches:   0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 20.656 seconds.
Result: Ad impressions increased by 28% year-over-year, and the average price per ad decreased by 9% year-over-year, contributing to Meta's revenue growth in Q4 2023.

Processing Question: How did Reality Labs perform financially in 2023, and what are Meta's plans for this segment in 2024?



Batches:   0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m
Processed in 21.97 seconds.
Result: Reality Labs had operating losses that increased meaningfully year-over-year in 2023 due to ongoing product development efforts in augmented reality/virtual reality. Meta expects these losses to continue in 2024 as they invest in scaling their ecosystem and AI research.


Generated Answers:
Q1: How did Meta’s workforce change by the end of 2023?
A: Meta's total headcount at the end of 2023 was 67,317, which was a decrease of 22% compared to the headcount at the end of 2022.

Q2: What is the report quarter, and when did it end?
A: The financial highlights are for the fourth quarter of 2023, which ended on December 31, 2023.

Q3: What were the key financial highlights this quarter (revenue, gross margin, operating expenses, operating margin, net income, and EPS)?
A: In Q4 2023, Meta's revenue was $40,111 million, a 25% increase year-over-year. The operating margin was 41%, net income was $14,017 million, a 201% increase year-

rouge1: 0.761 - This indicates that about 76.2% of the words in the generated answers align with the reference answers.

rouge2: 0.648 - About 64.8% of the bigrams in the generated answers match those in the reference answers.

rougeL: 0.716 - Approximately 71.6% of the longest word sequences are shared between the generated and reference answers.

rougeLsum: 0.713 - About 71.3% structural similarity between the summaries or answers.