In [None]:
%%capture
!pip install llama-index
!pip install llama-index-core
!pip install llama-index-embeddings-openai
!pip install llama-index-postprocessor-flag-embedding-reranker
!pip install git+https://github.com/FlagOpen/FlagEmbedding.git
!pip install llama-parse
!pip install llama-index-embeddings-fastembed
!pip install fastembed
!pip install llama-index-llms-groq

In [None]:

# llama-parse is async-first; running its async code in a notebook requires nest_asyncio
import nest_asyncio
nest_asyncio.apply()

# API access to llama-cloud (placeholder only; the key is read from the environment below)
#LLAMA_CLOUD_API_KEY = "llx-*****"

# Groq API key for the LLM (placeholder only; never commit a real key)
#GROQ_API_KEY = "gsk_*****"

Using the new LlamaParse PDF reader to parse the PDF, we compare two different retrieval/query engine strategies:

(1) Use the raw Markdown text as nodes to build an index and apply a simple query engine to generate the results; (2) use MarkdownElementNodeParser to parse the Markdown output of LlamaParse and build a recursive retriever query engine for generation.

In [None]:
import os
import pickle
from llama_parse import LlamaParse

# Read the Llama Cloud key from the environment; do not paste the real key into the notebook.
LLAMA_CLOUD_API_KEY = os.environ.get('LLAMA_CLOUD_API_KEY', '')

# Verify the API key. A non-empty placeholder default would always pass a plain
# truthiness test, so the placeholder text is rejected explicitly as well.
if not LLAMA_CLOUD_API_KEY or LLAMA_CLOUD_API_KEY == '<YOUR_API_KEY>':
    raise ValueError("Missing API Key for Llama Cloud. Set LLAMA_CLOUD_API_KEY in environment variables.")

# Define parsing function
def load_or_parse_data(
    file_path="/content/Mastercard_2022_Report.pdf",
    data_file="./parsed_data.pkl",
):
    """Return the parsed documents for ``file_path``, using a pickle cache.

    If ``data_file`` exists, its pickled content is returned and the PDF is not
    touched, so a cache hit works even after the PDF has been moved. Otherwise
    the PDF is parsed with LlamaParse (markdown output) and the result is
    written to ``data_file`` before being returned.

    Args:
        file_path: Path of the PDF to parse when no cache is available.
        data_file: Path of the pickle cache to read from / write to.

    Returns:
        The object produced by ``LlamaParse.load_data`` (or the cached copy).

    Raises:
        FileNotFoundError: No cache exists and ``file_path`` does not exist.
        RuntimeError: Parsing raised an error or produced no documents.
    """
    # Load from cache if available.
    # NOTE: pickle.load can execute arbitrary code; only load a cache file that
    # this notebook wrote itself.
    if os.path.exists(data_file):
        with open(data_file, "rb") as f:
            return pickle.load(f)

    # The source PDF is only required when we actually have to parse.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    # The report is a Form 10-K for the fiscal year ended December 31, 2022
    # (see the text extracted from the PDF later in the notebook).
    parsing_instruction = """
    The provided document is an annual financial report (Form 10-K). It includes financial statements,
    management analysis, and disclosures. Focus on accuracy while extracting data tables and insights.
    """
    parser = LlamaParse(
        api_key=LLAMA_CLOUD_API_KEY,
        result_type="markdown",
        parsing_instruction=parsing_instruction
    )

    # Parse data, keeping the original exception attached as the cause.
    try:
        parsed_data = parser.load_data(file_path)
    except Exception as e:
        raise RuntimeError(f"Parsing failed: {str(e)}") from e

    # Never cache an empty result: every later run would silently reuse it.
    if not parsed_data:
        raise RuntimeError("Parsing failed: the parser returned no documents")

    # Cache the parsed data
    with open(data_file, "wb") as f:
        pickle.dump(parsed_data, f)

    return parsed_data

# Execute the parsing function. On failure, report the error and re-raise it:
# swallowing it would leave `documents` undefined and make later cells fail
# with an unrelated NameError.
try:
    documents = load_or_parse_data()
except Exception as e:
    print(f"Error: {e}")
    raise
else:
    print("Parsing successful!")


Parsing successful!


In [None]:
import os

# Sanity check: report the size of the PDF, or fail loudly if it is not there.
file_path = "/content/Mastercard_2022_Report.pdf"
if os.path.exists(file_path):
    print("File exists:", os.path.getsize(file_path), "bytes")
else:
    raise FileNotFoundError(f"File not found at: {file_path}")


File exists: 3023098 bytes


In [None]:
# Scratch check: write a one-line text file to /content/test.txt.
# NOTE(review): no later visible cell reads this file; looks like a leftover write-access test.
sample_text = "This is a sample document for testing purposes."
with open("/content/test.txt", "w") as f:
    f.write(sample_text)


In [None]:
# PyMuPDF provides the `fitz` module imported in the following cells.
!pip install PyMuPDF




In [None]:
import fitz  # PyMuPDF
# The module docstring carries the version banner (see the saved output of this cell).
print("PyMuPDF version:", fitz.__doc__)


PyMuPDF version: PyMuPDF 1.25.1: Python bindings for the MuPDF 1.25.2 library (rebased implementation).
Python 3.10 running on linux (64-bit).



In [None]:
import fitz  # PyMuPDF

file_path = "/content/Mastercard_2022_Report.pdf"

# Open the PDF and collect the plain text of every page. Joining the per-page
# strings once avoids re-copying the accumulated text on every `+=`.
with fitz.open(file_path) as doc:
    text = "".join(page.get_text() for page in doc)

print(text[:500])  # Display the first 500 characters


UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
Form 10-K
☒
ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the fiscal year ended December 31, 2022 
Or
☐
TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the transition period from              to             
Commission file number: 001-32877 
Mastercard Incorporated
(Exact name of registrant as specified in its charter)
Delaware
13-417255


In [None]:

import requests
# Connectivity check against the Groq API host. requests has no default
# timeout, so set one to keep a stalled connection from hanging the notebook.
response = requests.get("https://api.groq.com/", timeout=10)
print(response.status_code)


200


In [None]:
# NOTE(review): both packages are already installed by the first cell of this notebook;
# this cell repeats that work.
!pip install llama-index
!pip install fastembed




In [None]:
import os
from llama_index.core import VectorStoreIndex
from llama_index.llms.groq import Groq
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.core import Settings

# Read the Groq key from the environment. Do not assign a literal to
# os.environ here: that replaces any real key with the placeholder string.
GROQ_API_KEY = os.getenv('GROQ_API_KEY')
if not GROQ_API_KEY or GROQ_API_KEY == '<YOUR_GROQ_API_KEY>':
    raise ValueError("Missing API key for Groq. Set GROQ_API_KEY in environment variables.")

# Initialize embedding model and LLM
embed_model = FastEmbedEmbedding(model_name="BAAI/bge-base-en-v1.5")
llm = Groq(model="mixtral-8x7b-32768", api_key=GROQ_API_KEY)

# Apply settings so llama-index uses these models by default
Settings.llm = llm
Settings.embed_model = embed_model


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from llama_index.core.node_parser import MarkdownElementNodeParser

# Markdown element parser backed by a Groq-hosted Mixtral model, using 8 workers.
node_parser = MarkdownElementNodeParser(
    llm=Groq(model="mixtral-8x7b-32768", api_key=GROQ_API_KEY),
    num_workers=8,
)



In [None]:

# Turn the loaded `documents` into nodes using the current `node_parser`.
nodes = node_parser.get_nodes_from_documents(documents)

In [None]:
import os
from llama_index.core.node_parser import MarkdownElementNodeParser
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core import Settings

# Read the OpenAI key from the environment instead of assigning a literal to
# os.environ, which replaces any real key with the placeholder string.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if not OPENAI_API_KEY or OPENAI_API_KEY == '<YOUR_OPENAI_API_KEY>':
    raise ValueError("Missing API key for OpenAI. Set OPENAI_API_KEY in environment variables.")

# Initialize embedding model and LLM
embed_model = OpenAIEmbedding(model="text-embedding-3-small", api_key=OPENAI_API_KEY)
llm = OpenAI(model="gpt-3.5-turbo-0125", api_key=OPENAI_API_KEY)

# Apply settings so llama-index uses these models by default
Settings.embed_model = embed_model
Settings.llm = llm

# Initialize the node parser with the same LLM
node_parser = MarkdownElementNodeParser(llm=llm, num_workers=8)

print("Setup completed successfully!")


Setup completed successfully!


In [None]:

# `show_progress` is the keyword that enables the progress bar; `progress` is
# not a named parameter of get_nodes_from_documents.
nodes = node_parser.get_nodes_from_documents(documents, show_progress=True)

In [None]:

# NOTE(review): the repr displayed by this cell includes the `api_key` field (see the
# saved output); clear the output before sharing the notebook with a real key.
llm

OpenAI(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7e6450956aa0>, system_prompt=None, messages_to_prompt=<function messages_to_prompt at 0x7e6426228820>, completion_to_prompt=<function default_completion_to_prompt at 0x7e64262a31c0>, output_parser=None, pydantic_program_mode=<PydanticProgramMode.DEFAULT: 'default'>, query_wrapper_prompt=None, model='gpt-3.5-turbo-0125', temperature=0.1, max_tokens=None, logprobs=None, top_logprobs=0, additional_kwargs={}, max_retries=3, timeout=60.0, default_headers=None, reuse_client=True, api_key='<YOUR_OPENAI_API_KEY>', api_base='https://api.openai.com/v1', api_version='', strict=False)

In [None]:
# Split the parsed nodes into two collections; both are combined when the recursive index is built.
base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

In [None]:


# Index for the recursive strategy: built from the element-parser output (base nodes plus objects).
recursive_index = VectorStoreIndex(nodes=base_nodes+objects)
# Baseline index: built directly from the loaded documents.
raw_index = VectorStoreIndex.from_documents(documents)

In [None]:
# NOTE(review): FlagEmbedding is already installed from the same Git URL in the first cell.
!pip install git+https://github.com/FlagOpen/FlagEmbedding.git


Collecting git+https://github.com/FlagOpen/FlagEmbedding.git
  Cloning https://github.com/FlagOpen/FlagEmbedding.git to /tmp/pip-req-build-ppheusmx
  Running command git clone --filter=blob:none --quiet https://github.com/FlagOpen/FlagEmbedding.git /tmp/pip-req-build-ppheusmx
  Resolved https://github.com/FlagOpen/FlagEmbedding.git to commit 049882837fe3cffaa47d07fd33f153d5cfca6050
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
# Provides the `air_benchmark` module that the next cell imports as an install check.
!pip install air-benchmark




In [None]:
# Import check only: the module is not used again in this cell.
import air_benchmark
print("air_benchmark successfully installed!")


air_benchmark successfully installed!


In [None]:
# NOTE(review): no visible cell imports faiss directly; confirm this dependency is needed.
!pip install faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m62.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [None]:
# NOTE(review): repeat of an earlier FlagEmbedding install from the same Git URL.
!pip install git+https://github.com/FlagOpen/FlagEmbedding.git


Collecting git+https://github.com/FlagOpen/FlagEmbedding.git
  Cloning https://github.com/FlagOpen/FlagEmbedding.git to /tmp/pip-req-build-ryhs4nsf
  Running command git clone --filter=blob:none --quiet https://github.com/FlagOpen/FlagEmbedding.git /tmp/pip-req-build-ryhs4nsf
  Resolved https://github.com/FlagOpen/FlagEmbedding.git to commit 049882837fe3cffaa47d07fd33f153d5cfca6050
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
# NOTE(review): faiss-cpu, llama-index and FlagEmbedding are installed by earlier cells;
# the torch line is the new part and pulls wheels from the PyTorch CPU index.
!pip install faiss-cpu
!pip install llama-index
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
!pip install git+https://github.com/FlagOpen/FlagEmbedding.git


Looking in indexes: https://download.pytorch.org/whl/cpu
Collecting git+https://github.com/FlagOpen/FlagEmbedding.git
  Cloning https://github.com/FlagOpen/FlagEmbedding.git to /tmp/pip-req-build-znj787kp
  Running command git clone --filter=blob:none --quiet https://github.com/FlagOpen/FlagEmbedding.git /tmp/pip-req-build-znj787kp
  Resolved https://github.com/FlagOpen/FlagEmbedding.git to commit 049882837fe3cffaa47d07fd33f153d5cfca6050
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
# Installs beir; the FlagEmbedding line repeats an earlier install from the same Git URL.
!pip install beir
!pip install git+https://github.com/FlagOpen/FlagEmbedding.git


Collecting beir
  Downloading beir-2.0.0.tar.gz (53 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.6/53.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting elasticsearch==7.9.1 (from beir)
  Downloading elasticsearch-7.9.1-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading elasticsearch-7.9.1-py2.py3-none-any.whl (219 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.2/219.2 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: beir
  Building wheel for beir (setup.py) ... [?25l[?25hdone
  Created wheel for beir: filename=beir-2.0.0-py3-none-any.whl size=63550 sha256=cd1904495c9325a0869db3dcf34536a560516feda728c4b1f001e53daf2af5f9
  Stored in directory: /root/.cache/pip/wheels/1c/14/96/c606ede3c10e9300ef771a6183af09d389459195f

In [None]:
# NOTE(review): all three commands repeat installs from earlier cells.
!pip install faiss-cpu
!pip install llama-index
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu


Looking in indexes: https://download.pytorch.org/whl/cpu


In [None]:
# List every installed package with its version (long output; clear it before sharing).
!pip freeze


absl-py==1.4.0
accelerate==1.2.1
aiohappyeyeballs==2.4.4
aiohttp==3.11.10
aiosignal==1.3.2
air-benchmark==0.1.0
alabaster==1.0.0
albucore==0.0.19
albumentations==1.4.20
altair==5.5.0
annotated-types==0.7.0
anyio==3.7.1
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
array_record==0.5.1
arviz==0.20.0
astropy==6.1.7
astropy-iers-data==0.2024.12.16.0.35.48
astunparse==1.6.3
async-timeout==4.0.3
atpublic==4.1.0
attrs==24.3.0
audioread==3.0.1
autograd==1.7.0
babel==2.16.0
backcall==0.2.0
beautifulsoup4==4.12.3
beir==2.0.0
bigframes==1.29.0
bigquery-magics==0.4.0
bleach==6.2.0
blinker==1.9.0
blis==0.7.11
blosc2==2.7.1
bokeh==3.6.2
Bottleneck==1.4.2
bqplot==0.12.43
branca==0.8.1
CacheControl==0.14.1
cachetools==5.5.0
catalogue==2.0.10
cbor==1.0.0
certifi==2024.12.14
cffi==1.17.1
chardet==5.2.0
charset-normalizer==3.4.0
chex==0.1.88
clarabel==0.9.0
click==8.1.7
cloudpathlib==0.20.0
cloudpickle==3.1.0
cmake==3.31.2
cmdstanpy==1.2.5
colorcet==3.1.0
coloredlogs==15.0.1
colorlover==0.3.0
colour==

In [None]:
# Print the installed beir version via pkg_resources.
# NOTE(review): the saved output echoes the import line, which looks like the tail of a
# warning (presumably the pkg_resources deprecation); the importlib.metadata cell that
# follows prints the same version using the standard library.
import pkg_resources
print(pkg_resources.get_distribution("beir").version)


2.0.0


  import pkg_resources


In [None]:
# Print the installed beir version using the standard-library metadata API.
import importlib.metadata
print(importlib.metadata.version("beir"))


2.0.0


In [None]:
# NOTE(review): beir is already installed by an earlier cell.
!pip install beir




In [None]:
# Show only the beir line of the installed-package list.
!pip freeze | grep beir


beir==2.0.0


In [None]:
# Import check only: the module is not used again in this cell.
import beir
print("BEIR module is successfully imported!")


BEIR module is successfully imported!


In [None]:
# Installs mteb; the other four commands repeat installs from earlier cells.
!pip install mteb
!pip install beir
!pip install faiss-cpu
!pip install git+https://github.com/FlagOpen/FlagEmbedding.git
!pip install llama-index


Collecting git+https://github.com/FlagOpen/FlagEmbedding.git
  Cloning https://github.com/FlagOpen/FlagEmbedding.git to /tmp/pip-req-build-iu1p1crh
  Running command git clone --filter=blob:none --quiet https://github.com/FlagOpen/FlagEmbedding.git /tmp/pip-req-build-iu1p1crh
  Resolved https://github.com/FlagOpen/FlagEmbedding.git to commit 049882837fe3cffaa47d07fd33f153d5cfca6050
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
# Create the data/ directory; -p makes this a no-op when it already exists.
!mkdir -p data


In [None]:
import os

# Confirm that data/ exists and list what it currently contains.
dir_path = "data/"
print(f"Directory exists: {os.path.exists(dir_path)}")
print("Files in directory:", os.listdir(dir_path))


Directory exists: True
Files in directory: []


In [None]:
import os
import shutil

# Move the report into the 'data/' folder. Guarded so the cell can be re-run:
# once the file has been moved the source no longer exists, and an unguarded
# shutil.move would raise on the second run.
source_pdf = "Mastercard_2022_Report.pdf"
target_pdf = "data/Mastercard_2022_Report.pdf"
if os.path.exists(source_pdf):
    shutil.move(source_pdf, target_pdf)
elif not os.path.exists(target_pdf):
    raise FileNotFoundError(f"File not found: {source_pdf}")

# Verify files in the 'data/' folder
print("Files in data/:", os.listdir("data/"))


Files in data/: ['Mastercard_2022_Report.pdf']


In [None]:
import os
# Show where relative paths (such as "data/") are resolved from.
print("Current working directory:", os.getcwd())


Current working directory: /content


In [None]:
import os

# Show the working directory and its entries, to confirm the PDF is no longer at the top level.
print("Current working directory:", os.getcwd())
print("Files in current directory:", os.listdir("."))


Current working directory: /content
Files in current directory: ['.config', 'parsed_data.pkl', 'test.txt', 'data', 'sample_data']


In [None]:
# Locate the report anywhere under /content.
!find /content -name "Mastercard_2022_Report.pdf"


/content/data/Mastercard_2022_Report.pdf


In [None]:
# NOTE(review): the source path is already inside the destination directory, and the saved
# output shows the command failing because the file now lives under /content/data/.
# Presumably a leftover from debugging; confirm whether it should be removed.
!mv "/content/Mastercard_2022_Report.pdf" "/content/"


mv: cannot stat '/content/Mastercard_2022_Report.pdf': No such file or directory


In [None]:
from llama_index.core import SimpleDirectoryReader

# Correct path to the file
file_path = "/content/data/Mastercard_2022_Report.pdf"

# Verify path exists (prints the result only; loading below still runs if it is False)
import os
print("File exists:", os.path.exists(file_path))

# Load the document
# NOTE(review): this rebinds `documents`, replacing the value assigned earlier from
# load_or_parse_data(). Every later cell that uses `documents` therefore works on the
# SimpleDirectoryReader output, not on the LlamaParse result. Confirm this is intended.
documents = SimpleDirectoryReader(input_files=[file_path]).load_data()
print(f"Loaded {len(documents)} documents.")


File exists: True
Loaded 132 documents.


In [None]:
# NOTE(review): every package in this cell is already installed by an earlier cell.
!pip install llama-index
!pip install faiss-cpu
!pip install git+https://github.com/FlagOpen/FlagEmbedding.git
!pip install beir
!pip install mteb


Collecting git+https://github.com/FlagOpen/FlagEmbedding.git
  Cloning https://github.com/FlagOpen/FlagEmbedding.git to /tmp/pip-req-build-90e1ubx9
  Running command git clone --filter=blob:none --quiet https://github.com/FlagOpen/FlagEmbedding.git /tmp/pip-req-build-90e1ubx9
  Resolved https://github.com/FlagOpen/FlagEmbedding.git to commit 049882837fe3cffaa47d07fd33f153d5cfca6050
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:

from llama_index.core import VectorStoreIndex
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings


In [None]:
import os

# Read the OpenAI key from the environment. Writing a placeholder into
# OPENAI_API_KEY and then reading a differently named variable means the two
# never match; read OPENAI_API_KEY first and keep the legacy name as a fallback.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') or os.getenv('OpenAI_API_Key_dec')
if not OPENAI_API_KEY or OPENAI_API_KEY == '<YOUR_OPENAI_API_KEY>':
    raise ValueError("Missing API key for OpenAI. Set OPENAI_API_KEY in environment variables.")

# Initialize embedding and LLM
embed_model = OpenAIEmbedding(model="text-embedding-3-small", api_key=OPENAI_API_KEY)
llm = OpenAI(model="gpt-3.5-turbo-0125", api_key=OPENAI_API_KEY)

# Apply Settings so llama-index uses these models by default
Settings.embed_model = embed_model
Settings.llm = llm


In [None]:
import os

# Confirm that an OpenAI API key is available without hardcoding it in the
# notebook and without echoing any part of it: even a "masked" print stores
# real key characters in the saved cell output.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    raise ValueError("Missing API key for OpenAI. Set OPENAI_API_KEY in environment variables.")
print("Using OpenAI API Key from the OPENAI_API_KEY environment variable.")


Using OpenAI API Key: sk-pr...W3ZIA


In [None]:
import requests

# Check that the key is accepted by listing the models it can access.
# requests has no default timeout, so set one to avoid hanging the notebook.
headers = {"Authorization": f"Bearer {OPENAI_API_KEY}"}
response = requests.get("https://api.openai.com/v1/models", headers=headers, timeout=10)

print("Status code:", response.status_code)
# On success print only the model ids instead of the full JSON payload;
# on failure print the payload, which carries the error details.
payload = response.json()
if response.ok:
    print("Response:", [model.get("id") for model in payload.get("data", [])])
else:
    print("Response:", payload)


Status code: 200
Response: {'object': 'list', 'data': [{'id': 'gpt-4o-audio-preview-2024-10-01', 'object': 'model', 'created': 1727389042, 'owned_by': 'system'}, {'id': 'gpt-4o-realtime-preview', 'object': 'model', 'created': 1727659998, 'owned_by': 'system'}, {'id': 'gpt-4o-realtime-preview-2024-10-01', 'object': 'model', 'created': 1727131766, 'owned_by': 'system'}, {'id': 'dall-e-2', 'object': 'model', 'created': 1698798177, 'owned_by': 'system'}, {'id': 'gpt-4o-2024-08-06', 'object': 'model', 'created': 1722814719, 'owned_by': 'system'}, {'id': 'gpt-4-turbo', 'object': 'model', 'created': 1712361441, 'owned_by': 'system'}, {'id': 'gpt-4-1106-preview', 'object': 'model', 'created': 1698957206, 'owned_by': 'system'}, {'id': 'gpt-4o', 'object': 'model', 'created': 1715367049, 'owned_by': 'system'}, {'id': 'gpt-3.5-turbo', 'object': 'model', 'created': 1677610602, 'owned_by': 'openai'}, {'id': 'gpt-3.5-turbo-0125', 'object': 'model', 'created': 1706048358, 'owned_by': 'system'}, {'id':

In [None]:
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings

# Initialize embeddings and LLM (same model names as the earlier OpenAI setup cells)
embed_model = OpenAIEmbedding(model="text-embedding-3-small", api_key=OPENAI_API_KEY)
llm = OpenAI(model="gpt-3.5-turbo-0125", api_key=OPENAI_API_KEY)

Settings.embed_model = embed_model
Settings.llm = llm

# Create indices
# NOTE(review): both indices are built by the identical call on the same `documents`,
# so `recursive_index` here is not the `base_nodes + objects` index built in an earlier
# cell. The two query engines derived from them therefore compare the same index with
# itself, and the documents are embedded twice. Confirm whether `recursive_index` was
# meant to be built from the MarkdownElementNodeParser output.
recursive_index = VectorStoreIndex.from_documents(documents)
raw_index = VectorStoreIndex.from_documents(documents)

print("Indices created successfully!")


Indices created successfully!


In [None]:
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

# Shared reranker: keeps the 5 best of the retrieved candidates.
reranker = FlagEmbeddingReranker(model="BAAI/bge-reranker-large", top_n=5)

# Engine over the "recursive" index: retrieve 15 candidates, rerank, verbose output.
recursive_query_engine = recursive_index.as_query_engine(
    node_postprocessors=[reranker],
    similarity_top_k=15,
    verbose=True,
)

# Engine over the raw index with the same retrieval and reranking settings.
raw_query_engine = raw_index.as_query_engine(
    node_postprocessors=[reranker],
    similarity_top_k=15,
)


In [None]:
from llama_index.core.node_parser import SimpleNodeParser

# Assuming 'documents' are already loaded
# NOTE(review): rebinds `parser`; no later visible cell uses the earlier value.
parser = SimpleNodeParser()

# Process the documents into nodes
# NOTE(review): rebinds `nodes`, which an earlier cell assigned from `node_parser`;
# the query engines were already built, so this result only feeds the count below.
nodes = parser.get_nodes_from_documents(documents)

print(f"Number of nodes: {len(nodes)}")


Number of nodes: 225


In [None]:
# Same count as printed by the previous cell.
print(len(nodes))


225


Using the new LlamaParse as the PDF parsing method, we retrieve tables with two different methods and compare the base query engine against the recursive query engine on table questions.

Table Query Task: Queries for Table Question Answering

In [None]:

# Table question 1: ask both engines the same question and print each answer under its banner.
query = "What is the average price paid per share in 2022 and 2021?"

response_1 = raw_query_engine.query(query)
print("\n***********New LlamaParse+ Basic Query Engine***********", response_1, sep="\n")

response_2 = recursive_query_engine.query(query)
print("\n***********New LlamaParse+ Recursive Retriever Query Engine***********", response_2, sep="\n")

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



***********New LlamaParse+ Basic Query Engine***********
The average price paid per share in 2022 was $340.60, and in 2021 it was $356.82.

***********New LlamaParse+ Recursive Retriever Query Engine***********
The average price paid per share in 2022 was $340.60, and in 2021 it was $356.82.


In [None]:
# Table question 2: ask both engines the same question and print each answer under its banner.
query = "What is the total dividend declared in 2022 and 2021"

response_1 = raw_query_engine.query(query)
print("\n***********New LlamaParse+ Basic Query Engine***********", response_1, sep="\n")

response_2 = recursive_query_engine.query(query)
print("\n***********New LlamaParse+ Recursive Retriever Query Engine***********", response_2, sep="\n")


***********New LlamaParse+ Basic Query Engine***********
The total dividend declared in 2022 was $1.09 billion, and in 2021 it was $1.088 billion.

***********New LlamaParse+ Recursive Retriever Query Engine***********
The total dividend declared in 2022 and 2021 was $545 million in 2022 and $543 million in 2021.


In [None]:
# Table question 3. Expected answer: basic earnings per share of $10.26 (2022) and $8.79 (2021).
query = "What is the Earning per share basic in 2022 and 2021"

response_1 = raw_query_engine.query(query)
print("\n***********New LlamaParse+ Basic Query Engine***********", response_1, sep="\n")

response_2 = recursive_query_engine.query(query)
print("\n***********New LlamaParse+ Recursive Retriever Query Engine***********", response_2, sep="\n")


***********New LlamaParse+ Basic Query Engine***********
The Earnings per Share (EPS) basic in 2022 was $10.26, and in 2021 it was $8.79.

***********New LlamaParse+ Recursive Retriever Query Engine***********
The Earnings per Share (EPS) basic in 2022 was $10.26, and in 2021 it was $8.79.
