In [1]:
# Import libraries
import boto3
import faiss
import json

import pymupdf
import requests
import os
import logging
import numpy as np
import warnings
from tqdm import tqdm
from botocore.exceptions import ClientError
from langchain_text_splitters import RecursiveCharacterTextSplitter
from IPython import display
from functions import processing, model


# Notebook-level logger; its threshold is ERROR, so lower-severity records
# sent to this logger are dropped.
logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)

# Blanket filter: ignores every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Downloading data

In [2]:
# Remote location of the PDF that the rest of the notebook downloads and indexes
url  = "https://www.standardbank.co.za/file_source/South%20Africa/PDF/Personal%20Pricing/2025/ACHIEVA_Bundled_Account_Pricing_Guide_2025.pdf"

In [3]:
# Set filename and path
# The local copy is stored under the relative "data" directory.
filename = "ACHIEVA_Bundled_Account_Pricing_Guide_2025.pdf"
filepath = os.path.join("data", filename)

In [4]:

# Create the download directory if it doesn't exist
os.makedirs("data", exist_ok=True)

# Download the file. requests applies no timeout by default, so an explicit
# one keeps the cell from hanging forever when the server never answers.
response = requests.get(url, timeout=30)
if response.status_code == 200:
    # Binary mode: the PDF bytes are written to disk unmodified
    with open(filepath,'wb') as file:
        file.write(response.content)
    print(f"File downloaded succesfully: {filepath}")
else:
    # f-string so the real HTTP status is interpolated into the message
    print(f"Download failed. Status code: {response.status_code}")

File downloaded succesfully: data/ACHIEVA_Bundled_Account_Pricing_Guide_2025.pdf


In [5]:
# Open the downloaded PDF and record its page count
doc = pymupdf.open(filepath)
num_pages = len(doc)
base_dir = "data"

# Creating the directories

processing.create_directories(base_dir)
# chunk_size and chunk_overlap are measured with len(), i.e. in characters
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=200, length_function=len)
# Shared accumulator handed to every processing.* helper below
items = []

# Process each page of the PDF

for page_num in tqdm(range(num_pages), desc = 'Processing pages'):
    page = doc[page_num]
    text = page.get_text()
    # NOTE(review): each helper presumably appends its extracted records to
    # `items` and writes files under base_dir — confirm in functions.processing.
    # Later cells address `items` by position, so keep this call order stable.
    processing.process_tables(filepath,doc,page_num, base_dir,items)
    processing.process_text_chunks(filepath,text, text_splitter, page_num, base_dir, items)
    processing.process_images(filepath,doc, page, page_num, base_dir, items)
    processing.process_page_images(page, page_num, base_dir, items)

Processing pages: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14/14 [00:05<00:00,  2.39it/s]


In [6]:
# Peek at the first record whose type is 'text'
list(filter(lambda entry: entry['type'] == 'text', items))[0]

{'page': 0,
 'type': 'text',
 'text': '2025 pricing\nYour\nACHIEVA\nTM\nAccount',
 'path': 'data/text/ACHIEVA_Bundled_Account_Pricing_Guide_2025.pdf_text_0_0.txt'}

In [7]:
# Peek at the first record whose type is 'table'
list(filter(lambda entry: entry['type'] == 'table', items))[0]

{'page': 1,
 'type': 'table',
 'text': 'WWhhaatt  yyoouu  ggeett: nan, ',
 'path': 'data/tables/ACHIEVA_Bundled_Account_Pricing_Guide_2025.pdf_table_1_0.txt'}

# Generating Multimodal Embeddings

In [8]:
# Length requested for every embedding vector
embedding_vector_dimension = 384

# Tally how many extracted items exist for each type
item_counts = {
    kind: sum(1 for item in items if item['type'] == kind)
    for kind in ('text', 'table', 'image', 'page')
}
    


In [9]:
# Display the per-type totals computed above
item_counts


{'text': 33, 'table': 22, 'image': 9, 'page': 14}

In [10]:
# Start one progress counter per item type, all at zero
counters = {item_type: 0 for item_type in item_counts}

In [11]:
# Generate embeddings for all items

# Reset the per-type counters inside this cell so that re-running it on its
# own restarts the progress figures at zero instead of adding to the totals
# left behind by a previous run.
counters = dict.fromkeys(item_counts, 0)

with tqdm(
    total=len(items),
    desc="Generating embeddings",
    bar_format=(
        "{l_bar}{bar}| {n_fmt}/{total_fmt} "
        "[{elapsed}<{remaining}, {rate_fmt}{postfix}]"
    )
) as pbar:

    for item in items:
        item_type = item['type']
        counters[item_type] += 1

        if item_type in ('text', 'table'):
            # Text chunks and tables are embedded from their text representation
            item['embedding'] = model.generate_multimodal_embeddings(
                prompt=item['text'],
                output_embedding_length=embedding_vector_dimension,
            )
        else:
            # Every other item type is embedded from its 'image' field
            # (presumably base64-encoded image data — confirm in functions.processing)
            item['embedding'] = model.generate_multimodal_embeddings(
                image=item['image'],
                output_embedding_length=embedding_vector_dimension,
            )

        # Update the progress bar ('page' items are counted but not shown in the postfix)
        pbar.set_postfix_str(f"Text: {counters['text']}/{item_counts['text']}, Table: {counters['table']}/{item_counts['table']}, Image: {counters['image']}/{item_counts['image']}")
        pbar.update(1)

Generating embeddings: 100%|█████████████████████████████████████████████████████████████████████| 78/78 [00:22<00:00,  3.42it/s, Text: 33/33, Table: 22/22, Image: 9/9]


# Create vector database/index

In [12]:
# Collect every item's embedding into a single float32 array
all_embeddings = np.array(
    [entry['embedding'] for entry in items],
    dtype=np.float32,
)

# Build a flat L2 FAISS index sized for the embedding dimension
index = faiss.IndexFlatL2(embedding_vector_dimension)

# Empty the index before populating it
index.reset()

# Load all embeddings into the index
index.add(all_embeddings)

# Test the RAG pipeline

In [13]:
query = 'What is the monthly administration fee of the ACHIEVA account?'

# Generate embeddings for the query
# (same output_embedding_length as the indexed items, so the dimensions match the index)
query_embedding = model.generate_multimodal_embeddings(prompt=query,output_embedding_length=embedding_vector_dimension)

# Search for the nearest neighbors in the vector database
# reshape(1, -1) presents the embedding as a single-row 2-D array; k=5 requests five neighbours
distances, result = index.search(np.array(query_embedding, dtype=np.float32).reshape(1,-1), k=5)

In [14]:
# Check the result (matched chunks)
# The values are positions in `items`; the following cell uses them to look up the matched records.
result.flatten()

array([ 0,  6,  7, 62, 22])

In [15]:
# Retrieve the matched items (without their embedding vectors).
# FAISS reports -1 for slots where it found no neighbour; skip those so that
# items[-1] is never silently returned as a match. The loop variable is named
# item_idx so it does not shadow the FAISS `index` object.
matched_items = [
    {k: v for k, v in items[item_idx].items() if k != 'embedding'}
    for item_idx in result.flatten()
    if item_idx >= 0
]

# Generate RAG response with Amazon Nova
response = model.invoke_nova_multimodal(prompt=query, matched_items=matched_items)

In [16]:
# Display the response
# Wrapped in display.Markdown so the answer is rendered as Markdown in the cell output
display.Markdown(response)

The text context provided does not explicitly state the monthly administration fee for the ACHIEVA Account. It mentions various features, perks, and additional costs associated with the account, such as the cost of personalized cards and notification services, but the specific monthly administration fee is not detailed in the provided excerpt. 

To find the exact monthly administration fee, you would need to refer to the complete terms and conditions or contact Standard Bank directly for the most accurate and up-to-date information.

In [17]:
query = 'List the ATM withdrawal fees for this account'

# Generate embeddings for the query (same length as the indexed embeddings)
query_embedding = model.generate_multimodal_embeddings(prompt=query,output_embedding_length=embedding_vector_dimension)

# Search for the nearest neighbors in the vector database
distances, result = index.search(np.array(query_embedding, dtype=np.float32).reshape(1,-1), k=5)

# Retrieve the matched items (without their embedding vectors).
# FAISS reports -1 for slots where it found no neighbour; skip those so that
# items[-1] is never silently returned as a match. The loop variable is named
# item_idx so it does not shadow the FAISS `index` object.
matched_items = [
    {k: v for k, v in items[item_idx].items() if k != 'embedding'}
    for item_idx in result.flatten()
    if item_idx >= 0
]

# Generate RAG response with Amazon Nova
response = model.invoke_nova_multimodal(prompt=query, matched_items=matched_items)

# Display the response
display.Markdown(response)

Sure, here are the ATM withdrawal fees for the account based on the provided information:

1. **Standard Bank ATM Withdrawals:**
   - R2.65 per R100 or part thereof.

2. **Other Bank ATM Withdrawals:**
   - R2.65 per R100 or part thereof.

3. **International ATM Withdrawals:**
   - R3 per R100 or part thereof (minimum R70) 
   - Plus an international transaction fee of 2.75%.

4. **Coin Withdrawal at ATM:**
   - Not available (indicated by –).

5. **Notes and Coin Withdrawal at ATM:**
   - Not available (indicated by –).

6. **Cash for Cash (Change) at ATM:**
   - Not available (indicated by –).