In [1]:
# Import libraries
import boto3
import faiss
import json

import pymupdf
import requests
import os
import logging
import numpy as np
import warnings
from tqdm import tqdm
from botocore.exceptions import ClientError
from langchain_text_splitters import RecursiveCharacterTextSplitter
from IPython import display
from functions import processing, model


# Notebook-level logger; only ERROR and above are emitted
logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)

# Install a blanket "ignore" filter so library warnings do not clutter cell output
warnings.simplefilter("ignore")

# Downloading data

In [3]:
# Source location of the pricing guide PDF that the rest of the notebook works on
url = "https://www.standardbank.co.za/file_source/South%20Africa/PDF/Personal%20Pricing/2025/ACHIEVA_Bundled_Account_Pricing_Guide_2025.pdf"

In [4]:
# Name the local copy of the PDF ...
filename = "ACHIEVA_Bundled_Account_Pricing_Guide_2025.pdf"

# ... and place it inside the relative "data" directory
filepath = os.path.join("data", filename)

In [4]:

# Create the data directory if it doesn't exist
os.makedirs("data", exist_ok=True)

# Download the file. The timeout stops the cell from hanging forever if the
# server never answers (requests has no default timeout).
response = requests.get(url, timeout=30)
if response.status_code == 200:
    # Write the raw bytes; the PDF must not go through any text decoding
    with open(filepath, 'wb') as file:
        file.write(response.content)
    print(f"File downloaded succesfully: {filepath}")
else:
    # f-string so the actual HTTP status is reported instead of the literal placeholder
    print(f"Download failed. Status code: {response.status_code}")

File downloaded succesfully: data/ACHIEVA_Bundled_Account_Pricing_Guide_2025.pdf


In [5]:
base_dir = "data"

# Create the output directories under base_dir
processing.create_directories(base_dir)

# Splitter for page text: 700-character chunks with 200 characters of overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=200, length_function=len)

# Collected across all pages; each processing helper receives this list
items = []

# Open the PDF in a context manager so the document is closed once every page
# has been processed, instead of staying open for the rest of the kernel session.
with pymupdf.open(filepath) as doc:
    num_pages = len(doc)

    # Process each page of the PDF
    for page_num in tqdm(range(num_pages), desc = 'Processing pages'):
        page = doc[page_num]
        text = page.get_text()
        processing.process_tables(filepath,doc,page_num, base_dir,items)
        processing.process_text_chunks(filepath,text, text_splitter, page_num, base_dir, items)
        processing.process_images(filepath,doc, page, page_num, base_dir, items)
        processing.process_page_images(page, page_num, base_dir, items)

Processing pages: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14/14 [00:04<00:00,  3.03it/s]


In [6]:
# Show the first collected item whose type is 'text'
list(filter(lambda entry: entry['type'] == 'text', items))[0]

{'page': 0,
 'type': 'text',
 'text': '2025 pricing\nYour\nACHIEVA\nTM\nAccount',
 'path': 'data/text/ACHIEVA_Bundled_Account_Pricing_Guide_2025.pdf_text_0_0.txt'}

In [7]:
# Show the first collected item whose type is 'table'
list(filter(lambda entry: entry['type'] == 'table', items))[0]

{'page': 1,
 'type': 'table',
 'text': 'WWhhaatt  yyoouu  ggeett: nan, ',
 'path': 'data/tables/ACHIEVA_Bundled_Account_Pricing_Guide_2025.pdf_table_1_0.txt'}

# Generating Multimodal Embeddings

In [8]:
# Length requested for every embedding vector (also used as the index dimension)
embedding_vector_dimension = 384

# Count the number of items of each type
item_counts = {
    kind: sum(entry['type'] == kind for entry in items)
    for kind in ('text', 'table', 'image', 'page')
}
    


In [9]:
# Display how many items of each type were collected
item_counts


{'text': 33, 'table': 22, 'image': 9, 'page': 14}

In [10]:
# Start one running counter at zero for every item type present in item_counts
counters = {item_type: 0 for item_type in item_counts}

In [11]:
# Embed every item and store the vector on the item under 'embedding'
with tqdm(
    total=len(items),
    desc="Generating embeddings",
    bar_format=(
        "{l_bar}{bar}| {n_fmt}/{total_fmt} "
        "[{elapsed}<{remaining}, {rate_fmt}{postfix}]"
    )
) as pbar:

    for item in items:
        item_type = item['type']
        counters[item_type] += 1

        # 'text' and 'table' items are embedded from their text representation;
        # every other type (the 'image' and 'page' items) is embedded from
        # item['image'] (presumably base64-encoded image data; confirm in processing)
        item['embedding'] = (
            model.generate_multimodal_embeddings(
                prompt=item['text'],
                output_embedding_length=embedding_vector_dimension,
            )
            if item_type in ('text', 'table')
            else model.generate_multimodal_embeddings(
                image=item['image'],
                output_embedding_length=embedding_vector_dimension,
            )
        )

        # Refresh the per-type tallies on the progress bar, then advance it by one item
        pbar.set_postfix_str(
            f"Text: {counters['text']}/{item_counts['text']}, "
            f"Table: {counters['table']}/{item_counts['table']}, "
            f"Image: {counters['image']}/{item_counts['image']}"
        )
        pbar.update(1)

Generating embeddings: 100%|█████████████████████████████████████████████████████████████████████| 78/78 [00:24<00:00,  3.17it/s, Text: 33/33, Table: 22/22, Image: 9/9]


# Create vector database/index

In [None]:
# Create a flat L2 FAISS index sized for the embedding dimension
index = faiss.IndexFlatL2(embedding_vector_dimension)

# Make sure the index starts empty (it was just constructed, so this should be a no-op)
index.reset()

# Gather every item's embedding into one float32 array, in the same order as `items`,
# so that a position in the index corresponds to a position in `items`
all_embeddings = np.array(
    [entry['embedding'] for entry in items],
    dtype=np.float32,
)

# Add embeddings to the index
index.add(all_embeddings)

# Test the RAG pipeline

In [18]:
# Question to run through the retrieval pipeline
query = 'What is the monthly administration fee of the ACHIEVA account?'

# Embed the question with the same output length used for the indexed items
query_embedding = model.generate_multimodal_embeddings(
    prompt=query,
    output_embedding_length=embedding_vector_dimension,
)

# Search for the 5 nearest neighbours; the embedding is reshaped into a
# single-row float32 matrix before it is handed to the index
distances, result = index.search(
    np.reshape(np.array(query_embedding, dtype=np.float32), (1, -1)),
    k=5,
)

In [19]:
# Check the result (matched chunks): the search output flattened to 1-D.
# The values are positions into `items` (they are used to index it when the
# matched items are retrieved).
result.flatten()

array([ 0,  6,  7, 62, 22])

In [20]:
# Retrieve the matched items, leaving out each item's 'embedding' entry.
# - The loop variable is `match_idx` (rather than `index`) so the FAISS `index`
#   name stays unambiguous.
# - Negative ids are skipped: FAISS fills the result with -1 when it finds fewer
#   than k neighbours, and `items[-1]` would silently return the last item.
matched_items = [
    {k: v for k, v in items[match_idx].items() if k != 'embedding'}
    for match_idx in result.flatten()
    if match_idx >= 0
]

# Generate RAG response with Amazon Nova
# NOTE(review): `invoke_nova_multimodal` is neither defined nor imported in the
# visible cells (only `processing` and `model` are imported from `functions`).
# Confirm where it lives; as written, a fresh-kernel run would raise NameError here.
response = invoke_nova_multimodal(prompt=query, matched_items=matched_items)

In [21]:
# Display the response rendered as Markdown
# (`display` is the IPython display module imported in the first cell)
display.Markdown(response)

The document does not explicitly state the monthly administration fee for the ACHIEVA Account. However, it mentions that the monthly fee will remain unchanged starting from 1 January 2025. To find the exact monthly administration fee, you would need to refer to the current pricing details or contact Standard Bank directly. 

Here’s a summary of the costs mentioned:
- **Personalised ACHIEVA Gold Cheque Card**: R115 per month
- **Personalised ACHIEVA Gold Credit Card**: An extra R63 per month
- **MyUpdates (additional email addresses)**: R15 monthly for additional email addresses
- **SMS notifications**: 45c per SMS
- **Email payment notification (sent through the branch)**: R25

For the precise monthly administration fee, please check the latest account documents or contact Standard Bank customer service.

In [22]:
# Second test question for the retrieval pipeline
query = 'How much are cash withdrawals?'

# Generate embeddings for the query
query_embedding = model.generate_multimodal_embeddings(prompt=query,output_embedding_length=embedding_vector_dimension)

# Search for the 5 nearest neighbours in the vector database; the embedding is
# reshaped into a single-row float32 matrix for the search call
distances, result = index.search(np.array(query_embedding, dtype=np.float32).reshape(1,-1), k=5)

# Retrieve the matched items, leaving out each item's 'embedding' entry.
# - The loop variable is `match_idx` (rather than `index`) so the FAISS `index`
#   name stays unambiguous.
# - Negative ids are skipped: FAISS fills the result with -1 when it finds fewer
#   than k neighbours, and `items[-1]` would silently return the last item.
matched_items = [
    {k: v for k, v in items[match_idx].items() if k != 'embedding'}
    for match_idx in result.flatten()
    if match_idx >= 0
]

# Generate RAG response with Amazon Nova
# NOTE(review): `invoke_nova_multimodal` is neither defined nor imported in the
# visible cells (only `processing` and `model` are imported from `functions`).
# Confirm where it lives; as written, a fresh-kernel run would raise NameError here.
response = invoke_nova_multimodal(prompt=query, matched_items=matched_items)

# Display the response rendered as Markdown
display.Markdown(response)

Here's a summary of the cash withdrawal fees for Standard Bank:

### Withdrawals at Standard Bank

**ATM:**
- R2.65 per R100 or part thereof

**Branch:**
- R90 + R4 per R100 or part thereof

### Withdrawals at Other Banks

**ATM:**
- R2.65 per R100 or part thereof

**Branch:**
- Not specified (indicated by –)

### International Withdrawals

**ATM:**
- R3 per R100 or part thereof (min R70) + International transaction fee of 2.75%

**Branch:**
- Not specified (indicated by –)

### Coin Withdrawal

**ATM:**
- Not specified (indicated by –)

**Branch:**
- R90 + R15 per R100 or part thereof

### Notes and Coin Withdrawal

**ATM:**
- Not specified (indicated by –)

**Branch:**
- R90 + R4 per R100 (for notes) + R15 per R100 (for coins) or part thereof

### Cash for Cash (Change)

**ATM:**
- Not specified (indicated by –)

**Branch:**
- R90 + R15 per R100 or part thereof