In [1]:
!pip install -U google-generativeai



In [3]:
import google.generativeai as genai
from google.colab import userdata
import base64

genai.configure(api_key=userdata.get("VERTEX_API_KEY"))
model = genai.GenerativeModel("gemini-2.5-flash")


In [4]:
import gdown
file_id = "1oe2FZd3ZTO7nrDqjCafNvxicl08oF8JF"
download_url = f"https://drive.google.com/uc?id={file_id}"
gdown.download(download_url, "receipts.zip", quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1oe2FZd3ZTO7nrDqjCafNvxicl08oF8JF
To: /content/receipts.zip
100%|██████████| 1.61M/1.61M [00:00<00:00, 143MB/s]


'receipts.zip'

In [5]:
!unzip receipts.zip

Archive:  receipts.zip
  inflating: receipt1.jpg            
  inflating: __MACOSX/._receipt1.jpg  
  inflating: receipt2.jpg            
  inflating: __MACOSX/._receipt2.jpg  
  inflating: receipt3.jpg            
  inflating: __MACOSX/._receipt3.jpg  
  inflating: receipt4.jpg            
  inflating: __MACOSX/._receipt4.jpg  
  inflating: receipt5.jpg            
  inflating: __MACOSX/._receipt5.jpg  
  inflating: receipt6.jpg            
  inflating: __MACOSX/._receipt6.jpg  
  inflating: receipt7.jpg            
  inflating: __MACOSX/._receipt7.jpg  


In [6]:
def image_to_part(path):
    with open(path, "rb") as f:
        return {
            "mime_type": "image/jpeg",
            "data": base64.b64encode(f.read()).decode("utf-8")
        }


In [7]:
from PIL import Image

def load_image(path):
    return Image.open(path)

image_paths = [
    "/content/receipt1.jpg",
    "/content/receipt2.jpg",
    "/content/receipt3.jpg",
    "/content/receipt4.jpg",
    "/content/receipt5.jpg",
    "/content/receipt6.jpg",
    "/content/receipt7.jpg",
]

images = [load_image(p) for p in image_paths]

prompt = """
For EACH receipt, extract the following fields:

- receipt_id (1–7)
- total_paid: the final amount actually paid
- total_discount: the sum of all discounts applied
  (if no discount, return 0)
- currency

Return the result as STRICT JSON only.

Output format:
{
  "receipts": [
    {
      "id": 1,
      "total_paid": number | null,
      "total_discount": number | null,
      "note": string | null
    }
  ]
}

If any value is unclear, set it to null and explain in note.
"""

In [8]:

response = model.generate_content(
    images + [prompt]
)


In [9]:
import json
import re

raw_text = response.text

match = re.search(r"\{[\s\S]*\}", raw_text)
if not match:
    raise ValueError("No JSON object found in model output")

data = json.loads(match.group())
receipts = data["receipts"]

In [10]:
query1_total = sum(
    r["total_paid"]
    for r in receipts
    if r["total_paid"] is not None
)


In [11]:
query2_total = sum(
    r["total_paid"] + r["total_discount"]
    for r in receipts
    if r["total_paid"] is not None and r["total_discount"] is not None
)


In [12]:
output = {
    "receipts": receipts,
    "query_1": {
        "question": "How much money did I spend in total for these bills?",
        "answer": round(query1_total, 2)
    },
    "query_2": {
        "question": "How much would I have had to pay without the discount?",
        "answer": round(query2_total, 2)
    }
}

print(json.dumps(output, ensure_ascii=False, indent=2))


{
  "receipts": [
    {
      "id": 1,
      "total_paid": 394.7,
      "total_discount": 97.88,
      "currency": "HKD"
    },
    {
      "id": 2,
      "total_paid": 316.1,
      "total_discount": 56.19,
      "currency": "HKD"
    },
    {
      "id": 3,
      "total_paid": 140.8,
      "total_discount": 15.32,
      "currency": "HKD"
    },
    {
      "id": 4,
      "total_paid": 514.0,
      "total_discount": 78.28,
      "currency": "HKD"
    },
    {
      "id": 5,
      "total_paid": 102.3,
      "total_discount": 5.1,
      "currency": "HKD"
    },
    {
      "id": 6,
      "total_paid": 190.8,
      "total_discount": 30.31,
      "currency": "HKD"
    },
    {
      "id": 7,
      "total_paid": 315.6,
      "total_discount": 83.16,
      "currency": "HKD"
    }
  ],
  "query_1": {
    "question": "How much money did I spend in total for these bills?",
    "answer": 1974.3
  },
  "query_2": {
    "question": "How much would I have had to pay without the discount?",
    "ans

In [13]:
def test_query(answer, ground_truth_costs):
    # Convert string to float if necessary
    if isinstance(answer, str):
        answer = float(answer)

    # Calculate the ground truth sum once for clarity
    expected_total = sum(ground_truth_costs)

    # Check if the answer is within +/- $2 of the expected total
    assert abs(answer - expected_total) <= 2

In [15]:
query_1_costs = [394.7, 316.1, 140.8, 514.0, 102.3, 190.8, 315.6] # do not modify this
query1_answer = sum(query_1_costs)
test_query(query1_answer, query_1_costs)

In [16]:
query_2_costs = [480.20, 392.20, 160.10, 590.80, 107.70, 221.20, 396.00] # do not modify this
query2_answer = sum(query_2_costs)
test_query(query2_answer, query_2_costs)

In [17]:
sum([480.20, 392.20, 160.10, 590.80, 107.70, 221.20, 396.00])

2348.2