# In [None]:
import logging
import os
import time
from datetime import datetime

import requests
from pyarrow import Table
from pyiceberg.catalog import load_catalog
from pyiceberg.exceptions import NoSuchTableError
# NOTE(review): pyiceberg.expressions does not appear to export `day` (day
# partitioning lives in pyiceberg.transforms.DayTransform); if so this import
# raises ImportError at start-up and should be removed — confirm.
from pyiceberg.expressions import day
from pyiceberg.io.pyarrow import PyArrowFileIO
from pyiceberg.partitioning import PartitionSpec
from pyiceberg.partitioning import PartitionField
from pyiceberg.schema import Schema
from pyiceberg.transforms import DayTransform
from pyiceberg.types import IntegerType, StringType
from pyiceberg.types import DateType, NestedField

# Current local date as an ISO-style `YYYY-MM-DD` string.
today = f"{datetime.today():%Y-%m-%d}"

# Logging configuration: every record is appended to ./api_ingest.log and
# also emitted on the console stream.
_log_handlers = [
    logging.FileHandler("./api_ingest.log", mode="a", encoding="utf-8"),
    logging.StreamHandler(),
]
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=_log_handlers,
    level=logging.INFO,
)

# Named logger shared by the rest of this script.
logger = logging.getLogger("api_ingest")

# Load the Iceberg catalog: a Hive metastore for table metadata and a MinIO
# (S3-compatible) endpoint for the warehouse files.
#
# Endpoints and credentials can be overridden through environment variables so
# that secrets do not have to live in the source file. Every default is the
# value that used to be hard-coded, so behaviour is unchanged when no variable
# is set.
catalog = load_catalog(
    "default",
    **{
        "type": "hive",
        "uri": os.environ.get(
            "ICEBERG_CATALOG_URI", "thrift://hive-metastore:9083"
        ),
        "warehouse": os.environ.get("ICEBERG_WAREHOUSE", "s3://warehouse/"),
        "s3.endpoint": os.environ.get(
            "ICEBERG_S3_ENDPOINT", "http://minio:9000"
        ),
        "s3.access-key-id": os.environ.get(
            "ICEBERG_S3_ACCESS_KEY_ID", "minioadmin"
        ),
        "s3.secret-access-key": os.environ.get(
            "ICEBERG_S3_SECRET_ACCESS_KEY", "minioadmin"
        ),
        "s3.path-style-access": "true",
        "s3.region": os.environ.get("ICEBERG_S3_REGION", "us-east-1"),
    },
)

logger.info("Iceberg catalog loaded.")


# Create the Iceberg table if it does not exist yet.
# NOTE(review): "catagories" looks like a misspelling of "categories"; it is
# left untouched because renaming would point the job at a different table.
TABLE_NAME = "bronze.catagories"

# A PyIceberg Schema is built from NestedField objects, each carrying an
# explicit field id; plain tuples are not accepted. The original third tuple
# element is read here as "nullable", so `id` is required and the string
# columns are optional.
# `update_date` is declared because the partition spec below partitions on it
# and a partition source must be a column of the schema. Rows that do not
# supply it get a null value — presumably the ingest rows should be stamped
# with the run date; TODO confirm.
schema = Schema(
    NestedField(field_id=1, name="id", field_type=IntegerType(), required=True),
    NestedField(field_id=2, name="name", field_type=StringType(), required=False),
    NestedField(field_id=3, name="email", field_type=StringType(), required=False),
    NestedField(
        field_id=4, name="update_date", field_type=DateType(), required=False
    ),
)

try:
    table = catalog.load_table(TABLE_NAME)
except NoSuchTableError:
    # Only a genuinely missing table triggers creation; any other failure
    # (metastore unreachable, bad credentials, ...) propagates instead of
    # being reported as "table not found".
    logger.info("Table not found → creating new Iceberg table...")
    # Daily partitions derived from `update_date`.
    # NOTE(review): whether PyIceberg can write to a day-partitioned table
    # depends on the installed version — confirm against the deployment.
    partition_spec = PartitionSpec(
        PartitionField(
            source_id=schema.find_field("update_date").field_id,
            field_id=1000,
            transform=DayTransform(),
            name="update_date_day",
        )
    )
    table_properties = {
        "format-version": "2",
        "write.metadata.delete-after-commit.enabled": "true",
    }
    table = catalog.create_table(
        identifier=TABLE_NAME,
        schema=schema,
        partition_spec=partition_spec,
        properties=table_properties,
    )
    logger.info("Table created successfully.")
else:
    logger.info("Table exists, loaded successfully.")


# API call helper
def fetch_api_data(
    api_url="https://api.tiki.vn/raiden/v2/menu-config?platform=desktop",
    timeout=30,
):
    """Fetch the API endpoint and return its decoded JSON body.

    Args:
        api_url: URL to request. Defaults to the Tiki menu-config endpoint.
        timeout: Seconds handed to ``requests`` as the connect/read timeout.
            Without a timeout ``requests`` can block indefinitely on an
            unresponsive server.

    Returns:
        The value produced by ``response.json()``; its shape is whatever the
        endpoint serves.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    logger.info("Fetching from API: %s", api_url)

    response = requests.get(api_url, timeout=timeout)
    response.raise_for_status()
    return response.json()


# Write data to Iceberg one batch at a time
def write_batch_to_iceberg(records, batch_id):
    """Append one batch of rows to the module-level Iceberg ``table``.

    Args:
        records: List of dicts, one per row. An empty or ``None`` batch is
            skipped with a warning.
        batch_id: Identifier used only in the log messages.

    Returns:
        None.
    """
    if not records:
        logger.warning("Empty batch → skip.")
        return

    # Build the Arrow table with the Iceberg table's own schema rather than
    # letting PyArrow infer one: inference turns Python ints into int64, which
    # does not match the 32-bit integer `id` column. Keys missing from a
    # record become nulls for the corresponding column.
    table_arrow = Table.from_pylist(records, schema=table.schema().as_arrow())

    logger.info(f"Writing batch {batch_id} with {len(records)} rows...")

    # PyIceberg appends through Table.append(); `new_append()` belongs to the
    # Java API and does not exist on a PyIceberg table.
    table.append(table_arrow)

    logger.info(f"Batch {batch_id} written successfully.")


# Run the ingestion in batches
def run_ingest(batch_size=3):
    """Fetch the API payload and write it to Iceberg in fixed-size batches.

    Each item is reduced to its ``id``, ``name`` and ``email`` keys. A batch
    is written as soon as it holds ``batch_size`` rows; any rows left over
    after the loop are written as a final, shorter batch.

    Assumes the payload is an iterable of mappings that each contain those
    three keys — TODO confirm against the API response.

    Args:
        batch_size: Number of rows per written batch.
    """
    logger.info("Starting ingestion job...")

    pending = []
    next_batch_id = 1

    for record in fetch_api_data():
        pending.append({key: record[key] for key in ("id", "name", "email")})

        if len(pending) != batch_size:
            continue

        # Batch is full: flush it and start a fresh one.
        write_batch_to_iceberg(pending, next_batch_id)
        pending = []
        next_batch_id += 1

    # Final remainder that did not fill a whole batch.
    if pending:
        write_batch_to_iceberg(pending, next_batch_id)

    logger.info("Ingestion job completed.")


# Script entry point: when executed directly, run one ingestion pass with a
# batch size of 5 rows.
if __name__ == "__main__":
    run_ingest(5)
