In [11]:
import duckdb
import os
import logging
import pytz
from dotenv import load_dotenv
from datetime import datetime
from pathlib import Path

In [13]:
# Resolve the current moment in Bangkok local time. `day` (YYYY-MM-DD) is the
# string used below to name the log file and to build the dated parquet path.
date = datetime.now(tz=pytz.timezone("Asia/Bangkok"))
day = f"{date:%Y-%m-%d}"

In [7]:
# Load MinIO connection settings. load_dotenv() is called first, then every
# setting is read from the process environment.
load_dotenv()
MINIO_ACCESS_KEY = os.getenv("MINIO_ROOT_USER")
MINIO_SECRET_KEY = os.getenv("MINIO_ROOT_PASSWORD")
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT")
MINIO_BUCKET = os.getenv("MINIO_BUCKET")

# Fail fast on missing credentials (unset or empty string).
if not MINIO_ACCESS_KEY or not MINIO_SECRET_KEY:
    raise ValueError("Missing MinIO credentials in .env file")

# The endpoint and bucket are interpolated into the DuckDB settings and the
# s3:// path further down; without this check an unset value would silently
# become the literal text "None" and only surface as an obscure query error.
_missing_settings = [
    name
    for name, value in (
        ("MINIO_ENDPOINT", MINIO_ENDPOINT),
        ("MINIO_BUCKET", MINIO_BUCKET),
    )
    if not value
]
if _missing_settings:
    raise ValueError(
        f"Missing MinIO settings in .env file: {', '.join(_missing_settings)}"
    )

In [8]:
# Configure file logging: one log file per `day`, stored under log_dir
# (the directory is created on demand, including missing parents).
log_dir = Path("/home/jovyan/work/data/logs")
log_dir.mkdir(exist_ok=True, parents=True)
log_path = log_dir.joinpath(f"analyze_duckdb_{day}.log")

# Root logger: INFO and above, written to log_path with timestamp and level.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
    filename=log_path,
)

logging.info("Start analyzing parquet with DuckDB")

In [9]:
# Publish the MinIO credentials under the AWS-style variable names so they are
# available to DuckDB through the process environment.
os.environ.update(
    AWS_ACCESS_KEY_ID=MINIO_ACCESS_KEY,
    AWS_SECRET_ACCESS_KEY=MINIO_SECRET_KEY,
)

In [10]:
# Prepare parquet path for the current day's dataset in the MinIO bucket.
# NOTE(review): the glob only matches files whose name starts with
# "part-00000"; if the dataset is ever written as several part files, the
# others would be excluded from every query below — confirm this is intended.
parquet_path = f"s3://{MINIO_BUCKET}/{day}/flight_weather.parquet/part-00000*.parquet"

try:
    # In-memory DuckDB connection; kept in `db` so it stays available after
    # this cell finishes.
    db = duckdb.connect()

    # Enable S3 (httpfs) and point it at the MinIO endpoint.
    db.execute("INSTALL httpfs;")
    db.execute("LOAD httpfs;")
    db.execute("SET s3_region='us-east-1';")
    db.execute(f"SET s3_endpoint='{MINIO_ENDPOINT}';")
    db.execute("SET s3_url_style='path';")
    db.execute("SET s3_use_ssl=false;")  # Because MinIO on local does not use HTTPS

    logging.info(f"Connected to MinIO at {parquet_path}")

    # Sample records: first five rows, all columns.
    logging.info("Sample records:")
    sample_df = db.execute(f"""
        SELECT *
        FROM '{parquet_path}'
        LIMIT 5
    """).fetchdf()
    print(sample_df)

    # Airline summary: number of rows per airline, largest first.
    logging.info("Flight count by airline:")
    airline_df = db.execute(f"""
        SELECT airline, COUNT(*) as total_flights
        FROM '{parquet_path}'
        GROUP BY airline
        ORDER BY total_flights DESC
    """).fetchdf()
    print(airline_df)

    # Weather summary: number of rows per weather value, largest first.
    logging.info("Top weather conditions:")
    weather_df = db.execute(f"""
        SELECT weather, COUNT(*) as count
        FROM '{parquet_path}'
        GROUP BY weather
        ORDER BY count DESC
    """).fetchdf()
    print(weather_df)

    logging.info("Analysis complete!")

except Exception as e:
    # Best-effort cell: report the failure instead of raising.
    # logging.exception logs at ERROR level like logging.error did, but also
    # appends the traceback so the log shows which statement failed.
    logging.exception(f"DuckDB analysis failed: {e}")
    print("Error during DuckDB analysis:", e)

  flight_iata departure_iata arrival_iata               arrival_time  \
0       TG640            BKK          NRT  2025-03-31T06:20:00+00:00   
1      NH5952            BKK          NRT  2025-03-31T06:20:00+00:00   
2      NH5954            BKK          NRT  2025-03-31T08:10:00+00:00   
3      NH5956            BKK          NRT  2025-03-30T15:45:00+00:00   
4       NH806            BKK          NRT  2025-03-30T15:20:00+00:00   

                      airline                   arr_airport   city iata  temp  \
0  Thai Airways International  Narita International Airport  Tokyo  NRT   7.5   
1                         ANA  Narita International Airport  Tokyo  NRT   7.5   
2                         ANA  Narita International Airport  Tokyo  NRT   7.5   
3                         ANA  Narita International Airport  Tokyo  NRT   7.5   
4                         ANA  Narita International Airport  Tokyo  NRT   7.5   

   humidity  weather  wind_speed  
0      56.0  Unknown        6.17  
1      56.