# Environment Setup

In [1]:
import os
from dotenv import load_dotenv

# Load environment variables from .env
load_dotenv()

# Switch to the project's main directory so data files can be referenced
# with relative paths
working_directory = os.getenv("WORKING_DIRECTORY")
if not working_directory:
    # Fail with an explicit message instead of the opaque TypeError that
    # os.chdir(None) raises when the variable is missing
    raise RuntimeError(
        "WORKING_DIRECTORY is not set; define it in .env or the environment."
    )
os.chdir(working_directory)

# Confirming that the working directory was changed
%pwd

'/workspaces/Live-Air-Quality'

In [2]:
from pathlib import Path

# Sanity check: the current working directory should contain a .git folder
git_folder_found = Path(".git").exists()
print("Git folder exists:", git_folder_found)

Git folder exists: True


# 1. Database Manager

In [None]:
from pathlib import Path
from duckdb import DuckDBPyConnection

import duckdb as ddb


def database_connect(db_path: Path, s3_config: dict | None = None) -> DuckDBPyConnection:
    """
    Connect to the DuckDB database at the specified path and, when credentials
    are supplied, configure S3 access settings on the connection.

    Args:
        db_path (Path): Path to the DuckDB database file.
        s3_config (dict | None): AWS S3 credentials with the keys
            ``access_key``, ``secret_key`` and ``region``. Defaults to None,
            in which case no S3 settings are applied.

    Returns:
        DuckDBPyConnection: Active database connection.

    Raises:
        KeyError: If ``s3_config`` is provided but lacks a required key.
    """
    conn = ddb.connect(str(db_path))

    if not s3_config:
        return conn

    try:
        s3_settings = (
            ("s3_access_key_id", s3_config["access_key"]),
            ("s3_secret_access_key", s3_config["secret_key"]),
            ("s3_region", s3_config["region"]),
        )
        for setting_name, setting_value in s3_settings:
            # Double any single quote so a credential containing "'" cannot
            # terminate the SQL string literal early
            escaped_value = str(setting_value).replace("'", "''")
            conn.execute(f"SET {setting_name}='{escaped_value}';")
    except Exception:
        # Do not leak the open connection when configuration fails
        conn.close()
        raise

    return conn


def database_close(conn: DuckDBPyConnection) -> None:
    """
    Close the DuckDB database connection.

    Args:
        conn (DuckDBPyConnection): Database connection to be closed. Passing
            None is accepted and does nothing.

    Returns:
        None
    """
    if conn is None:
        return

    # NOTE(review): the previous implementation guarded this call with
    # conn.is_closed(), which does not appear to be part of the public
    # DuckDBPyConnection API and would raise AttributeError - confirm.
    # close() is therefore called directly on any non-None connection.
    conn.close()


def database_aggregate_sql_paths(dir: Path) -> list[Path]:
    """
    Walk the given directory tree and gather the path of every `.sql` file.

    The extension check is case-insensitive, so `.SQL` files are included.

    Args:
        dir (Path): Root directory to search for SQL script files.

    Returns:
        list[Path]: Paths to the `.sql` files found, in sorted order.
    """
    matches = [
        Path(folder, filename)
        for folder, _subfolders, filenames in os.walk(dir)
        for filename in filenames
        if filename.lower().endswith(".sql")
    ]
    matches.sort()
    return matches


def database_load_query(query_path: Path) -> str:
    """
    Read the full contents of a SQL file as UTF-8 text.

    Args:
        query_path (Path): Path to the .sql file.

    Returns:
        str: The file's contents, i.e. the SQL query text.
    """
    with open(query_path, mode="r", encoding="utf-8") as sql_file:
        sql_text = sql_file.read()
    return sql_text


def database_execute_sql_query(conn: DuckDBPyConnection, query: str) -> None:
    """
    Run a SQL query through the given DuckDB connection.

    Whatever `conn.execute` returns is discarded.

    Args:
        conn (DuckDBPyConnection): Active DuckDB connection.
        query (str): SQL query to execute.

    Returns:
        None
    """
    _ = conn.execute(query)
    return None


def database_initialize(db_path: Path, ddl_dir: Path) -> None:
    """
    Build the DuckDB database schema by running every .sql file found under
    the given directory, in sorted path order.

    Args:
        db_path (Path): Path to the DuckDB database file.
        ddl_dir (Path): Directory containing .sql files for schema creation.

    Returns:
        None
    """
    script_paths = database_aggregate_sql_paths(dir=ddl_dir)
    connection = database_connect(db_path=db_path)

    try:
        for script_path in script_paths:
            database_execute_sql_query(
                conn=connection,
                query=database_load_query(query_path=script_path),
            )
    finally:
        # Release the connection whether or not a script failed
        database_close(conn=connection)


def database_drop(db_path: Path) -> None:
    """
    Remove the DuckDB database file at the specified path.

    A file that does not exist is silently ignored.

    Args:
        db_path (Path): Path to the DuckDB database file.

    Returns:
        None
    """
    try:
        db_path.unlink()
    except FileNotFoundError:
        # Nothing to delete; treat an absent file as already dropped
        pass