# Environment Setup

In [4]:
import os
from dotenv import load_dotenv

# Load environment variables from the local .env file
load_dotenv()

# Switch to the project's main directory so relative data paths resolve from there
working_directory = os.getenv("WORKING_DIRECTORY")
if not working_directory:
    # os.chdir(None) would raise an opaque TypeError; fail with an actionable message instead
    raise RuntimeError("WORKING_DIRECTORY is not set; define it in the environment or in the .env file")
os.chdir(working_directory)

# Checking the change
%pwd

'/workspaces/Live-Air-Quality'

In [5]:
from pathlib import Path

# Confirm the current working directory is the repository root by looking for its .git folder
print("Git folder exists:", (Path.cwd() / ".git").exists())

Git folder exists: True


# 1. Location IDs

# 2. Database Manager

In [6]:
from duckdb import DuckDBPyConnection
from pathlib import Path
from AQI.utils.common import create_directories

import os
import duckdb as ddb

def database_connect(db_path: Path, s3_config: dict | None = None) -> DuckDBPyConnection:
    """
    Connect to the DuckDB database at the specified path and optionally configure S3 access credentials.

    Args:
        db_path (Path): Path to the DuckDB database file.
        s3_config (dict | None): AWS S3 credentials with the keys `access_key`,
            `secret_key` and `region`. Defaults to None, in which case no S3
            settings are applied.

    Returns:
        DuckDBPyConnection: Active database connection.

    Raises:
        KeyError: If `s3_config` is given but lacks one of the required keys.
    """
    conn = ddb.connect(str(db_path))

    if s3_config:
        try:
            s3_settings = {
                "s3_access_key_id": s3_config["access_key"],
                "s3_secret_access_key": s3_config["secret_key"],
                "s3_region": s3_config["region"],
            }

            for setting, value in s3_settings.items():
                # Double embedded single quotes so a credential containing "'"
                # cannot terminate the SQL string literal early.
                escaped = str(value).replace("'", "''")
                conn.sql(f"SET {setting}='{escaped}';")
        except Exception:
            # Do not leak the open connection when configuration fails
            conn.close()
            raise

    return conn


def database_close(conn: DuckDBPyConnection) -> None:
    """
    Close a DuckDB connection; passing None is a no-op.

    Args:
        conn (DuckDBPyConnection): Connection to close, or None when there is nothing to close.

    Returns:
        None
    """
    if conn is None:
        return

    conn.close()


def database_aggregate_sql_paths(dir: Path) -> list[Path]:
    """
    Walk a directory tree and gather the paths of all SQL script files in it.

    Args:
        dir (Path): Root directory to search; sub-directories are searched as well.

    Returns:
        list[Path]: Paths of the files whose name ends in `.sql` (case-insensitive), in sorted order.
    """
    matches = [
        Path(folder) / name
        for folder, _, names in os.walk(dir)
        for name in names
        if name.lower().endswith(".sql")
    ]
    matches.sort()

    return matches


def database_load_query(query_path: Path) -> str:
    """
    Read the full text of a SQL file.

    Args:
        query_path (Path): Location of the .sql file, read as UTF-8.

    Returns:
        str: Entire content of the file.
    """
    with open(query_path, encoding="utf-8") as sql_file:
        sql_text = sql_file.read()

    return sql_text


def database_execute_sql_query(conn: DuckDBPyConnection, query: str) -> None:
    """
    Run a SQL statement through an open DuckDB connection.

    Args:
        conn (DuckDBPyConnection): Open connection the statement is executed on.
        query (str): SQL text to run.

    Returns:
        None: The result of the execution is discarded; any exception raised by
        the connection propagates to the caller.
    """
    conn.execute(query)


def database_initialize(db_path: Path, ddl_dir: Path) -> None:
    """
    Build the DuckDB database by running every .sql script found under a directory.

    Args:
        db_path (Path): Path to the DuckDB database file; its parent directory is created first.
        ddl_dir (Path): Directory searched recursively for .sql schema scripts, run in sorted path order.
    """
    create_directories([db_path.parent])

    ddl_scripts = database_aggregate_sql_paths(dir=ddl_dir)
    connection = database_connect(db_path=db_path)

    try:
        for script_path in ddl_scripts:
            database_execute_sql_query(
                conn=connection,
                query=database_load_query(query_path=script_path),
            )
    finally:
        # Close the connection even when one of the scripts fails
        database_close(conn=connection)


def database_drop(db_path: Path) -> None:
    """
    Remove the DuckDB database file, doing nothing when it does not exist.

    Args:
        db_path (Path): Path to the DuckDB database file.

    Returns:
        None
    """
    try:
        db_path.unlink()
    except FileNotFoundError:
        # Dropping a database that is already gone is a no-op
        pass

### Testing commands

In [4]:
# Database file and DDL script directory used by the manual test cells
db_location = Path("artifacts/sql/air_quality.db")
ddl_location = Path("src/AQI/sql/ddl")

# Manual toggle; not read by any of the cells shown here
creation_or_deletion = True

In [7]:
# Create the database file (and its parent directory) and run every .sql script found under ddl_location
database_initialize(db_path=db_location, ddl_dir=ddl_location)

[2025-09-01 13:40:55,177: INFO: common: Directory: artifacts/sql created successfully.]


In [6]:
# Delete the database file at db_location; a missing file is not an error
database_drop(db_path=db_location)

# 3. Data Extraction

In [1]:
from datetime import datetime
from dateutil.relativedelta import relativedelta
from jinja2 import Template
from pathlib import Path

from AQI.utils.common import load_json


def insert_api_data(location_path: Path, db_path: Path, query_path: Path, start_date: str | None, end_date: str | None) -> None:
    """
    Load OpenAQ archive data into the database for every location and month in a date range.

    Args:
        location_path (Path): JSON file whose keys are the location IDs to load.
        db_path (Path): Path to the DuckDB database file.
        query_path (Path): SQL template file, rendered once per location and month with the archive path.
        start_date (str | None): First month as "YYYY-MM"; the current date is used when empty.
        end_date (str | None): Last month as "YYYY-MM" (inclusive); the current date is used when empty.

    A location/month whose query fails is reported on stdout and skipped; the
    remaining ones are still processed.
    """
    location_ids = extract_location_ids(file_path=location_path)
    connection = database_connect(db_path=db_path)

    try:
        sql_template = database_load_query(query_path)

        first_month = parse_date(start_date)
        last_month = parse_date(end_date, default=datetime.now())
        months = generate_range(first_month, last_month)

        # One (location, month) pair per archive partition, locations outermost
        partitions = ((location_id, month) for location_id in location_ids for month in months)

        for location_id, month in partitions:
            source_path = render_openaq_path(location_id, year=month.year, month=month.month)
            statement = render_query(sql_template, source_path)

            try:
                database_execute_sql_query(conn=connection, query=statement)
            except Exception as error:
                # Best effort: report the failed partition and carry on with the rest
                print(f"Failed: id={location_id}, date={month:%Y-%m}, error={error}")

    finally:
        database_close(conn=connection)


def extract_location_ids(file_path: Path) -> list[str]:
    """Read a JSON file and return its top-level keys, each converted to a string."""
    locations = load_json(file_path)
    return list(map(str, locations.keys()))


def parse_date(date_str: str | None, default: datetime | None = None) -> datetime:
    """Parse YYYY-MM string into datetime, fallback to default or now()."""
    if date_str:
        return datetime.strptime(date_str, "%Y-%m")
    return default or datetime.now()


def generate_range(start: datetime, end: datetime) -> list[datetime]:
    """
    Return the first day of every month from start's month through end's month (inclusive).

    Args:
        start (datetime): Any moment inside the first month of the range.
        end (datetime): Any moment inside the last month of the range.

    Returns:
        list[datetime]: Month starts at midnight in ascending order, carrying
        start's tzinfo. Empty when end's month precedes start's month.
    """
    # Number months consecutively (year * 12 + zero-based month) so the range
    # is plain integer arithmetic and every element is a true month start,
    # whatever day or time of day `start` carries.
    first_index = start.year * 12 + start.month - 1
    last_index = end.year * 12 + end.month - 1

    return [
        datetime(index // 12, index % 12 + 1, 1, tzinfo=start.tzinfo)
        for index in range(first_index, last_index + 1)
    ]


def render_query(query: str, api_path: str) -> str:
    """Fill the `data_file_path` placeholder of a Jinja2 SQL template with the given path."""
    template = Template(query)
    return template.render(data_file_path=api_path)


def render_openaq_path(location_id: str, year: int, month: int) -> str:
    """
    Generate the OpenAQ S3 archive glob for one location and month.

    Args:
        location_id (str): OpenAQ location ID.
        year (int): Four-digit year of the partition.
        month (int): Month number (1-12) of the partition.

    Returns:
        str: S3 glob matching every daily `.csv.gz` file of that location and month.
    """
    # The archive's month partitions are zero-padded (month=05, not month=5),
    # so an unpadded value matches nothing for January through September.
    return f"s3://openaq-data-archive/records/csv.gz/locationid={location_id}/year={year}/month={int(month):02d}/*.csv.gz"