# 1. Environment Setup

In [1]:
import os
from dotenv import load_dotenv

# Loading environment variables from .env
load_dotenv()

# Changing directory to main directory for easy data access
working_directory = os.getenv("WORKING_DIRECTORY")
os.chdir(working_directory)

# Checking the change
%pwd

'/workspaces/Live-Air-Quality'

In [2]:
from pathlib import Path

# Sanity check: a .git folder should be visible from the new working directory
has_git_dir = Path(".git").exists()
print("Git folder exists:", has_git_dir)

Git folder exists: True


# 2. S3 Exploration

In [3]:
import duckdb as ddb

# Install the httpfs extension, then load it (one statement per call)
ddb.sql("INSTALL httpfs")
ddb.sql("LOAD httpfs")

In [4]:
from AQI.utils.common import create_directories
from pathlib import Path

# On-disk DuckDB database file used by this notebook
location = Path("research/sql/air_quality.db")
# Make sure the parent folder exists before connecting
create_directories([location.parent])

conn = ddb.connect(location)

# Extensions are loaded per database instance: loading httpfs through the
# module-level duckdb API does not load it for this file-backed connection.
# Install/load it here so the S3 reads below do not depend on autoloading.
conn.install_extension("httpfs")
conn.load_extension("httpfs")

[2025-08-17 10:18:29,436: INFO: common: Directory: research/sql created successfully.]


In [5]:
# Create the `raw` schema on first run; later runs leave it untouched
create_raw_schema = "CREATE schema IF NOT EXISTS raw"
conn.execute(create_raw_schema)

<duckdb.duckdb.DuckDBPyConnection at 0x71033b24d9f0>

In [6]:
# Take the S3 settings from the environment (including anything load_dotenv()
# read from .env) instead of typing secrets into the notebook. Variables that
# are unset fall back to '', the value this cell used to hardcode.
s3_settings = {
    "s3_access_key_id": os.getenv("AWS_ACCESS_KEY_ID", ""),
    "s3_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY", ""),
    "s3_region": os.getenv("AWS_DEFAULT_REGION", ""),
}

for setting, value in s3_settings.items():
    # Double any single quote so the value stays a valid SQL string literal
    escaped = value.replace("'", "''")
    conn.execute(f"SET {setting}='{escaped}'")

In [7]:
# Column names that could clash with DuckDB keywords are wrapped in double quotes.
create_table_sql = """
    CREATE TABLE IF NOT EXISTS raw.air_quality_data (
        location_id BIGINT,
        sensors_id BIGINT,
        "location" VARCHAR,
        "datetime" TIMESTAMP,
        lat DOUBLE,
        lon DOUBLE,
        "parameter" VARCHAR,
        units VARCHAR,
        "value" DOUBLE,
        "month" VARCHAR,
        "year" BIGINT,
        ingestion_datetime TIMESTAMP
    );
"""

# The statement is a no-op when the table already exists
conn.execute(create_table_sql)

<duckdb.duckdb.DuckDBPyConnection at 0x71033b24d9f0>

In [None]:
# Partition of the openaq-data-archive bucket to ingest.
# `location_id` replaces the former `id`, which shadowed the built-in id().
location_id = "384"
year = "2025"
month = "01"

# Every gzipped CSV under the selected location/year/month prefix
source_glob = (
    "s3://openaq-data-archive/records/csv.gz/"
    f"locationid={location_id}/year={year}/month={month}/*.csv.gz"
)

# Append the partition's rows, stamping each with the time of ingestion.
# NOTE: this is a plain INSERT, so re-running the cell appends the same rows again.
conn.execute(f"""
INSERT INTO raw.air_quality_data
SELECT
    location_id,
    sensors_id,
    "location",
    "datetime",
    lat,
    lon,
    "parameter",
    units,
    "value",
    "month",
    "year",
    current_timestamp AS ingestion_datetime
FROM read_csv('{source_glob}');
""")

<duckdb.duckdb.DuckDBPyConnection at 0x71033b24d9f0>

In [9]:
# Close the DuckDB connection now that the notebook is done with it
conn.close()