In [None]:
'''
This is a script to take raw CSVs in the data/raw_csvs folder and convert them to
Parquet files for manageable GitHub storage and limited memory usage (DuckDB).
'''

import pandas as pd
from pathlib import Path

# Where the raw CSVs live and where the Parquet files are written
input_dir = Path("data/raw_csvs")
output_dir = Path("data")

# Create the destination folder (and any missing parents) up front
output_dir.mkdir(parents=True, exist_ok=True)

# Convert every CSV found directly inside the input folder, one at a time
for csv_file in input_dir.glob("*.csv"):
    print(f"Converting {csv_file.name}...")

    # Same file name as the CSV, with the extension swapped for .parquet
    parquet_file = output_dir / f"{csv_file.stem}.parquet"

    # Load the whole CSV, then persist it as Parquet without the pandas index
    df = pd.read_csv(csv_file)
    df.to_parquet(parquet_file, engine="pyarrow", index=False)
    print(f"Saved to {parquet_file}")

    # The CSV is removed only after the Parquet write above has returned
    csv_file.unlink()
    print(f"Deleted original CSV: {csv_file.name}")

print("✅ CSV to Parquet conversion complete.")

In [1]:
'''
This code updates the asset data

'''

import duckdb
import os
from dotenv import load_dotenv
from urllib.parse import quote_plus

load_dotenv()

# Fail fast with a readable message; otherwise quote_plus(None) raises an opaque TypeError
required_env_vars = (
    "CLIMATETRACE_USER",
    "CLIMATETRACE_PASS",
    "CLIMATETRACE_HOST",
    "CLIMATETRACE_PORT",
    "CLIMATETRACE_DB",
)
missing_env_vars = [name for name in required_env_vars if os.getenv(name) is None]
if missing_env_vars:
    raise RuntimeError(f"Missing required environment variables: {', '.join(missing_env_vars)}")

# Build the PostgreSQL connection URL consumed by DuckDB's postgres_scan.
# User and password are percent-encoded so special characters survive inside the URL.
user = quote_plus(os.getenv("CLIMATETRACE_USER"))
password = quote_plus(os.getenv("CLIMATETRACE_PASS"))
host = os.getenv("CLIMATETRACE_HOST")
port = os.getenv("CLIMATETRACE_PORT")
database = os.getenv("CLIMATETRACE_DB")

postgres_url = f"postgresql://{user}:{password}@{host}:{port}/{database}"
parquet_path = "data/asset_emissions_country_subsector.parquet"

# Use DuckDB to write directly from PostgreSQL to Parquet
con = duckdb.connect()
try:
    # The postgres extension must be available before the first postgres_scan call
    con.execute("INSTALL postgres")
    con.execute("LOAD postgres")

    print("Getting max month...")
    max_date = con.execute(f"""
    select max(start_time)
    from postgres_scan('{postgres_url}', 'public', 'asset_emissions')
""").fetchone()[0]

    # max() over an empty table yields NULL; interpolating None below would build DATE 'None'
    if max_date is None:
        raise RuntimeError("asset_emissions has no start_time values; nothing to export.")

    # Aggregates most-granular co2e_100yr / ch4 asset emissions by country, subsector,
    # start_time, gas and release, from three years before the start of the latest
    # year onwards, then writes the result to parquet_path.
    print("Running asset-level query and writing to parquet file, this may take a while...")
    con.execute(f"""
    CREATE TABLE asset_emissions_parquet AS
    SELECT ae.iso3_country,
        ae.original_inventory_sector,
        ae.start_time,
        ae.gas,
        sch.sector,
        ca.name as country_name,
        ca.continent,
        ca.unfccc_annex,
        ca.em_finance,
        ca.eu,
        ca.oecd,
        ca.developed_un,
        ae.release,
        sum(emissions_quantity) emissions_quantity,
        sum(activity) activity,
        sum(emissions_quantity) / sum(activity) weighted_average_emissions_factor

    FROM postgres_scan('{postgres_url}', 'public', 'asset_emissions') ae
    LEFT JOIN postgres_scan('{postgres_url}', 'public', 'country_analysis') ca
        ON CAST(ca.iso3_country AS VARCHAR) = CAST(ae.iso3_country AS VARCHAR)
    LEFT JOIN (
        SELECT DISTINCT sector, subsector FROM postgres_scan('{postgres_url}', 'public', 'asset_schema')
    ) sch
        ON CAST(sch.subsector AS VARCHAR) = CAST(ae.original_inventory_sector AS VARCHAR)

    WHERE ae.start_time >= (
                date_trunc('year', DATE '{max_date}') - INTERVAL '3 YEARS'
            )
      AND ae.gas in ('co2e_100yr','ch4')
      AND ae.most_granular = TRUE

    GROUP BY ae.iso3_country,
        ae.original_inventory_sector,
        ae.start_time,
        ae.gas,
        sch.sector,
        ca.name,
        ca.continent,
        ca.unfccc_annex,
        ca.em_finance,
        ca.eu,
        ca.oecd,
        ca.developed_un,
        ae.release;

    COPY asset_emissions_parquet TO '{parquet_path}' (FORMAT PARQUET);
""")
finally:
    # Release the in-memory database even when a query fails
    con.close()

print("✅ Asset parquet file exported")

Getting max month...
Running asset-level query and writing to parquet file, this may take a while...
✅ Asset parquet file exported


In [3]:
# ------------------------------------ Asset Annual Emissions ------------------------------------

import duckdb
import os
from dotenv import load_dotenv
from urllib.parse import quote_plus

load_dotenv()

# Fail fast with a readable message; otherwise quote_plus(None) raises an opaque TypeError
required_env_vars = (
    "CLIMATETRACE_USER",
    "CLIMATETRACE_PASS",
    "CLIMATETRACE_HOST",
    "CLIMATETRACE_PORT",
    "CLIMATETRACE_DB",
)
missing_env_vars = [name for name in required_env_vars if os.getenv(name) is None]
if missing_env_vars:
    raise RuntimeError(f"Missing required environment variables: {', '.join(missing_env_vars)}")

# Build the PostgreSQL connection URL consumed by DuckDB's postgres_scan.
# User and password are percent-encoded so special characters survive inside the URL.
user = quote_plus(os.getenv("CLIMATETRACE_USER"))
password = quote_plus(os.getenv("CLIMATETRACE_PASS"))
host = os.getenv("CLIMATETRACE_HOST")
port = os.getenv("CLIMATETRACE_PORT")
database = os.getenv("CLIMATETRACE_DB")

postgres_url = f"postgresql://{user}:{password}@{host}:{port}/{database}"
parquet_path = "data/emissions_reduction/asset_annual_emissions.parquet"

# Calendar year of asset_emissions.start_time to export
emissions_year = 2024

# Use DuckDB to write directly from PostgreSQL to Parquet
con = duckdb.connect()
try:
    # The query below aggregates most-granular co2e_100yr asset emissions for
    # emissions_year per asset, joined with asset, location, sector, country and
    # balancing-authority attributes. The "ers" subquery attaches co2e_100yr rows of
    # reductions_data_fusion for the strategy with strategy_rank = 1, or for unranked
    # strategies when the asset has no ranked strategy at all.
    # Doubled braces ({{ }}) are f-string escapes producing literal braces in the SQL.
    print('Running query...')
    con.execute( f'''
	INSTALL postgres;
	LOAD postgres;

	CREATE TABLE asset_annual_emissions_parquet AS
	select extract(year from ae.start_time) as year
		, ae.asset_id
		, ai.asset_type
		, CASE 
				WHEN ae.original_inventory_sector = 'iron-and-steel' AND ai.asset_type LIKE '%BF%' 
					THEN '{{''iron-and-steel'': [''BF'', ''DRI-EAF'']}}'
				WHEN ae.original_inventory_sector = 'aluminum' AND ai.asset_type LIKE '%Refinery%' 
					THEN '{{''aluminum'': [''Refinery'']}}'
				WHEN ae.original_inventory_sector = 'aluminum' AND ai.asset_type LIKE '%Smelting%' 
					THEN '{{''aluminum'': [''Smelting'']}}'
				ELSE 'all' 
			END AS asset_type_2
		, ai.asset_name
		, ae.iso3_country
		, ca.name as country_name
        , abc.region balancing_authority_region
        , ca.continent
        , ca.eu
        , ca.oecd
        , ca.unfccc_annex
        , ca.developed_un
        , ca.em_finance
		, asch.sector
		, ae.original_inventory_sector as subsector
		, al.gadm_1
		, al.gadm_2
		, al.ghs_fua
		, al.city_id
		, ae.other1
		, ae.other2
		, ae.other3
		, ae.other4
		, ae.other5
		, ae.other6
		, ae.other7
		, ae.other8
		, ae.other9
		, ae.other10
		, ae.activity_units
		, sum(capacity) capacity
		, sum(activity) activity
		, avg(emissions_factor) average_emissions_factor
		, sum(emissions_quantity) emissions_quantity
        , ers.strategy_id
		, ers.strategy_name
		, ers.strategy_description
		, ers.mechanism
		, ers.old_activity
		, ers.affected_activity
		, ers.old_emissions_factor
		, ers.new_emissions_factor
		, ers.emissions_reduced_at_asset
		, ers.induced_sector_1
		, ers.induced_sector_1_induced_emissions
		, ers.induced_sector_2
		, ers.induced_sector_2_induced_emissions
		, ers.induced_sector_3
		, ers.induced_sector_3_induced_emissions
		, ers.total_emissions_reduced_per_year

	from postgres_scan('{postgres_url}','public', 'asset_emissions') ae
	left join postgres_scan('{postgres_url}','public', 'asset_information') ai
		on ai.asset_id = ae.asset_id
	left join postgres_scan('{postgres_url}','public', 'asset_location') al
		on al.asset_id = ae.asset_id
	left join (
		select distinct sector, subsector from postgres_scan('{postgres_url}','public', 'asset_schema')
	) asch
		on cast(asch.subsector as varchar) = cast(ae.original_inventory_sector as varchar)
	left join postgres_scan('{postgres_url}','public', 'country_analysis') ca
		on cast(ca.iso3_country as varchar) = cast(ae.iso3_country as varchar)
    left join postgres_scan('{postgres_url}','public', 'asset_ba_crosswalk') abc
		on abc.asset_id = ae.asset_id
    left join (
		select rdf.* 

		from postgres_scan('{postgres_url}','public','reductions_data_fusion') rdf
		inner join (
			SELECT *
			FROM postgres_scan('{postgres_url}','public', 'strategy_crosswalk_staging')
			WHERE strategy_rank = 1
			
			UNION
			
			SELECT *
			FROM postgres_scan('{postgres_url}','public', 'strategy_crosswalk_staging') a
			WHERE strategy_rank IS NULL
			AND NOT EXISTS (
				SELECT 1
				FROM postgres_scan('{postgres_url}','public', 'strategy_crosswalk_staging') b
				WHERE a.asset_id = b.asset_id 
					AND b.strategy_rank IS NOT NULL
			)
		) sc
			on sc.asset_id = rdf.asset_id
			and sc.strategy_id = rdf.strategy_id
			and rdf.gas = 'co2e_100yr'
    ) ers
		on ers.asset_id = ae.asset_id

	where extract(year from ae.start_time) = {emissions_year}
		and ae.most_granular = true
		and ae.gas = 'co2e_100yr'
		and ae.original_inventory_sector not in ('forest-land-clearing',
													'forest-land-degradation',
													'forest-land-fires',
													'net-forest-land',
													'net-shrubgrass',
													'net-wetland',
													'removals',
													'shrubgrass-fires',
													'water-reservoirs',
													'wetland-fires')

	group by extract(year from ae.start_time)
		, ae.asset_id
		, ai.asset_type
        , CASE 
				WHEN ae.original_inventory_sector = 'iron-and-steel' AND ai.asset_type LIKE '%BF%' 
					THEN '{{''iron-and-steel'': [''BF'', ''DRI-EAF'']}}'
				WHEN ae.original_inventory_sector = 'aluminum' AND ai.asset_type LIKE '%Refinery%' 
					THEN '{{''aluminum'': [''Refinery'']}}'
				WHEN ae.original_inventory_sector = 'aluminum' AND ai.asset_type LIKE '%Smelting%' 
					THEN '{{''aluminum'': [''Smelting'']}}'
				ELSE 'all' 
			END
		, ai.asset_name
		, ae.iso3_country
		, ca.name
        , abc.region
        , ca.continent
        , ca.eu
        , ca.oecd
        , ca.unfccc_annex
        , ca.developed_un
        , ca.em_finance
		, asch.sector
		, ae.original_inventory_sector
		, al.gadm_1
		, al.gadm_2
		, al.ghs_fua
		, al.city_id
		, ae.other1
		, ae.other2
		, ae.other3
		, ae.other4
		, ae.other5
		, ae.other6
		, ae.other7
		, ae.other8
		, ae.other9
		, ae.other10
		, ae.activity_units
        , ers.strategy_id
		, ers.strategy_name
		, ers.strategy_description
		, ers.mechanism
		, ers.old_activity
		, ers.affected_activity
		, ers.old_emissions_factor
		, ers.new_emissions_factor
		, ers.emissions_reduced_at_asset
		, ers.induced_sector_1
		, ers.induced_sector_1_induced_emissions
		, ers.induced_sector_2
		, ers.induced_sector_2_induced_emissions
		, ers.induced_sector_3
		, ers.induced_sector_3_induced_emissions
		, ers.total_emissions_reduced_per_year;
            
    COPY asset_annual_emissions_parquet TO '{parquet_path}' (FORMAT PARQUET);
            
    ''')
finally:
    # Release the in-memory database even when the query fails
    con.close()

print('Complete')

Running query...
Complete


In [4]:
## ---------------------------------- ADD MOER FACTORS --------------------------------------

# import duckdb
from utils.utils import data_add_moer
import pandas as pd

# Input: the asset-level annual emissions export; output: the same data after
# data_add_moer has been applied to it.
asset_parquet_path = 'data/emissions_reduction/asset_annual_emissions.parquet'
output_path = 'data/emissions_reduction/asset_annual_emissions_moer.parquet'

# Read the full asset file into memory
df_asset = pd.read_parquet(path=asset_parquet_path)

# Delegate to the project helper with the "moer" condition switched on
moer_condition = dict(moer=True)
asset_moer_df = data_add_moer(df_asset, cond=moer_condition)

# Persist the helper's result without the pandas index
asset_moer_df.to_parquet(path=output_path, index=False)


In [5]:
# ------------------------------------ SPLITS LARGE ASSET FILE INTO ~50MB CHUNKS ---------------------------------


import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os

# === CONFIG ===
input_file = "data/emissions_reduction/asset_annual_emissions_moer.parquet"  # Your large file
output_dir = "data/asset_annual_emissions"  # Destination folder
target_size_mb = 50  # Keep each file safely under 100MB
os.makedirs(output_dir, exist_ok=True)

# Load full Parquet into DataFrame
df = pd.read_parquet(input_file)
total_rows = len(df)
if total_rows == 0:
    # Without this guard the per-row size estimate below divides by zero
    raise ValueError(f"{input_file} contains no rows; nothing to split.")

# Remove chunks left over from an earlier run. If that run produced more chunks than
# this one, the extra files would otherwise stay in the folder and mix with new data.
# Done only after the input was read successfully, so a bad input never wipes outputs.
for stale_name in os.listdir(output_dir):
    if stale_name.startswith("chunk_") and stale_name.endswith(".parquet"):
        os.remove(os.path.join(output_dir, stale_name))

# Estimate file size per row by serialising a small sample to an in-memory buffer
# (no temporary file is left behind in the working directory if something fails)
test_sample = df.iloc[:10000]
test_table = pa.Table.from_pandas(test_sample)
size_probe = pa.BufferOutputStream()
pq.write_table(test_table, size_probe)
bytes_per_row = size_probe.getvalue().size / len(test_sample)

# Determine number of rows per ~50MB chunk; never below 1, because range() rejects a step of 0
target_bytes = target_size_mb * 1024 * 1024
rows_per_chunk = max(1, int(target_bytes / bytes_per_row))

# Split and write files; `end` is exclusive, so consecutive chunks share a boundary value
for i, start in enumerate(range(0, total_rows, rows_per_chunk)):
    end = min(start + rows_per_chunk, total_rows)
    chunk_df = df.iloc[start:end]
    chunk_table = pa.Table.from_pandas(chunk_df)
    output_path = os.path.join(output_dir, f"chunk_{i+1}.parquet")
    pq.write_table(chunk_table, output_path)
    size_mb = os.path.getsize(output_path) / (1024 * 1024)
    print(f"Saved {output_path} ({size_mb:.1f} MB, rows {start}–{end})")


Saved data/asset_annual_emissions/chunk_1.parquet (51.1 MB, rows 0–428377)
Saved data/asset_annual_emissions/chunk_2.parquet (51.2 MB, rows 428377–856754)
Saved data/asset_annual_emissions/chunk_3.parquet (51.3 MB, rows 856754–1285131)
Saved data/asset_annual_emissions/chunk_4.parquet (51.2 MB, rows 1285131–1713508)
Saved data/asset_annual_emissions/chunk_5.parquet (51.2 MB, rows 1713508–2141885)
Saved data/asset_annual_emissions/chunk_6.parquet (44.8 MB, rows 2141885–2513420)


In [5]:

# ------------------------------------ GADM 1 Emissions ------------------------------------

import duckdb
import os
from dotenv import load_dotenv
from urllib.parse import quote_plus

load_dotenv()

# Fail fast with a readable message; otherwise quote_plus(None) raises an opaque TypeError
required_env_vars = (
    "CLIMATETRACE_USER",
    "CLIMATETRACE_PASS",
    "CLIMATETRACE_HOST",
    "CLIMATETRACE_PORT",
    "CLIMATETRACE_DB",
)
missing_env_vars = [name for name in required_env_vars if os.getenv(name) is None]
if missing_env_vars:
    raise RuntimeError(f"Missing required environment variables: {', '.join(missing_env_vars)}")

# Build the PostgreSQL connection URL consumed by DuckDB's postgres_scan.
# User and password are percent-encoded so special characters survive inside the URL.
user = quote_plus(os.getenv("CLIMATETRACE_USER"))
password = quote_plus(os.getenv("CLIMATETRACE_PASS"))
host = os.getenv("CLIMATETRACE_HOST")
port = os.getenv("CLIMATETRACE_PORT")
database = os.getenv("CLIMATETRACE_DB")

postgres_url = f"postgresql://{user}:{password}@{host}:{port}/{database}"
parquet_path = "data/emissions_reduction/gadm_1_emissions.parquet"

# Calendar year of gadm_1_emissions.start_time to export
emissions_year = 2024

# Use DuckDB to write directly from PostgreSQL to Parquet
con = duckdb.connect()
try:
    # Aggregates co2e_100yr emissions for emissions_year per admin-level-1 GADM
    # boundary and subsector, excluding the listed forest/land/water subsectors,
    # and writes the result to parquet_path.
    print('Running query')
    con.execute(f'''
    INSTALL postgres;
    LOAD postgres;

    CREATE TABLE gadm_1_emissions_parquet AS
    select extract(year from g1e.start_time) as year 
        , g1e.gadm_id
        , gb.gid
        , gb.admin_level
        , g1e.iso3_country
        , ca.name as country_name
        , gb.name gadm_1_name
        , gb.corrected_name gadm_1_corrected_name
        , ca.continent
        , ca.eu
        , ca.oecd
        , ca.unfccc_annex
        , ca.developed_un
        , ca.em_finance
        , asch.sector
        , g1e.original_inventory_sector subsector
        , g1e.gas
        , sum(asset_activity) asset_activity
        , sum(asset_emissions) asset_emissions
        , sum(remainder_activity) remainder_activity
        , sum(remainder_emissions) remainder_emissions
        , sum(asset_emissions) + sum(remainder_emissions) as emissions_quantity

    from postgres_scan('{postgres_url}', 'public', 'gadm_1_emissions') g1e
    inner join (
        select distinct gadm_id
            , gid
            , name
            , corrected_name
            , admin_level
        from postgres_scan('{postgres_url}','public', 'gadm_boundaries') 
        where admin_level = 1
    ) as gb
        on g1e.gadm_id = gb.gadm_id
    left join (
        select distinct sector
            , subsector
        from postgres_scan('{postgres_url}','public', 'asset_schema') 
    ) asch
        on cast(asch.subsector as varchar) = cast(g1e.original_inventory_sector as varchar)
    left join postgres_scan('{postgres_url}','public', 'country_analysis') ca
		on cast(ca.iso3_country as varchar) = cast(g1e.iso3_country as varchar)

    where g1e.gas = 'co2e_100yr'
        and extract(year from g1e.start_time) = {emissions_year}
        and g1e.original_inventory_sector not in ('forest-land-clearing',
                                                'forest-land-degradation',
                                                'forest-land-fires',
                                                'net-forest-land',
                                                'net-shrubgrass',
                                                'net-wetland',
                                                'removals',
                                                'shrubgrass-fires',
                                                'water-reservoirs',
                                                'wetland-fires')

    group by extract(year from g1e.start_time) 
        , g1e.gadm_id
        , gb.gid
        , gb.admin_level
        , g1e.iso3_country
        , ca.name
        , gb.name 
        , gb.corrected_name
        , ca.continent
        , ca.eu
        , ca.oecd
        , ca.unfccc_annex
        , ca.developed_un
        , ca.em_finance
        , asch.sector
        , g1e.original_inventory_sector
        , g1e.gas;

    COPY gadm_1_emissions_parquet TO '{parquet_path}' (FORMAT PARQUET);
''')
finally:
    # Release the in-memory database even when the query fails
    con.close()

print('Complete')

Running query
Complete


In [6]:
# --------------------------------------------------------- GADM 2 BATCH -----------------------------------------------------------------

import psycopg2
from urllib.parse import quote_plus
import pyarrow as pa
import pyarrow.parquet as pq
import csv
import os

# Load .env in this cell too, so it does not depend on an earlier cell having run
from dotenv import load_dotenv

load_dotenv()

# psycopg2 receives credentials as plain keyword arguments, not as a URL, and does not
# URL-decode them. They must therefore NOT be percent-encoded: quote_plus would turn
# e.g. "p@ss" into "p%40ss" and authentication would fail.
user = os.getenv("CLIMATETRACE_USER")
password = os.getenv("CLIMATETRACE_PASS")
host = os.getenv("CLIMATETRACE_HOST")
port = os.getenv("CLIMATETRACE_PORT")
database = os.getenv("CLIMATETRACE_DB")

# Parquet export settings and progress counters
batch_size = 10000
output_file = "data/emissions_reduction/gadm_2_emissions.parquet"
batch_count = 0
total_rows = 0

conn = psycopg2.connect(
    dbname=database,
    user=user,
    password=password,
    host=host,
    port=port
)

cur = None
writer = None
try:
    cur = conn.cursor(name='parquet_cursor')  # server-side cursor

    # Aggregates co2e_100yr emissions for 2024 per admin-level-2 GADM boundary and
    # subsector, attaching the parent admin-level-1 boundary and country attributes,
    # and excluding the listed forest/land/water subsectors.
    cur.execute("""
     select extract(year from ge.start_time) as year 
        , gb1.gadm_id gadm_1_id
        , gb1.name gadm_1_name
        , gb1.corrected_name gadm_1_corrected_name
        , ge.gadm_id gadm_2_id
        , gb2.name gadm_2_name
        , gb2.corrected_name gadm_2_corrected_name
        , gb2.gid
        , gb2.admin_level
        , ge.iso3_country
        , ca.name as country_name
        , ca.continent
        , ca.eu
        , ca.oecd
        , ca.unfccc_annex
        , ca.developed_un
        , ca.em_finance
        , asch.sector
        , ge.original_inventory_sector subsector
        , sum(asset_activity) asset_activity
        , sum(asset_emissions) asset_emissions
        , sum(remainder_activity) remainder_activity
        , sum(remainder_emissions) remainder_emissions
        , sum(asset_emissions) + sum(remainder_emissions) as emissions_quantity

    from gadm_emissions ge
    inner join (
        select distinct gadm_id
            , gid
            , immediate_parent
            , name
            , corrected_name
            , admin_level
        from gadm_boundaries
        where admin_level = 2
    ) as gb2
        on ge.gadm_id = gb2.gadm_id
    left join (
        select distinct sector
            , subsector
        from asset_schema
    ) asch
        on cast(asch.subsector as varchar) = cast(ge.original_inventory_sector as varchar)
    left join (
        select gadm_id
            , name
            , corrected_name
        from gadm_boundaries
        where admin_level = 1
    ) gb1
        on gb1.gadm_id = gb2.immediate_parent
    left join country_analysis ca
        on cast(ca.iso3_country as varchar) = cast(ge.iso3_country as varchar)

    where ge.gas = 'co2e_100yr'
        and extract(year from ge.start_time) = 2024
        and ge.original_inventory_sector not in ('forest-land-clearing',
                                                'forest-land-degradation',
                                                'forest-land-fires',
                                                'net-forest-land',
                                                'net-shrubgrass',
                                                'net-wetland',
                                                'removals',
                                                'shrubgrass-fires',
                                                'water-reservoirs',
                                                'wetland-fires')

    group by extract(year from ge.start_time)
        , gb1.gadm_id 
        , gb1.name
        , gb1.corrected_name
        , ge.gadm_id 
        , gb2.name
        , gb2.corrected_name
        , gb2.gid
        , gb2.admin_level
        , ge.iso3_country
        , ca.name
        , ca.continent
        , ca.eu
        , ca.oecd
        , ca.unfccc_annex
        , ca.developed_un
        , ca.em_finance
        , asch.sector
        , ge.original_inventory_sector
    """)

    print("executing gadm_2 query...")

    # Stream the result in batches so the full result set never sits in memory
    field_names = None
    while True:
        rows = cur.fetchmany(batch_size)
        if not rows:
            break

        # Column names are read only after the first fetch has happened
        if field_names is None:
            field_names = [desc[0] for desc in cur.description]

        table = pa.Table.from_pylist([dict(zip(field_names, row)) for row in rows])
        if writer is None:
            # The first batch defines the Parquet schema for the whole file
            writer = pq.ParquetWriter(output_file, table.schema)
        else:
            table = table.cast(writer.schema)  # ensure schema matches first batch
        writer.write_table(table)

        batch_count += 1
        total_rows += len(rows)
        print(f"Processed batch {batch_count} ({len(rows)} rows), total rows: {total_rows}")

    # No batch at all means the query matched nothing; no output file was created
    if writer is None:
        raise Exception("No data returned from query.")
finally:
    # Always finalise the Parquet file and release database resources, even on error
    if writer is not None:
        writer.close()
    if cur is not None:
        cur.close()
    conn.close()

print("Export complete.")

executing gadm_2 query...
Processed batch 1 (10000 rows), total rows: 10000
Processed batch 2 (10000 rows), total rows: 20000
Processed batch 3 (10000 rows), total rows: 30000
Processed batch 4 (10000 rows), total rows: 40000
Processed batch 5 (10000 rows), total rows: 50000
Processed batch 6 (10000 rows), total rows: 60000
Processed batch 7 (10000 rows), total rows: 70000
Processed batch 8 (10000 rows), total rows: 80000
Processed batch 9 (10000 rows), total rows: 90000
Processed batch 10 (10000 rows), total rows: 100000
Processed batch 11 (10000 rows), total rows: 110000
Processed batch 12 (10000 rows), total rows: 120000
Processed batch 13 (10000 rows), total rows: 130000
Processed batch 14 (10000 rows), total rows: 140000
Processed batch 15 (10000 rows), total rows: 150000
Processed batch 16 (10000 rows), total rows: 160000
Processed batch 17 (10000 rows), total rows: 170000
Processed batch 18 (10000 rows), total rows: 180000
Processed batch 19 (10000 rows), total rows: 190000
Proc

In [7]:
# ------------------------------------ GADM_0 Emissions ------------------------------------

import duckdb
import os
from dotenv import load_dotenv
from urllib.parse import quote_plus

load_dotenv()

# Fail fast with a readable message; otherwise quote_plus(None) raises an opaque TypeError
required_env_vars = (
    "CLIMATETRACE_USER",
    "CLIMATETRACE_PASS",
    "CLIMATETRACE_HOST",
    "CLIMATETRACE_PORT",
    "CLIMATETRACE_DB",
)
missing_env_vars = [name for name in required_env_vars if os.getenv(name) is None]
if missing_env_vars:
    raise RuntimeError(f"Missing required environment variables: {', '.join(missing_env_vars)}")

# Build the PostgreSQL connection URL consumed by DuckDB's postgres_scan.
# User and password are percent-encoded so special characters survive inside the URL.
user = quote_plus(os.getenv("CLIMATETRACE_USER"))
password = quote_plus(os.getenv("CLIMATETRACE_PASS"))
host = os.getenv("CLIMATETRACE_HOST")
port = os.getenv("CLIMATETRACE_PORT")
database = os.getenv("CLIMATETRACE_DB")

postgres_url = f"postgresql://{user}:{password}@{host}:{port}/{database}"
parquet_path = "data/emissions_reduction/gadm_0_emissions.parquet"

# Calendar year of gadm_0_emissions.start_time to export
emissions_year = 2024

# Use DuckDB to write directly from PostgreSQL to Parquet
con = duckdb.connect()
try:
    # Aggregates co2e_100yr emissions for emissions_year per admin-level-0 GADM
    # boundary and subsector, excluding the listed forest/land/water subsectors,
    # and writes the result to parquet_path.
    print('Running query')
    con.execute(f'''
    INSTALL postgres;
    LOAD postgres;

    CREATE TABLE gadm_0_emissions_parquet AS
    select extract(year from g0e.start_time) as year 
        , g0e.gadm_id
        , gb.gid
        , gb.admin_level
        , g0e.iso3_country
        , ca.name as country_name
        , gb.name gadm_0_name
        , gb.corrected_name gadm_0_corrected_name
        , ca.continent
        , ca.eu
        , ca.oecd
        , ca.unfccc_annex
        , ca.developed_un
        , ca.em_finance
        , asch.sector
        , g0e.original_inventory_sector subsector
        , g0e.gas
        , sum(asset_activity) asset_activity
        , sum(asset_emissions) asset_emissions
        , sum(remainder_activity) remainder_activity
        , sum(remainder_emissions) remainder_emissions
        , sum(asset_emissions) + sum(remainder_emissions) as emissions_quantity

    from postgres_scan('{postgres_url}', 'public', 'gadm_0_emissions') g0e
    inner join (
        select distinct gadm_id
            , gid
            , name
            , corrected_name
            , admin_level
        from postgres_scan('{postgres_url}','public', 'gadm_boundaries') 
        where admin_level = 0
    ) as gb
        on g0e.gadm_id = gb.gadm_id
    left join (
        select distinct sector
            , subsector
        from postgres_scan('{postgres_url}','public', 'asset_schema') 
    ) asch
        on cast(asch.subsector as varchar) = cast(g0e.original_inventory_sector as varchar)
    left join postgres_scan('{postgres_url}','public', 'country_analysis') ca
		on cast(ca.iso3_country as varchar) = cast(g0e.iso3_country as varchar)

    where g0e.gas = 'co2e_100yr'
        and extract(year from g0e.start_time) = {emissions_year}
        and g0e.original_inventory_sector not in ('forest-land-clearing',
                                                'forest-land-degradation',
                                                'forest-land-fires',
                                                'net-forest-land',
                                                'net-shrubgrass',
                                                'net-wetland',
                                                'removals',
                                                'shrubgrass-fires',
                                                'water-reservoirs',
                                                'wetland-fires')

    group by extract(year from g0e.start_time) 
        , g0e.gadm_id
        , gb.gid
        , gb.admin_level
        , g0e.iso3_country
        , ca.name
        , gb.name 
        , gb.corrected_name
        , ca.continent
        , ca.eu
        , ca.oecd
        , ca.unfccc_annex
        , ca.developed_un
        , ca.em_finance
        , asch.sector
        , g0e.original_inventory_sector
        , g0e.gas;

    COPY gadm_0_emissions_parquet TO '{parquet_path}' (FORMAT PARQUET);
''')
finally:
    # Release the in-memory database even when the query fails
    con.close()

print('Complete')

Running query
Complete


In [8]:
# ------------------------------------ City Emissions ------------------------------------


import duckdb
import os
from dotenv import load_dotenv
from urllib.parse import quote_plus

load_dotenv()

# Fail fast with a readable message; otherwise quote_plus(None) raises an opaque TypeError
required_env_vars = (
    "CLIMATETRACE_USER",
    "CLIMATETRACE_PASS",
    "CLIMATETRACE_HOST",
    "CLIMATETRACE_PORT",
    "CLIMATETRACE_DB",
)
missing_env_vars = [name for name in required_env_vars if os.getenv(name) is None]
if missing_env_vars:
    raise RuntimeError(f"Missing required environment variables: {', '.join(missing_env_vars)}")

# Build the PostgreSQL connection URL consumed by DuckDB's postgres_scan.
# User and password are percent-encoded so special characters survive inside the URL.
user = quote_plus(os.getenv("CLIMATETRACE_USER"))
password = quote_plus(os.getenv("CLIMATETRACE_PASS"))
host = os.getenv("CLIMATETRACE_HOST")
port = os.getenv("CLIMATETRACE_PORT")
database = os.getenv("CLIMATETRACE_DB")

postgres_url = f"postgresql://{user}:{password}@{host}:{port}/{database}"
parquet_path = "data/emissions_reduction/city_emissions.parquet"

# Calendar year of city_emissions.start_time to export
emissions_year = 2024

# Use DuckDB to write directly from PostgreSQL to Parquet
con = duckdb.connect()
try:
    # Aggregates co2e_100yr emissions for emissions_year per city and subsector,
    # joined with 'ghs-fua' city boundaries plus sector and country attributes,
    # excluding the listed forest/land/water subsectors, and writes to parquet_path.
    print('Running query...')
    con.execute( f'''
	INSTALL postgres;
	LOAD postgres;

	CREATE TABLE city_emissions_parquet AS
    
	select extract(year from ce.start_time) as year
		, ce.city_id
		, cb.name as city_name
		, cb.corrected_name as corrected_name
		, ce.iso3_country
		, ca.name as country_name
        , ca.continent
        , ca.eu
        , ca.oecd
        , ca.unfccc_annex
        , ca.developed_un
        , ca.em_finance
		, asch.sector
		, ce.original_inventory_sector as subsector
		, sum(asset_activity) asset_activity
		, sum(asset_emissions) asset_emissions
		, sum(remainder_activity) remainder_activity
		, sum(remainder_emissions) remainder_emissions
		, sum(asset_emissions) + sum(remainder_emissions) as emissions_quantity

	from postgres_scan('{postgres_url}','public', 'city_emissions') ce
	left join postgres_scan('{postgres_url}','public', 'city_boundaries') cb
		on cb.city_id = ce.city_id
        and cb.reporting_entity = 'ghs-fua'
	left join (
		select distinct sector, subsector
		from postgres_scan('{postgres_url}','public', 'asset_schema')
	) asch
		on cast(asch.subsector as varchar) = cast(ce.original_inventory_sector as varchar)
	left join postgres_scan('{postgres_url}','public', 'country_analysis') ca
		on cast(ca.iso3_country as varchar) = cast(ce.iso3_country as varchar)

	where extract(year from ce.start_time) = {emissions_year}
		and ce.gas = 'co2e_100yr'
		and ce.original_inventory_sector not in ('forest-land-clearing',
														'forest-land-degradation',
														'forest-land-fires',
														'net-forest-land',
														'net-shrubgrass',
														'net-wetland',
														'removals',
														'shrubgrass-fires',
														'water-reservoirs',
														'wetland-fires')

	group by extract(year from ce.start_time) 
		, ce.city_id
		, cb.name 
		, cb.corrected_name 
		, ce.iso3_country
		, ca.name 
        , ca.continent
        , ca.eu
        , ca.oecd
        , ca.unfccc_annex
        , ca.developed_un
        , ca.em_finance
		, asch.sector
		, ce.original_inventory_sector;
            
    COPY city_emissions_parquet TO '{parquet_path}' (FORMAT PARQUET);
            
    ''')
finally:
    # Release the in-memory database even when the query fails
    con.close()

print('Complete')

Running query...
Complete


In [9]:
# # ------------------------------------ GADM 2 Emissions ------------------------------------

# import duckdb
# import os
# from dotenv import load_dotenv
# from urllib.parse import quote_plus

# load_dotenv()

# # Build SQLAlchemy engine for PostgreSQL
# user = quote_plus(os.getenv("CLIMATETRACE_USER"))
# password = quote_plus(os.getenv("CLIMATETRACE_PASS"))
# host = os.getenv("CLIMATETRACE_HOST")
# port = os.getenv("CLIMATETRACE_PORT")
# database = os.getenv("CLIMATETRACE_DB")

# postgres_url = f"postgresql://{user}:{password}@{host}:{port}/{database}"
# parquet_path = "data/emissions_reduction/gadm_2_emissions.parquet"

# # Use DuckDB to write directly from PostgreSQL to Parquet
# con = duckdb.connect()


# print('Running query')
# con.execute(f'''
#     INSTALL postgres;
#     LOAD postgres;

#     CREATE TABLE gadm_2_emissions_parquet AS
#     select extract(year from ge.start_time) as year 
#         , gb1.gadm_id gadm_1_id
#         , gb1.name gadm_1_name
#         , gb1.corrected_name gadm_1_corrected_name
#         , ge.gadm_id gadm_2_id
#         , gb2.name gadm_2_name
#         , gb2.corrected_name gadm_2_corrected_name
#         , gb2.admin_level
#         , ge.iso3_country
#         , ca.name as country_name
#         , ca.continent
#         , ca.eu
#         , ca.oecd
#         , ca.unfccc_annex
#         , ca.developed_un
#         , ca.em_finance
#         , asch.sector
#         , ge.original_inventory_sector subsector
#         , ge.gas
#         , sum(asset_activity) asset_activity
#         , sum(asset_emissions) asset_emissions
#         , sum(remainder_activity) remainder_activity
#         , sum(remainder_emissions) remainder_emissions
#         , sum(asset_emissions) + sum(remainder_emissions) as emissions_quantity

#     from postgres_scan('{postgres_url}','public', 'gadm_emissions') ge
#     inner join (
#         select distinct gadm_id
#             , immediate_parent
#             , name
#             , corrected_name
#             , admin_level
#         from postgres_scan('{postgres_url}','public', 'gadm_boundaries')
#         where admin_level = 2
#     ) as gb2
#         on ge.gadm_id = gb2.gadm_id
#     left join (
#         select distinct sector
#             , subsector
#         from postgres_scan('{postgres_url}','public', 'asset_schema')
#     ) asch
#         on cast(asch.subsector as varchar) = cast(ge.original_inventory_sector as varchar)
#     left join (
#         select gadm_id
#             , name
#             , corrected_name
#         from postgres_scan('{postgres_url}','public', 'gadm_boundaries')
#         where admin_level = 1
#     ) gb1
#         on gb1.gadm_id = gb2.immediate_parent
#     left join postgres_scan('{postgres_url}','public', 'country_analysis') ca
#         on cast(ca.iso3_country as varchar) = cast(ge.iso3_country as varchar)

#     where ge.gas = 'co2e_100yr'
#         and extract(year from start_time) = 2024
#         and ge.original_inventory_sector not in ('forest-land-clearing',
#                                                 'forest-land-degradation',
#                                                 'forest-land-fires',
#                                                 'net-forest-land',
#                                                 'net-shrubgrass',
#                                                 'net-wetland',
#                                                 'removals',
#                                                 'shrubgrass-fires',
#                                                 'water-reservoirs',
#                                                 'wetland-fires')

#     group by extract(year from ge.start_time)
#         , gb1.gadm_id 
#         , gb1.name
#         , gb1.corrected_name
#         , ge.gadm_id 
#         , gb2.name
#         , gb2.corrected_name
#         , gb2.admin_level
#         , ge.iso3_country
#         , ca.name
#         , ca.continent
#         , ca.eu
#         , ca.oecd
#         , ca.unfccc_annex
#         , ca.developed_un
#         , ca.em_finance
#         , asch.sector
#         , ge.original_inventory_sector
#         , ge.gas;

#     COPY gadm_2_emissions_parquet TO '{parquet_path}' (FORMAT PARQUET);
# ''')
# con.close()

# print('Complete')

In [10]:
# # --------------------------------------------------------- CITY BATCH -----------------------------------------------------------------

# import psycopg2
# from urllib.parse import quote_plus
# import pyarrow as pa
# import pyarrow.parquet as pq
# import os

# user = quote_plus(os.getenv("CLIMATETRACE_USER"))
# password = quote_plus(os.getenv("CLIMATETRACE_PASS"))
# host = os.getenv("CLIMATETRACE_HOST")
# port = os.getenv("CLIMATETRACE_PORT")
# database = os.getenv("CLIMATETRACE_DB")

# conn = psycopg2.connect(
#     dbname=database,
#     user=user,
#     password=password,
#     host=host,
#     port=port
# )

# cur = conn.cursor(name='parquet_cursor')  # server-side cursor

# cur.execute("""
#     SELECT extract(year from start_time) AS year,
#            ce.city_id,
#            cb.name AS city_name,
#            cb.corrected_name AS corrected_name,
#            ce.iso3_country,
#            ca.name AS country_name,
#            ca.continent,
#            ca.eu,
#            ca.oecd,
#            ca.unfccc_annex,
#            ca.developed_un,
#            ca.em_finance,
#            asch.sector,
#            ce.original_inventory_sector AS subsector,
#            SUM(asset_activity) AS asset_activity,
#            SUM(asset_emissions) AS asset_emissions,
#            SUM(remainder_activity) AS remainder_activity,
#            SUM(remainder_emissions) AS remainder_emissions,
#            SUM(asset_emissions) + SUM(remainder_emissions) AS emissions_quantity
#     FROM city_emissions ce
#     LEFT JOIN city_boundaries cb ON cb.city_id = ce.city_id
#     LEFT JOIN (
#         SELECT DISTINCT sector, subsector FROM asset_schema
#     ) asch ON CAST(asch.subsector AS varchar) = CAST(ce.original_inventory_sector AS varchar)
#     LEFT JOIN country_analysis ca ON CAST(ca.iso3_country AS varchar) = CAST(ce.iso3_country AS varchar)
#     WHERE extract(year FROM ce.start_time) = 2024
#       AND ce.gas = 'co2e_100yr'
#       AND ce.original_inventory_sector NOT IN (
#           'forest-land-clearing', 'forest-land-degradation', 'forest-land-fires',
#           'net-forest-land', 'net-shrubgrass', 'net-wetland', 'removals',
#           'shrubgrass-fires', 'water-reservoirs', 'wetland-fires'
#       )
#     GROUP BY extract(year FROM start_time),
#              ce.city_id, cb.name, cb.corrected_name,
#              ce.iso3_country, ca.name, ca.continent, ca.eu, ca.oecd,
#              ca.unfccc_annex, ca.developed_un, ca.em_finance,
#              asch.sector, ce.original_inventory_sector
# """)

# # Set up Parquet writer
# batch_size = 10000
# output_file = "data/emissions_reduction/city_emissions.parquet"
# batch_count = 0
# total_rows = 0

# print("executing city query...")

# # Fetch first batch
# rows = cur.fetchmany(batch_size)
# if not rows:
#     raise Exception("No data returned from query.")

# field_names = [desc[0] for desc in cur.description]
# first_table = pa.Table.from_pylist([dict(zip(field_names, row)) for row in rows])
# writer = pq.ParquetWriter(output_file, first_table.schema)
# writer.write_table(first_table)
# batch_count += 1
# total_rows += len(rows)
# print(f"Processed batch {batch_count} ({len(rows)} rows), total rows: {total_rows}")

# # Process remaining batches
# while True:
#     rows = cur.fetchmany(batch_size)
#     if not rows:
#         break

#     table = pa.Table.from_pylist([dict(zip(field_names, row)) for row in rows])
#     table = table.cast(writer.schema)  # ensure schema matches first batch
#     writer.write_table(table)

#     batch_count += 1
#     total_rows += len(rows)
#     print(f"Processed batch {batch_count} ({len(rows)} rows), total rows: {total_rows}")

# writer.close()
# cur.close()
# conn.close()
# print("Export complete.")


In [11]:
# import duckdb

# con = duckdb.connect()
# asset_annual_path = 'data/asset_annual_emissions/chunk_*.parquet'
# output_path = 'data/fixed_aluminum_moer.parquet'

# con.execute(f"""
#                     copy(
#                         select year
#                             , asset_id
#                             , asset_type
#                             , asset_name
#                             , iso3_country
#                             , country_name
#                             , continent
#                             , eu
#                             , oecd
#                             , unfccc_annex
#                             , developed_un
#                             , em_finance
#                             , sector
#                             , subsector
#                             , gadm_1
#                             , gadm_2
#                             , ghs_fua
#                             , city_id
#                             , other1
#                             , other2
#                             , other3
#                             , other4
#                             , other5
#                             , other6
#                             , other7
#                             , other8
#                             , other9
#                             , other10
#                             , activity_units
#                             , capacity
#                             , activity
#                             , average_emissions_factor
#                             , emissions_quantity
#                             , case when subsector = 'aluminum' then null else ae.ef_moer end as ef_moer
#                             , case when subsector = 'aluminum' then null else ae.eq_12 end as eq_12
#                             , case when subsector = 'aluminum' then null else ae.ef_12 end as ef_12
#                             , case when subsector = 'aluminum' then null else ae.eq_12_moer end as eq_12_moer
#                             , case when ae.subsector = 'aluminum' then null else ef_12_moer end as ef_12_moer
#                             , asset_type_2
                        
#                         from '{asset_annual_path}' ae
#                     ) to '{output_path}' (format 'parquet');
#                 """)



In [None]:
# import pandas as pd

# # Replace with your CSV file path
# csv_file = "/Users/anthonyrusso/Dev/emissions-reduction-pathways-dashboard/data/static/ct_percentile_40sectors_moer_stat_industrial_20250824.csv"
# parquet_file = "/Users/anthonyrusso/Dev/emissions-reduction-pathways-dashboard/data/static/ct_percentile_40sectors_moer_stat_industrial_20250824.parquet"

# # Read CSV
# df = pd.read_csv(csv_file)

# # Save as Parquet
# df.to_parquet(parquet_file, index=False)

# print(f"Converted {csv_file} to {parquet_file}")

Converted /Users/anthonyrusso/Dev/emissions-reduction-pathways-dashboard/data/static/ct_percentile_40sectors_moer_stat_industrial_20250824.csv to /Users/anthonyrusso/Dev/emissions-reduction-pathways-dashboard/data/static/ct_percentile_40sectors_moer_stat_industrial_20250824.parquet


In [None]:
import duckdb
import pandas as pd
from config import CONFIG
import os

# Build a per-sector summary from the annual asset parquet files and save it as a CSV.
#
# The query has three CTEs and a final projection:
#   * sector_mapping   - distinct (sector, subsector) pairs found in the asset files.
#   * induced          - for each of the three induced_sector_N columns, looks up the
#                        sector of the induced subsector, de-duplicates rows per asset,
#                        and sums the induced emissions per sector.
#   * asset_reductions - sums emissions_quantity per asset first, then rolls
#                        emissions_quantity and emissions_reduced_at_asset up to sector.
#   * final SELECT     - full outer join of the two, plus two derived columns:
#                        net_induced_emissions = max(induced - reduced, 0)
#                        emissions_reduction_potential = max(reduced - induced, 0)

# Path (or glob) to the annual asset parquet files, taken from the project config
annual_asset_path = CONFIG['annual_asset_path']

query = f'''
                WITH sector_mapping as (
                    SELECT distinct sector
                        , subsector

                    FROM '{annual_asset_path}'
                ),
                
                induced as (
                    SELECT sector
                        , sum(induced_emissions) as induced_emissions

                    FROM (
                            SELECT
                                sector,
                                sum(induced_emissions) AS induced_emissions
                            FROM (
                                SELECT distinct asset_id
                                    , sector_mapping.sector
                                    , induced_sector_1 AS induced_subsector
                                    , induced_sector_1_induced_emissions AS induced_emissions
                                FROM '{annual_asset_path}' aap
                                LEFT JOIN sector_mapping
                                    on sector_mapping.subsector = aap.induced_sector_1
                                WHERE induced_sector_1 IS NOT NULL
                            )  

                            group by sector
                            
                            UNION ALL
                                
                            SELECT
                                sector,
                                sum(induced_emissions) AS induced_emissions
                            FROM (
                                SELECT distinct asset_id
                                    , sector_mapping.sector
                                    , induced_sector_2 AS induced_subsector
                                    , induced_sector_2_induced_emissions AS induced_emissions
                                FROM '{annual_asset_path}' aap
                                LEFT JOIN sector_mapping
                                    on sector_mapping.subsector = aap.induced_sector_2
                                WHERE induced_sector_2 IS NOT NULL
                            )  

                            group by sector
                                
                            UNION ALL
                                
                            SELECT
                                sector,
                                sum(induced_emissions) AS induced_emissions
                            FROM (
                                SELECT distinct asset_id
                                    , sector_mapping.sector
                                    , induced_sector_3 AS induced_subsector
                                    , induced_sector_3_induced_emissions AS induced_emissions
                                FROM '{annual_asset_path}' aap
                                LEFT JOIN sector_mapping
                                    on sector_mapping.subsector = aap.induced_sector_3
                                WHERE induced_sector_3 IS NOT NULL
                            )  

                            group by sector
                        )

                    GROUP BY sector
                ),

                asset_reductions as (
                    SELECT sector
                        , sum(emissions_quantity) emissions_quantity
                        , sum(emissions_reduced_at_asset) emissions_reduced_at_asset
                
                    FROM (
                        SELECT asset_id
                            , sector
                            , subsector
                            , sum(emissions_quantity) emissions_quantity
                            , emissions_reduced_at_asset

                        FROM '{annual_asset_path}'

                        GROUP BY asset_id
                            , sector
                            , subsector
                            , emissions_reduced_at_asset
                    ) asset

                    GROUP BY sector
                )

                SELECT 
                    COALESCE(ar.sector, induced.sector) AS sector,
                    ar.emissions_quantity,
                    induced.induced_emissions,
                    ar.emissions_reduced_at_asset,
                    
                    CASE 
                        WHEN COALESCE(induced.induced_emissions, 0) > COALESCE(ar.emissions_reduced_at_asset, 0)
                        THEN COALESCE(induced.induced_emissions, 0) - COALESCE(ar.emissions_reduced_at_asset, 0)
                        ELSE 0 
                    END AS net_induced_emissions,
                    
                    CASE 
                        WHEN COALESCE(induced.induced_emissions, 0) < COALESCE(ar.emissions_reduced_at_asset, 0)
                        THEN COALESCE(ar.emissions_reduced_at_asset, 0) - COALESCE(induced.induced_emissions, 0)
                        ELSE 0 
                    END AS emissions_reduction_potential

                FROM asset_reductions ar
                FULL OUTER JOIN induced
                    on induced.sector = ar.sector
            '''

# Run the query on an in-memory DuckDB connection and always release it,
# even if the query fails.
con = duckdb.connect()
try:
    df = con.execute(query).df()
finally:
    con.close()

# Write next to the notebook's other outputs using a repo-relative path
# (the notebook's other cells use "data/..." paths) instead of a
# machine-specific absolute path.
output_path = os.path.join("data", "sector_emissions_reduction_summary_2024.csv")
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Write to CSV
df.to_csv(output_path, index=False)

print(f"Saved CSV to {output_path}")


Saved CSV to /Users/anthonyrusso/Dev/emissions-reduction-pathways-dashboard/data/sector_emissions_reduction_summary_2024.csv


In [None]:
import duckdb
import pandas as pd
from config import CONFIG
import os

# Export, per (asset_id, strategy_id), the summed emissions_quantity together with the
# reduction and induced-emissions columns, restricted to rows where any of the three
# induced_sector_N columns equals 'electricity-generation'.

# Path (or glob) to the annual asset parquet files, taken from the project config
annual_asset_path = CONFIG['annual_asset_path']

# NOTE: WHERE must come before GROUP BY in SQL, and the third induced-emissions
# column is spelled `induced_sector_3_induced_emissions` (same pattern as _1/_2).
query = f'''
        
        select asset_id
                , strategy_id
                , sum(emissions_quantity) emissions_quantity
                , total_emissions_reduced_per_year
                , emissions_reduced_at_asset
                , induced_sector_1_induced_emissions
                , induced_sector_2_induced_emissions
                , induced_sector_3_induced_emissions

        from '{annual_asset_path}'

        where induced_sector_1 = 'electricity-generation'
                or induced_sector_2 = 'electricity-generation'
                or induced_sector_3 = 'electricity-generation'

        group by asset_id
                , strategy_id
                , total_emissions_reduced_per_year
                , emissions_reduced_at_asset
                , induced_sector_1_induced_emissions
                , induced_sector_2_induced_emissions
                , induced_sector_3_induced_emissions

        '''

# Run the query on an in-memory DuckDB connection and always release it,
# even if the query fails.
con = duckdb.connect()
try:
    df = con.execute(query).df()
finally:
    con.close()

# Write using a repo-relative path (the notebook's other cells use "data/..."
# paths) instead of a machine-specific absolute path.
output_path = os.path.join("data", "induced_electricity_2024.csv")
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Write to CSV
df.to_csv(output_path, index=False)

print(f"Saved CSV to {output_path}")
