<center><h1>Deep Learning Pipeline</h1></center>

In [1]:
# Data Manipulation
import pandas as pd
from datetime import timedelta

pd.set_option('display.max_columns', None)

# System Settings
import warnings
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
sys.path.append(os.path.abspath('../atmoseer'))
sys.path.append(os.path.abspath('../utils'))
warnings.filterwarnings("ignore")

# Custom Database Operations
from utils.postgres_processor import load_table
from utils.ppm_lookup import NOAALookup

# Deep Learning Operations
import torch
from atmoseer.atmoseer_core import BayesianTuner
from atmoseer.preprocessors.atmoseer_preprocessor import AtmoSeerPreprocessor
from atmoseer.configs.atmoseer_config import BayesianTunerConfig
from atmoseer.evaluation.atmoseer_eval import AtmoSeerEvaluator

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


<center><h1>Carbon Dioxide (CO<sub>2</sub>)</h1></center>

In [2]:
# Read the database secret from the environment instead of committing it to
# the notebook. Export POSTGRES_PASSWORD before starting the kernel; a missing
# variable raises KeyError here rather than failing later inside the loader.
# NOTE(review): the third load_table argument looks like the DB password — confirm.
# The previously committed value should be treated as leaked and rotated.
co2_df = load_table("CO2DataNOAA", "postgres", os.environ["POSTGRES_PASSWORD"])
print(co2_df.shape)

# Preview the first rows with two-decimal formatting, blue cell borders and a
# coolwarm gradient over the numeric columns
co2_df.head().style.format(precision=2).set_properties(**{'border': '1.5px solid blue'}).background_gradient(cmap='coolwarm')

(201850, 17)


Unnamed: 0,date,site,ppm,latitude,longitude,altitude,year,month,day,season,co2_change_rate,month_sin,month_cos,ppm_lag_14,ppm_lag_30,ppm_lag_365,biomass_density
0,1968-01-16,NWR,324.38,40.05,-105.63,3526.0,1968,1,16,Winter,1.5,0.5,0.87,324.38,324.38,324.38,328352893.44
1,1968-01-17,NWR,323.69,40.05,-105.63,3526.0,1968,1,17,Winter,-0.16,0.5,0.87,324.38,324.38,324.38,328352893.44
2,1968-02-29,NWR,325.53,40.05,-105.63,3526.0,1968,2,29,Winter,1.84,0.87,0.5,324.38,324.38,324.38,736124172.48
3,1968-03-07,NWR,326.49,40.05,-105.63,3526.0,1968,3,7,Spring,0.96,1.0,0.0,324.38,324.38,324.38,1143895451.52
4,1968-03-14,NWR,326.09,40.05,-105.63,3526.0,1968,3,14,Spring,-0.4,1.0,0.0,324.38,324.38,324.38,1143895451.52


In [11]:
# Wrap the CO2 table in the NOAA lookup helper used by the cells below
lookup = NOAALookup(co2_df)

# Overall span of dates reported by the lookup helper
print(f"Data available from {lookup.earliest_date} to {lookup.latest_date}")

# One entry per measurement site: name, coordinates, altitude and date coverage
sites = lookup.get_available_sites()
print("\nAvailable measurement sites:")
for site_info in sites:
    print(
        f"Site: {site_info['site']}",
        f"Location: {site_info['latitude']}, {site_info['longitude']}, altitude: {site_info['altitude']}m",
        f"Data range: {site_info['date_range']}\n",
        sep="\n",
    )

Data available from 1968-01-16 to 2024-05-31

Available measurement sites:
Site: ABP
Location: -12.76, -38.16, altitude: 6.0m
Data range: 2006-10-27 to 2010-01-13

Site: ALT
Location: 82.4508, -62.5072, altitude: 190.0m
Data range: 1985-06-17 to 2023-12-27

Site: AMS
Location: -37.95, 77.53, altitude: 153.0m
Data range: 1982-03-07 to 1990-11-12

Site: AMT
Location: 45.0345, -68.6821, altitude: 160.4m
Data range: 2003-09-19 to 2024-05-31

Site: AMY
Location: 36.5389, 126.3295, altitude: 87.0m
Data range: 2013-12-03 to 2023-11-30

Site: ASC
Location: -7.9667, -14.4, altitude: 87.0m
Data range: 1979-08-27 to 2023-12-31

Site: ASK
Location: 23.2625, 5.6322, altitude: 2715.0m
Data range: 1995-09-12 to 2023-12-31

Site: AVI
Location: 17.75, -64.75, altitude: 5.0m
Data range: 1979-03-03 to 1990-08-29

Site: AZR
Location: 38.75, -27.08, altitude: 22.0m
Data range: 1979-12-31 to 2022-03-21

Site: BAL
Location: 55.5, 16.67, altitude: 28.0m
Data range: 1992-09-03 to 2011-06-22

Site: BAO
Location

In [None]:
# Fetch the measurements for a single date. Per the recorded output, the
# helper reports when it substitutes the nearest available date.
date_records = lookup.lookup_date('1968-02-01')
print("\nMeasurements for 1968-02-01 (or nearest date):")
for measurement in date_records:
    print(
        f"Site: {measurement.site}",
        f"PPM: {measurement.ppm}",
        f"Location: {measurement.latitude}, {measurement.longitude}\n",
        sep="\n",
    )

Note: No data for 1968-02-01. Using nearest available date: 1968-01-17

Measurements for 1968-02-01 (or nearest date):
Site: NWR
PPM: 323.69
Location: 40.05, -105.63



In [None]:
# Same single-date lookup, but requesting the records with as_dict=True
dict_records = lookup.lookup_date('1968-02-01', as_dict=True)
print("\nData as dictionary:")
for record_dict in dict_records:
    print(record_dict)

Note: No data for 1968-02-01. Using nearest available date: 1968-01-17

Data as dictionary:
{'date': datetime.date(1968, 1, 17), 'site': 'NWR', 'ppm': 323.69, 'latitude': 40.05, 'longitude': -105.63, 'altitude': 3526.0}


In [None]:
# Query the measurements recorded between the two dates
range_start, range_end = '1968-02-01', '1969-05-31'
range_records = lookup.lookup_range(range_start, range_end)

# Build the heading from the same variables passed to the lookup so the label
# cannot drift from the query (the old literal also carried stray quotes)
print(f"\nMeasurements for {range_start} - {range_end}:")

# The result is iterated as a mapping of date -> list of records for that date
for date, records in range_records.items():
    print(f"\nDate: {date}")
    for record in records:
        print(f"Site: {record.site}, PPM: {record.ppm}")


Measurements for 1968-02-01' - '1969-05-31:

Date: 1968-02-29
Site: NWR, PPM: 325.53

Date: 1968-03-07
Site: NWR, PPM: 326.49

Date: 1968-03-14
Site: NWR, PPM: 326.09

Date: 1968-03-22
Site: NWR, PPM: 325.98

Date: 1968-03-28
Site: NWR, PPM: 326.26

Date: 1968-04-11
Site: NWR, PPM: 326.75

Date: 1968-04-30
Site: NWR, PPM: 326.65

Date: 1968-05-14
Site: NWR, PPM: 326.16

Date: 1968-05-21
Site: NWR, PPM: 327.41

Date: 1968-05-28
Site: NWR, PPM: 326.47

Date: 1968-06-06
Site: NWR, PPM: 323.94

Date: 1968-06-13
Site: NWR, PPM: 325.25

Date: 1968-07-03
Site: NWR, PPM: 320.33

Date: 1968-08-07
Site: NWR, PPM: 318.2

Date: 1968-10-03
Site: NWR, PPM: 320.09

Date: 1968-11-07
Site: NWR, PPM: 322.49

Date: 1968-11-26
Site: NWR, PPM: 323.76

Date: 1968-11-29
Site: STC, PPM: 324.32

Date: 1968-12-10
Site: NWR, PPM: 324.6

Date: 1968-12-17
Site: NWR, PPM: 324.48

Date: 1968-12-21
Site: STC, PPM: 324.88

Date: 1968-12-23
Site: NWR, PPM: 323.7

Date: 1969-01-01
Site: NWR, PPM: 324.6

Date: 1969-01-0

## Preprocessing

In [3]:
# Turn the raw CO2 table into data loaders; the tuner below consumes the
# 'train_loader' and 'val_loader' entries
preprocessor = AtmoSeerPreprocessor()
co2_dataloaders = preprocessor.prepare_data(co2_df)

print("Initializing tuner")

# Tuner configuration for the CO2 gas type
tuner_config = BayesianTunerConfig(gas_type='co2')

# Pick out the two loaders the tuner needs before constructing it
co2_train_loader = co2_dataloaders['train_loader']
co2_val_loader = co2_dataloaders['val_loader']
tuner = BayesianTuner(
    train_loader=co2_train_loader,
    val_loader=co2_val_loader,
    config=tuner_config,
)

Missing value counts:
biomass_density    6090
dtype: int64
Initializing tuner


## Train & Tune

In [4]:
# Run the tuning search; optimize() hands back the best parameter set and its loss
print(f"Starting optimization process ({tuner_config.n_trials} trials)...")
best_params, best_loss = tuner.optimize()

# Summarize the winning trial
print(
    "\nOptimization completed!",
    f"Best validation loss: {best_loss:.6f}",
    "\nBest parameters:",
    sep="\n",
)
for param_name, param_value in best_params.items():
    print(f"{param_name}: {param_value}")

Starting optimization process (25 trials)...

Trial 1/25
Epoch: 0 
New best validation loss: 0.098067 for current trial
Epoch: 1 
New best validation loss: 0.092317 for current trial
Epoch: 3 
New best validation loss: 0.091035 for current trial
Epoch: 4 
New best validation loss: 0.079872 for current trial
Epoch: 5 
New best validation loss: 0.079418 for current trial
Epoch: 6 
New best validation loss: 0.076297 for current trial
Epoch: 7 
New best validation loss: 0.075524 for current trial
Epoch: 9 
New best validation loss: 0.070546 for current trial
Epoch: 16 
New best validation loss: 0.065076 for current trial
Early stopping triggered at epoch 28. Best epoch was 16 with validation loss 0.065076

Trial 2/25
Epoch: 0 
New best validation loss: 0.103733 for current trial
Epoch: 2 
New best validation loss: 0.103329 for current trial
Early stopping triggered at epoch 14. Best epoch was 2 with validation loss 0.103329

Trial 3/25
Epoch: 0 
New best validation loss: 0.108196 for curre

KeyboardInterrupt: 

## Forecasting

This method implements an iterative forecasting process where each prediction becomes part of the input for the next 
prediction. It uses Monte Carlo sampling with added Gaussian noise to estimate prediction uncertainty, which naturally 
grows over time as predictions are chained together. Essentially, the Monte Carlo sampling accounts for the inherent randomness in time series data: real-life values often fluctuate within a certain range rather than following an exact y=x relationship, even if the overall trend is linear. Instead of making a prediction based on one single point, this 
forecast method will create a normal distribution around a specific prediction point using 100 normally distributed values, 
where the mean is the prediction point and the standard deviation is the noise_scale. This will create a range of possible 
values that the prediction could be, which will be used to create the uncertainty bounds. The further out into the future 
that the predictions go, the wider the uncertainty bounds become. The Bayesian Tuner will go through many trials to find 
the optimal sequence length (lookback window in days) and then this forecast method will take that sequence length and use 
it to generate predictions. For dates that are past this sequence length, the predicted values will be entirely based on 
other predicted values (not observed data points), which makes the uncertainty grow progressively larger.

In [None]:
# Load the best CO2 model via BayesianTuner.load_best_model onto the selected device.
# NOTE(review): presumably this reads a model saved by an earlier tuning run — confirm
# it exists before running this cell on a fresh checkout.
atmoseer_co2 = BayesianTuner.load_best_model(gas_type='co2', device=device)

## Evaluation

In [None]:
# Load and preprocess data
# Load and preprocess data.
# The database secret is read from the environment instead of being committed
# to the notebook; export POSTGRES_PASSWORD before starting the kernel (a
# missing variable raises KeyError).
# NOTE(review): the third load_table argument looks like the DB password — confirm.
co2_df = load_table("CO2DataNOAA", "postgres", os.environ["POSTGRES_PASSWORD"])
preprocessor = AtmoSeerPreprocessor()
co2_data_loaders = preprocessor.prepare_data(co2_df)

# Load the best model onto the selected device (GPU when available, else CPU)
atmoseer_co2 = BayesianTuner.load_best_model(gas_type='co2', device=device)

# Create the evaluator on the same device as the model
evaluator = AtmoSeerEvaluator(atmoseer_co2, co2_data_loaders, device=device)

# Get metrics: iterated as a mapping of dataset name -> {metric name: value}
metrics = evaluator.evaluate()
print("\nMetrics:")
for dataset, dataset_metrics in metrics.items():
    print(f"\n{dataset.title()} Set Metrics:")
    for metric, value in dataset_metrics.items():
        print(f"{metric.upper()}: {value:.4f}")

# Create visualization over a daily date range spanning the whole table
dates = pd.date_range(co2_df['date'].min(), co2_df['date'].max())
fig = evaluator.plot_results(gas_type='co2', dates=dates)
fig.show()

<center><h1>Methane (CH<sub>4</sub>)</h1></center>

In [None]:
# Read the database secret from the environment instead of committing it to
# the notebook. Export POSTGRES_PASSWORD before starting the kernel; a missing
# variable raises KeyError here rather than failing later inside the loader.
# NOTE(review): the third load_table argument looks like the DB password — confirm.
ch4_df = load_table("CH4DataNOAA", "postgres", os.environ["POSTGRES_PASSWORD"])
print(ch4_df.shape)

# Preview the first rows with two-decimal formatting, blue cell borders and a
# coolwarm gradient over the numeric columns
ch4_df.head().style.format(precision=2).set_properties(**{'border': '1.5px solid blue'}).background_gradient(cmap='coolwarm')

## Preprocessing

In [None]:
# Turn the raw CH4 table into data loaders; the tuner below consumes the
# 'train_loader' and 'val_loader' entries
preprocessor = AtmoSeerPreprocessor()
ch4_dataloaders = preprocessor.prepare_data(ch4_df)

print("Initializing tuner")

# Tuner configuration for the CH4 gas type
tuner_config = BayesianTunerConfig(gas_type='ch4')

# Pick out the two loaders the tuner needs before constructing it
ch4_train_loader = ch4_dataloaders['train_loader']
ch4_val_loader = ch4_dataloaders['val_loader']
tuner = BayesianTuner(
    train_loader=ch4_train_loader,
    val_loader=ch4_val_loader,
    config=tuner_config,
)

## Train & Tune

In [None]:
# Run the tuning search; optimize() hands back the best parameter set and its loss
print(f"Starting optimization process ({tuner_config.n_trials} trials)...")
best_params, best_loss = tuner.optimize()

# Summarize the winning trial
print(
    "\nOptimization completed!",
    f"Best validation loss: {best_loss:.6f}",
    "\nBest parameters:",
    sep="\n",
)
for param_name, param_value in best_params.items():
    print(f"{param_name}: {param_value}")

## Forecasting

## Evaluation