<center><h1>Deep Learning Pipeline</h1></center>

In [None]:
# Data Manipulation
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

# System Settings
import warnings
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
sys.path.append(os.path.abspath('../atmoseer'))
sys.path.append(os.path.abspath('../utils'))
warnings.filterwarnings("ignore")

# Custom Database Operations
from utils.postgres_processor import load_table

# Deep Learning Operations
import torch
from atmoseer.atmoseer_core import BayesianTuner
from atmoseer.preprocessors.atmoseer_preprocessor import AtmoSeerPreprocessor
from atmoseer.configs.atmoseer_config import BayesianTunerConfig
from atmoseer.evaluation.atmoseer_eval import AtmoSeerEvaluator

# Select the compute device: use CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

<center><h1>Carbon Dioxide (CO<sub>2</sub>)</h1></center>

In [2]:
# Load the NOAA CO2 table from Postgres.
# The third load_table argument (presumably the database password -- confirm
# against utils.postgres_processor) is read from the PGPASSWORD environment
# variable instead of being committed in the notebook. A missing variable
# raises KeyError('PGPASSWORD') so the failure is explicit.
# NOTE(review): the value that was previously hardcoded here should be treated
# as exposed and rotated.
co2_df = load_table("CO2DataNOAA", "postgres", os.environ["PGPASSWORD"])
print(co2_df.shape)

# Styled preview of the first rows; as the last expression it is rendered by the notebook.
(
    co2_df.head()
    .style.format(precision=2)
    .set_properties(**{'border': '1.5px solid blue'})
    .background_gradient(cmap='coolwarm')
)

(201850, 17)


Unnamed: 0,date,site,ppm,latitude,longitude,altitude,year,month,day,season,co2_change_rate,month_sin,month_cos,ppm_lag_14,ppm_lag_30,ppm_lag_365,biomass_density
0,1968-01-16,NWR,324.38,40.05,-105.63,3526.0,1968,1,16,Winter,1.5,0.5,0.87,324.38,324.38,324.38,328352893.44
1,1968-01-17,NWR,323.69,40.05,-105.63,3526.0,1968,1,17,Winter,-0.16,0.5,0.87,324.38,324.38,324.38,328352893.44
2,1968-02-29,NWR,325.53,40.05,-105.63,3526.0,1968,2,29,Winter,1.84,0.87,0.5,324.38,324.38,324.38,736124172.48
3,1968-03-07,NWR,326.49,40.05,-105.63,3526.0,1968,3,7,Spring,0.96,1.0,0.0,324.38,324.38,324.38,1143895451.52
4,1968-03-14,NWR,326.09,40.05,-105.63,3526.0,1968,3,14,Spring,-0.4,1.0,0.0,324.38,324.38,324.38,1143895451.52


## Preprocessing

In [4]:
# Turn the raw CO2 frame into data loaders; the tuner below consumes the
# 'train_loader' and 'val_loader' entries of the returned mapping.
preprocessor = AtmoSeerPreprocessor()
co2_dataloaders = preprocessor.prepare_data(co2_df)

print("Initializing tuner")

# Tuner settings for the CO2 model (n_trials is read from this config later on).
tuner_config = BayesianTunerConfig(gas_type='co2')
tuner = BayesianTuner(
    config=tuner_config,
    train_loader=co2_dataloaders['train_loader'],
    val_loader=co2_dataloaders['val_loader'],
)

Missing value counts:
biomass_density    6090
dtype: int64
Initializing tuner


## Train & Tune

In [5]:
# Run the hyper-parameter search; optimize() yields the best parameter mapping
# together with the validation loss reported for it.
print(f"Starting optimization process ({tuner_config.n_trials} trials)...")
best_params, best_loss = tuner.optimize()

# Summary header (same text and blank lines as three separate prints would give).
print(
    "\nOptimization completed!",
    f"Best validation loss: {best_loss:.6f}",
    "\nBest parameters:",
    sep="\n",
)

# One "name: value" line per tuned hyper-parameter.
for param, value in best_params.items():
    print(f"{param}: {value}")

Starting optimization process (50 trials)...

Trial 1/50
Epoch: 0 
New best validation loss: 0.102639 for current trial
Epoch: 1 
New best validation loss: 0.102153 for current trial
Epoch: 9 
New best validation loss: 0.081684 for current trial
Epoch: 10 
New best validation loss: 0.075768 for current trial
Epoch: 13 
New best validation loss: 0.074357 for current trial
Epoch: 17 
New best validation loss: 0.069326 for current trial
Epoch: 19 
New best validation loss: 0.068806 for current trial
Early stopping triggered at epoch 29. Best epoch was 19 with validation loss 0.068806

Trial 2/50
Epoch: 0 
New best validation loss: 0.130761 for current trial
Epoch: 1 
New best validation loss: 0.102792 for current trial
Epoch: 2 
New best validation loss: 0.102607 for current trial
Epoch: 4 
New best validation loss: 0.102560 for current trial
Epoch: 7 
New best validation loss: 0.102542 for current trial
Early stopping triggered at epoch 17. Best epoch was 7 with validation loss 0.102542


## Forecasting

This method implements an iterative forecasting process in which each prediction becomes part of the input for the next 
prediction. It uses Monte Carlo sampling with added Gaussian noise to estimate prediction uncertainty, which naturally 
grows over time as predictions are chained together. The Monte Carlo sampling accounts for the inherent randomness in time series data: real-life values fluctuate within a range around the trend rather than following it exactly, even when the overall trend is linear. Instead of relying on a single point estimate, this 
forecast method builds a normal distribution around each prediction point by drawing 100 normally distributed values 
whose mean is the prediction point and whose standard deviation is the `noise_scale`. This produces a range of possible 
values for the prediction, which is used to derive the uncertainty bounds. The further into the future 
the predictions go, the wider the uncertainty bounds become. The Bayesian Tuner runs many trials to find 
the optimal sequence length (lookback window in days), and this forecast method then uses that sequence length 
to generate predictions. For dates more than one sequence length beyond the last observation, the predicted values are based entirely on 
other predicted values (not on observed data points), so the uncertainty grows by an increasingly large amount.

In [None]:
from datetime import timedelta
import plotly.graph_objects as go

# Forecast settings, kept together so the horizon used by the model call and
# by the date axis cannot drift apart.
FORECAST_DAYS = 365       # number of daily steps to forecast
HISTORY_ROWS = 365        # number of trailing rows of co2_df drawn as "Historical"
CONFIDENCE_LEVEL = 0.95   # passed to generate_forecast as confidence_interval
NOISE_SCALE = 0.1         # passed to generate_forecast as noise_scale
FORECAST_SEED = 42        # makes the sampled uncertainty bounds repeatable

atmoseer_co2 = BayesianTuner.load_best_model(gas_type='co2', device=device)

# Rebuild the loaders with the loaded model's own window length and batch size.
preprocessor = AtmoSeerPreprocessor()
processed_data = preprocessor.prepare_data(
    co2_df,
    seq_length=atmoseer_co2.model_config.sequence_length,
    batch_size=atmoseer_co2.train_config.batch_size
)

# Seed the forecast with the LAST window of the LAST test batch. Taking
# next(iter(test_loader)) would use the first batch, i.e. a window that ends
# long before the date the forecast axis starts from.
# NOTE(review): assumes test_loader iterates in chronological order (not shuffled) -- confirm.
test_loader = processed_data['test_loader']
final_batch = None
for final_batch in test_loader:
    pass
if final_batch is None:
    raise ValueError("test_loader yielded no batches; cannot seed the forecast")
# Slicing with [-1:] (rather than indexing) keeps the leading batch axis;
# expected shape is [1, sequence_length, features].
last_sequence = final_batch[0][-1:]

atmoseer_co2.eval()

# The forecast is described as Monte Carlo sampling with Gaussian noise; the
# sampler's implementation is not visible here, so seed both torch and numpy.
torch.manual_seed(FORECAST_SEED)
np.random.seed(FORECAST_SEED)

# Generate the forecast (FORECAST_DAYS daily steps)
forecast = atmoseer_co2.generate_forecast(
    initial_sequence=last_sequence,
    forecast_length=FORECAST_DAYS,
    confidence_interval=CONFIDENCE_LEVEL,
    noise_scale=NOISE_SCALE,
    device=device
)

# Inverse transform predictions and both bounds back to the original scale.
# The scaler is fed a single column, hence the reshape(-1, 1) / flatten() pair.
target_scaler = processed_data['target_scaler']
forecast_unscaled = {
    key: target_scaler.inverse_transform(forecast[key].reshape(-1, 1)).flatten()
    for key in ('predictions', 'upper_bound', 'lower_bound')
}

# Create the date axis for the forecast. The column is coerced with
# pd.to_datetime so the "+ 1 day" works whether 'date' holds strings,
# datetime.date objects or Timestamps; the axis length follows the number of
# predictions actually returned.
last_date = pd.to_datetime(co2_df['date']).max()
forecast_dates = pd.date_range(
    start=last_date + timedelta(days=1),
    periods=len(forecast_unscaled['predictions']),
    freq='D'
)

# Create the plot
fig = go.Figure()

# Add historical data.
# NOTE(review): tail() takes the last HISTORY_ROWS rows, not the last 365
# calendar days, and co2_df has a 'site' column -- confirm the frame's row
# order makes these rows (and iloc[-1] below) the most recent observations.
historical_data = co2_df.tail(HISTORY_ROWS)
fig.add_trace(go.Scatter(
    x=historical_data['date'],
    y=historical_data['ppm'],
    name='Historical',
    line=dict(color='#00B5F7', width=1.5)
))

# Add forecast
fig.add_trace(go.Scatter(
    x=forecast_dates,
    y=forecast_unscaled['predictions'],
    name='Forecast',
    line=dict(color='#32CD32', width=1.5)
))

# Add confidence band: an invisible upper line, then an invisible lower line
# whose 'tonexty' fill shades the area between the two.
fig.add_trace(go.Scatter(
    x=forecast_dates,
    y=forecast_unscaled['upper_bound'],
    fill=None,
    mode='lines',
    line=dict(color='rgba(50, 205, 50, 0)'),
    showlegend=False
))

fig.add_trace(go.Scatter(
    x=forecast_dates,
    y=forecast_unscaled['lower_bound'],
    fill='tonexty',
    mode='lines',
    line=dict(color='rgba(50, 205, 50, 0)'),
    fillcolor='rgba(50, 205, 50, 0.2)',
    showlegend=False
))

# Update layout
fig.update_layout(
    title='CO2 Concentration Forecast',
    template='plotly_dark',
    xaxis_title='Date',
    yaxis_title='CO2 (ppm)',
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01,
        bgcolor='rgba(0,0,0,0.5)'
    ),
    margin=dict(l=20, r=20, t=40, b=20),
    showlegend=True,
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)'
)

fig.show()

# Print some key metrics (the labels assume the default one-year horizon)
current_ppm = historical_data['ppm'].iloc[-1]
final_ppm = forecast_unscaled['predictions'][-1]
print("\nForecast Summary:")
print(f"Current CO2 Level: {current_ppm:.2f} ppm")
print(f"Forecasted CO2 Level (1 year): {final_ppm:.2f} ppm")
print(f"Predicted Annual Increase: {final_ppm - current_ppm:.2f} ppm")
print("\nConfidence Interval (End of Year):")
print(f"Lower Bound: {forecast_unscaled['lower_bound'][-1]:.2f} ppm")
print(f"Upper Bound: {forecast_unscaled['upper_bound'][-1]:.2f} ppm")

# Save predictions to DataFrame
forecast_df = pd.DataFrame({
    'date': forecast_dates,
    'predicted_ppm': forecast_unscaled['predictions'],
    'lower_bound': forecast_unscaled['lower_bound'],
    'upper_bound': forecast_unscaled['upper_bound']
})

# Display first few rows of predictions; the bare last expression gets the
# notebook's rich table rendering instead of a plain-text print.
print("\nDetailed Forecast (First 10 days):")
forecast_df.head(10)

## Evaluation

In [None]:
# Reload the CO2 table so this cell does not depend on earlier cells' data.
# The third load_table argument (presumably the database password -- confirm
# against utils.postgres_processor) comes from the PGPASSWORD environment
# variable rather than a literal committed in the notebook; a missing variable
# raises KeyError('PGPASSWORD').
co2_df = load_table("CO2DataNOAA", "postgres", os.environ["PGPASSWORD"])

# Load the best model onto the selected device (GPU when available, else CPU)
atmoseer_co2 = BayesianTuner.load_best_model(gas_type='co2', device=device)

# Build the loaders with the loaded model's own window length and batch size,
# matching how the forecasting cell prepares its data, so the model is scored
# on the same kind of sequences it is used with.
preprocessor = AtmoSeerPreprocessor()
data_loaders = preprocessor.prepare_data(
    co2_df,
    seq_length=atmoseer_co2.model_config.sequence_length,
    batch_size=atmoseer_co2.train_config.batch_size
)

# Create evaluator on the same device as the model
evaluator = AtmoSeerEvaluator(atmoseer_co2, data_loaders, device=device)

# Get metrics: a mapping of dataset name -> {metric name -> numeric value}
metrics = evaluator.evaluate()
print("\nMetrics:")
for dataset, dataset_metrics in metrics.items():
    print(f"\n{dataset.title()} Set Metrics:")
    for metric, value in dataset_metrics.items():
        print(f"{metric.upper()}: {value:.4f}")

# Create visualization over the full daily range spanned by the data
dates = pd.date_range(co2_df['date'].min(), co2_df['date'].max())
fig = evaluator.plot_results(gas_type='co2', dates=dates)
fig.show()

<center><h1>Methane (CH<sub>4</sub>)</h1></center>

In [None]:
# Load the NOAA CH4 table from Postgres.
# The third load_table argument (presumably the database password -- confirm
# against utils.postgres_processor) is read from the PGPASSWORD environment
# variable instead of being committed in the notebook. A missing variable
# raises KeyError('PGPASSWORD') so the failure is explicit.
ch4_df = load_table("CH4DataNOAA", "postgres", os.environ["PGPASSWORD"])
print(ch4_df.shape)

# Styled preview of the first rows; as the last expression it is rendered by the notebook.
(
    ch4_df.head()
    .style.format(precision=2)
    .set_properties(**{'border': '1.5px solid blue'})
    .background_gradient(cmap='coolwarm')
)

## Preprocessing

In [None]:
# Re-select the compute device (CUDA when available, otherwise CPU); this is
# the same selection the setup cell at the top of the notebook performs.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

## Train & Tune

## Test

## Evaluation