# ML Training Data: Continuous pCO2 Data Only

## Overview
Create training datasets using ONLY continuous pCO2 measurements from buoy locations. This notebook:
1. Loads the continuous data periods analysis
2. Filters satellite and buoy data to match continuous windows
3. Defines a bounding box around each buoy, extending `GRID_RADIUS_KM` (default 4 km) from the buoy position
4. Builds final training table with: latitude, longitude, date, pCO2, satellite_SST
5. Exports ready-for-ML datasets

**Goal:** High-quality training data with 100% measured values (no interpolation)

**Note:** Edit the Configuration section below to choose your training approach.

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Pandas display settings: never truncate columns, print at most 100 rows
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

print("✓ All libraries imported successfully!")

## Configuration: Choose Your Training Approach

Edit the settings below to select which data windows to use for model training.

In [None]:
# CONFIGURATION: Edit these settings to choose your training approach

# === APPROACH SELECTION ===
# Choose ONE of the following:
#
# Option A: Single Best Location (depth)
#   APPROACH = 'single'
#   SELECTED_LOCATIONS = ['Southern California']
#
# Option B: Top 3 Locations (balance)
#   APPROACH = 'multi'
#   SELECTED_LOCATIONS = ['Southern California', 'LA Buoy', 'First Landing']
#
# Option C: All Viable Locations (breadth)
#   APPROACH = 'all'
#   SELECTED_LOCATIONS = None  # Will auto-select all with continuous data

# YOUR CHOICE (edit this):
APPROACH = 'single'  # Change to 'multi' or 'all' as desired
SELECTED_LOCATIONS = ['Southern California']  # Edit for your approach

# === GRID CONFIGURATION ===
GRID_RADIUS_KM = 4  # 4 km radius around each buoy (as specified)

# === FILE PATHS ===
# DATA_PATH must end with a path separator: file names are appended by plain
# string concatenation, here and in the export step.
DATA_PATH = 'Data/'
CONTINUOUS_DATA_FILE = DATA_PATH + 'buoy_continuous_data_periods.csv'
SATELLITE_FILE = DATA_PATH + 'satellite_sst_cleaned.csv'
BUOY_FILE = DATA_PATH + 'buoy_data_cleaned.csv'

# === VALIDATION ===
# Fail here, with a readable message, instead of deep inside a later cell.
VALID_APPROACHES = ('single', 'multi', 'all')
if APPROACH not in VALID_APPROACHES:
    raise ValueError(
        f"APPROACH must be one of {VALID_APPROACHES}, got {APPROACH!r}"
    )
if APPROACH != 'all' and not SELECTED_LOCATIONS:
    raise ValueError(
        f"APPROACH={APPROACH!r} requires a non-empty SELECTED_LOCATIONS list "
        "(use APPROACH='all' to select every location automatically)"
    )
if GRID_RADIUS_KM <= 0:
    raise ValueError(f"GRID_RADIUS_KM must be positive, got {GRID_RADIUS_KM!r}")

# With APPROACH == 'all' the location list is ignored downstream, so report
# what will actually be used rather than whatever the list happens to hold.
locations_label = 'All viable' if APPROACH == 'all' else SELECTED_LOCATIONS

print("=" * 80)
print("CONFIGURATION LOADED")
print("=" * 80)
print(f"\nApproach: {APPROACH.upper()}")
print(f"Locations: {locations_label}")
print(f"Grid radius: {GRID_RADIUS_KM} km")
print("\nTo change these settings, edit the values in this cell and re-run it.")
print("=" * 80)

## Step 1: Load Data Files

Load the analysis results and raw data files needed for filtering.

In [None]:
# Step 1: Read the three input CSV files into DataFrames


def _load_timestamped_csv(csv_path):
    """Read a CSV and convert its 'datetime' column to pandas timestamps."""
    frame = pd.read_csv(csv_path)
    frame['datetime'] = pd.to_datetime(frame['datetime'])
    return frame


banner = "=" * 80

print("Loading data files...\n")

# Segment table written by the continuous-data-periods analysis
continuous_periods = pd.read_csv(CONTINUOUS_DATA_FILE)
print(f"✓ Loaded continuous data periods: {len(continuous_periods)} segments")

# Satellite master table
satellite_data = _load_timestamped_csv(SATELLITE_FILE)
print(f"✓ Loaded satellite data: {len(satellite_data):,} records")

# Buoy master table
buoy_data = _load_timestamped_csv(BUOY_FILE)
print(f"✓ Loaded buoy data: {len(buoy_data):,} records")

print(f"\n{banner}")
print("DATA FILES LOADED SUCCESSFULLY")
print(banner)

## Step 2: Select Training Windows

Choose which continuous data periods to use based on your configuration.

In [None]:
# Step 2: Select training windows based on approach
#
# Produces `selected_segments`: the rows of `continuous_periods` that the rest
# of the notebook will use. Stops with a clear error if nothing is selected.

print("Selecting training windows...\n")

if APPROACH == 'all':
    # Use all available continuous segments
    selected_segments = continuous_periods.copy()
    print(f"Using ALL continuous data segments ({len(selected_segments)} total)")
else:
    # Use only selected locations
    requested_locations = list(SELECTED_LOCATIONS or [])
    selected_segments = continuous_periods[
        continuous_periods['location'].isin(requested_locations)
    ].copy()
    print(f"Using continuous segments for: {', '.join(requested_locations)}")

    # A misspelled location would otherwise vanish from the selection silently.
    available_locations = set(continuous_periods['location'])
    missing_locations = [
        name for name in requested_locations if name not in available_locations
    ]
    if missing_locations:
        print(f"⚠ No continuous segments found for: {', '.join(missing_locations)}")

if selected_segments.empty:
    known_locations = sorted(continuous_periods['location'].dropna().astype(str).unique())
    raise ValueError(
        "No continuous data segments match the current configuration. "
        f"Locations available in {CONTINUOUS_DATA_FILE}: {known_locations}"
    )

# Show the selected windows
print("\nSelected Training Windows:")
print("-" * 100)
for _, row in selected_segments.iterrows():
    print(f"  {row['location']:<20} | {row['segment_start']} to {row['segment_end']} " +
          f"| {row['days']:4.0f} days | {row['records']:4.0f} records")

print("\n" + "=" * 100)
print(f"TOTAL RECORDS AVAILABLE: {selected_segments['records'].sum():,}")
print("=" * 100)

## Step 3: Filter Data to Continuous Periods

Extract only the data from continuous pCO2 windows (no interpolation, no gaps).

In [None]:
# Step 3: Filter buoy and satellite data to continuous periods
#
# Produces:
#   buoy_filtered      - buoy rows inside the selected segments with a non-null pCO2
#   satellite_filtered - satellite rows on the calendar days spanned by buoy_filtered

print("Filtering data to continuous periods...\n")

# Collect one filtered buoy frame per selected segment
filtered_rows = []

for _, segment in selected_segments.iterrows():
    location = segment['location']
    start_date = pd.to_datetime(segment['segment_start'])
    end_date = pd.to_datetime(segment['segment_end'])
    # NOTE(review): if segment_end is stored as a date without a time of day,
    # end_date is midnight at the START of that day and any later buoy readings
    # on the final day are excluded - confirm against how the segments file
    # was generated.

    # Filter buoy data for this segment
    in_segment = (
        (buoy_data['location'] == location) &
        (buoy_data['datetime'] >= start_date) &
        (buoy_data['datetime'] <= end_date) &
        (buoy_data['pco2_sw_sat'].notna())  # Keep only measured pCO2, not NaN
    )
    segment_buoy = buoy_data[in_segment].copy()

    filtered_rows.append(segment_buoy)
    print(f"  {location}: {len(segment_buoy):,} measured pCO2 records")

# pd.concat raises an opaque "No objects to concatenate" on an empty list
if not filtered_rows:
    raise ValueError(
        "No training segments selected - check APPROACH / SELECTED_LOCATIONS."
    )

# Combine all filtered segments
buoy_filtered = pd.concat(filtered_rows, ignore_index=True)
print(f"\nTotal filtered buoy records: {len(buoy_filtered):,}")

if buoy_filtered.empty:
    raise ValueError(
        "The selected segments contain no buoy records with measured pCO2; "
        "nothing to build a training table from."
    )

# Filter satellite data to the calendar days covered by the filtered buoy data.
# Satellite and buoy rows are later paired by calendar date, so the window is
# widened to whole days: bounding it by the exact first/last buoy timestamp
# would drop same-day satellite rows stamped earlier (first day) or later
# (last day) than the buoy readings.
first_day = buoy_filtered['datetime'].min().normalize()
day_after_last = buoy_filtered['datetime'].max().normalize() + pd.Timedelta(days=1)

satellite_filtered = satellite_data[
    (satellite_data['datetime'] >= first_day) &
    (satellite_data['datetime'] < day_after_last)
].copy()

print(f"Satellite records in matching time range: {len(satellite_filtered):,}")

print("\n" + "=" * 100)
print("DATA FILTERED TO CONTINUOUS PERIODS - 100% MEASURED VALUES")
print("=" * 100)

## Step 4: Create Spatial Grid Around Buoys

Define a bounding box around each buoy location, extending `GRID_RADIUS_KM` (default 4 km) from the buoy position, to select the satellite data that belongs to that buoy.

In [None]:
# Step 4: Create spatial grid cells around each buoy
#
# Produces `grid_df`: one row per location with the centre coordinates and the
# lat/lon bounds of a box reaching GRID_RADIUS_KM from the buoy in each direction.
#
# Conversion: 1 degree of latitude = ~111 km everywhere, but 1 degree of
# longitude = ~111 km * cos(latitude), so the longitude half-width has to be
# widened by 1 / cos(latitude) to cover the same distance east-west.

print("Creating spatial grid cells...\n")

KM_PER_DEGREE_LAT = 111.0
GRID_COLUMNS = [
    'location', 'center_lat', 'center_lon',
    'lat_min', 'lat_max', 'lon_min', 'lon_max', 'grid_id',
]

# Latitude half-width in degrees (same at every latitude)
grid_radius_degrees = GRID_RADIUS_KM / KM_PER_DEGREE_LAT

# Create grid definitions for each buoy location
grid_definitions = []

for location in selected_segments['location'].unique():
    # Use the first filtered record's coordinates as the buoy position
    loc_data = buoy_filtered[buoy_filtered['location'] == location]
    if len(loc_data) == 0:
        continue

    lat = loc_data['latitude'].iloc[0]
    lon = loc_data['longitude'].iloc[0]

    # Longitude half-width for this latitude; the floor on cos() only guards
    # against division by ~0 for coordinates at the poles.
    lon_radius_degrees = grid_radius_degrees / max(np.cos(np.radians(lat)), 1e-6)

    # Define grid bounds
    grid_def = {
        'location': location,
        'center_lat': lat,
        'center_lon': lon,
        'lat_min': lat - grid_radius_degrees,
        'lat_max': lat + grid_radius_degrees,
        'lon_min': lon - lon_radius_degrees,
        'lon_max': lon + lon_radius_degrees,
        'grid_id': f"{location}_{lat:.2f}_{lon:.2f}"
    }
    grid_definitions.append(grid_def)

    # Print the hemisphere letter that matches the sign of each coordinate
    lat_hemisphere = 'N' if lat >= 0 else 'S'
    lon_hemisphere = 'E' if lon >= 0 else 'W'

    print(f"  {location}")
    print(f"    Center: {abs(lat):.4f}°{lat_hemisphere}, {abs(lon):.4f}°{lon_hemisphere}")
    print(f"    Bounds: Lat [{grid_def['lat_min']:.4f}, {grid_def['lat_max']:.4f}], " +
          f"Lon [{grid_def['lon_min']:.4f}, {grid_def['lon_max']:.4f}]")
    print()

# Explicit columns keep the frame's schema intact even when no grid was created
grid_df = pd.DataFrame(grid_definitions, columns=GRID_COLUMNS)

print("=" * 100)
print(f"SPATIAL GRID CREATED FOR {len(grid_df)} LOCATIONS ({GRID_RADIUS_KM} km radius each)")
print("=" * 100)

## Step 5: Build Final Training Table

Combine pCO2 measurements with matching satellite SST data in grid cells.

In [None]:
# Step 5: Build final training table
#
# Produces `training_df`: one row per buoy record that has at least one
# satellite row for the same location and calendar date inside the location's
# grid box. Satellite SST is summarised per (location, date) as mean / min /
# max / row count.
#
# Done with a filter + groupby + merge instead of scanning the whole satellite
# table once per buoy record; the resulting rows and columns are the same.

print("Building final training table...\n")

TRAINING_COLUMNS = [
    'location', 'date', 'latitude', 'longitude', 'grid_id',
    'pco2_sw_sat', 'buoy_sst_celsius',
    'satellite_sst_avg', 'satellite_sst_min', 'satellite_sst_max',
    'satellite_cells_matched', 'xco2_sw_dry',
]

# 1) Attach each location's grid bounds to its satellite rows and keep only the
#    rows inside the box (bounds inclusive on both sides).
sat_with_bounds = satellite_filtered.merge(
    grid_df[['location', 'lat_min', 'lat_max', 'lon_min', 'lon_max']],
    on='location',
    how='inner',
)
inside_grid = (
    sat_with_bounds['latitude'].between(sat_with_bounds['lat_min'], sat_with_bounds['lat_max']) &
    sat_with_bounds['longitude'].between(sat_with_bounds['lon_min'], sat_with_bounds['lon_max'])
)
sat_in_grid = sat_with_bounds[inside_grid]

# 2) Summarise satellite SST per location and calendar date.
#    'size' counts every matched row, including rows whose SST is NaN.
sat_daily = (
    sat_in_grid
    .assign(date=sat_in_grid['datetime'].dt.date)
    .groupby(['location', 'date'])['sst_celsius']
    .agg(
        satellite_sst_avg='mean',
        satellite_sst_min='min',
        satellite_sst_max='max',
        satellite_cells_matched='size',
    )
    .reset_index()
)

# 3) Pair every buoy record with its day's satellite summary. The inner join
#    drops buoy records with no satellite coverage that day and keeps the
#    remaining buoy rows in their original order.
training_df = (
    buoy_filtered
    .assign(date=buoy_filtered['datetime'].dt.date)
    .merge(sat_daily, on=['location', 'date'], how='inner')
    .merge(grid_df[['location', 'grid_id']], on='location', how='inner')
    .rename(columns={'sst_celsius': 'buoy_sst_celsius'})
    .loc[:, TRAINING_COLUMNS]
    .reset_index(drop=True)
)

if training_df.empty:
    print("⚠ No buoy record has matching satellite data inside its grid box - "
          "the training table is empty.\n")

print(f"✓ Final training table built: {len(training_df):,} rows")
print(f"\nColumns: {training_df.columns.tolist()}")
print(f"\nFirst few rows:")
print(training_df.head(10))

print("\n" + "=" * 100)
print("TRAINING TABLE READY FOR ML MODELING")
print("=" * 100)

## Step 6: Data Summary and Export

Analyze and save the final training dataset.

In [None]:
# Step 6: Summary and Export
#
# Prints summary statistics for `training_df`, then writes the table to a dated
# CSV and a short plain-text summary next to it under DATA_PATH.

# Exporting an empty training set is never useful; stop with a clear message.
if training_df.empty:
    raise ValueError(
        "Training table is empty - nothing to summarise or export. "
        "Check the selected locations and the satellite coverage."
    )

# Take the timestamp once so both file names and the 'Created' line agree,
# even if the cell happens to run across midnight.
run_time = datetime.now()
date_stamp = run_time.strftime('%Y%m%d')

print("=" * 100)
print("FINAL TRAINING DATASET SUMMARY".center(100))
print("=" * 100)

print(f"\nDataset shape: {training_df.shape[0]:,} rows × {training_df.shape[1]} columns")

print("\nLocations represented:")
location_counts = training_df['location'].value_counts().sort_index()
for location, count in location_counts.items():
    pct = count / len(training_df) * 100
    print(f"  • {location}: {count:,} records ({pct:.1f}%)")

print("\nData statistics:")
print(f"  • Date range: {training_df['date'].min()} to {training_df['date'].max()}")
print(f"  • pCO2 range: {training_df['pco2_sw_sat'].min():.1f} - {training_df['pco2_sw_sat'].max():.1f} µatm")
print(f"  • pCO2 mean: {training_df['pco2_sw_sat'].mean():.1f} ± {training_df['pco2_sw_sat'].std():.1f}")
print(f"  • Satellite SST mean: {training_df['satellite_sst_avg'].mean():.2f}°C (range: {training_df['satellite_sst_avg'].min():.2f} - {training_df['satellite_sst_avg'].max():.2f}°C)")

print("\nMissing values:")
print(training_df.isnull().sum())

# Export the training dataset
output_filename = f"{DATA_PATH}ml_training_continuous_data_{date_stamp}.csv"
training_df.to_csv(output_filename, index=False)
print(f"\n✓ Exported: {output_filename}")

# Also save a summary. The text contains non-ASCII bullets, so pin the encoding
# instead of relying on the platform default.
summary_filename = f"{DATA_PATH}ml_training_summary_{date_stamp}.txt"
with open(summary_filename, 'w', encoding='utf-8') as f:
    f.write("ML TRAINING DATASET SUMMARY\n")
    f.write("=" * 80 + "\n\n")
    f.write(f"Dataset: {output_filename}\n")
    f.write(f"Created: {run_time}\n\n")
    f.write(f"Records: {len(training_df):,}\n")
    f.write(f"Date range: {training_df['date'].min()} to {training_df['date'].max()}\n")
    f.write(f"Locations: {', '.join(sorted(training_df['location'].unique()))}\n\n")
    f.write("Data Quality:\n")
    f.write("  • 100% measured pCO2 (no interpolation)\n")
    f.write("  • All records have matched satellite SST data\n")
    # Report the configured radius rather than a hard-coded value
    f.write(f"  • {GRID_RADIUS_KM} km spatial grid applied\n\n")
    f.write("Columns:\n")
    for col in training_df.columns:
        f.write(f"  • {col}\n")

print(f"✓ Summary saved: {summary_filename}")

print("\n" + "=" * 100)
print("READY FOR ML MODEL TRAINING!")
print("=" * 100)

## Optional: Quick Exploratory Analysis

Uncomment the plotting code in the cell below to explore your training data.

In [None]:
# Optional: quick visual checks of the training data.
#
# Everything below is disabled. To use it, remove the leading "# " from the
# plotting lines; they read `training_df`, so the training-table cell must have
# been run first.

# Two panels side by side: pCO2 vs satellite SST (left), pCO2 over time (right)
# fig, axes = plt.subplots(1, 2, figsize=(14, 5))
#
# # Plot 1: pCO2 vs Satellite SST
# for location in training_df['location'].unique():
#     loc_data = training_df[training_df['location'] == location]
#     axes[0].scatter(loc_data['satellite_sst_avg'], loc_data['pco2_sw_sat'], 
#                    label=location, alpha=0.6, s=30)
# axes[0].set_xlabel('Satellite SST (°C)')
# axes[0].set_ylabel('pCO2 (µatm)')
# axes[0].set_title('pCO2 vs Satellite SST')
# axes[0].legend(fontsize=8)
# axes[0].grid(alpha=0.3)
#
# # Plot 2: Time series
# for location in training_df['location'].unique():
#     loc_data = training_df[training_df['location'] == location].sort_values('date')
#     axes[1].plot(loc_data['date'], loc_data['pco2_sw_sat'], label=location, alpha=0.6)
# axes[1].set_xlabel('Date')
# axes[1].set_ylabel('pCO2 (µatm)')
# axes[1].set_title('pCO2 Time Series by Location')
# axes[1].legend(fontsize=8)
# axes[1].grid(alpha=0.3)
# plt.tight_layout()
# plt.show()

print("Exploratory visualization code available above (uncomment to use)")