# Processing Line P data from waterproperties.ca
The data was downloaded from www.waterproperties.ca as .ctd files. We used the LineP area, which includes data in between stations. This script will process the raw data into a dataframe and save it as a CSV. We only include profiles that are within 10 km of a station, or 25 km for station P26.

The data was extracted to NetCDF using the quality-control flags. This script loads these files to create a dataframe for each year and subsamples it to exclude any data that is far from the station.

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import numpy as np
from scipy.spatial import cKDTree
from pyproj import Geod
import xarray as xr
import os
import glob
from datetime import datetime, timedelta

ERROR 1: PROJ: proj_create_from_database: Open of /home/amh001/space_fs7/software_2022/python/py_2024/share/proj failed


In [3]:
# %%
def load_temperature_profiles(file_paths):
    """
    Load CTD cast profiles from NetCDF files into one long-format DataFrame.

    Each file is expected to hold a single cast. The variables ``time``,
    ``temperature``, ``salinity`` and ``Pres`` are required; ``latitude``,
    ``longitude`` and the two oxygen variables are optional and are filled
    with NaN when absent. A file that cannot be read or is missing a
    required variable is reported on stdout and skipped, so one bad file
    does not abort the whole batch.

    Parameters:
        file_paths (list of str): List of paths to NetCDF files.

    Returns:
        pd.DataFrame: One row per sample, indexed by cast time, with columns
                      ``file``, ``CTDPRS_DBAR``, ``CTDTMP_ITS90_DEG_C``,
                      ``SALINITY_PSS78``, ``latitude``, ``longitude``,
                      ``OXYGEN_MMOL_M3`` and ``OXYGEN_UMOL_KG``. An empty
                      DataFrame is returned when no file could be loaded.
    """
    # NetCDF variable name -> output column name.
    oxygen_mapping = {
        'oxygen': 'OXYGEN_MMOL_M3',
        'oxy_umol_kg': 'OXYGEN_UMOL_KG'
    }

    all_data = []

    for path in file_paths:
        try:
            # The context manager releases the file handle as soon as the
            # arrays have been read, even when a required variable is missing.
            with xr.open_dataset(path, decode_cf=True) as ds:
                cast_time = pd.to_datetime(ds['time'].values[0])

                # Always require temperature, salinity, and pressure.
                # Flatten all of them the same way so that arrays stored with
                # a leading singleton dimension still yield 1-D columns.
                pressure = np.ravel(ds['Pres'].values)
                temperature = np.ravel(ds['temperature'].values)
                salinity = np.ravel(ds['salinity'].values)
                n_levels = len(pressure)

                # Scalars (time, file, position) are broadcast by pandas to
                # the length of the array-valued columns.
                var_data = {
                    'time': cast_time,
                    'file': os.path.basename(path),
                    'CTDPRS_DBAR': pressure,
                    'CTDTMP_ITS90_DEG_C': temperature,
                    'SALINITY_PSS78': salinity
                }

                # Extract latitude and longitude (if available; if not, use NaN)
                var_data['latitude'] = ds['latitude'].values.item() if 'latitude' in ds else np.nan
                var_data['longitude'] = ds['longitude'].values.item() if 'longitude' in ds else np.nan

                # Always include both oxygen columns so every per-file frame
                # has the same schema; missing or all-NaN data becomes NaN.
                for netcdf_var, output_col in oxygen_mapping.items():
                    column = np.full(n_levels, np.nan, dtype=float)
                    if netcdf_var in ds:
                        data = np.ravel(ds[netcdf_var].values)
                        if data.size > 0 and not np.all(np.isnan(data)):
                            column = data
                    var_data[output_col] = column

            # Create DataFrame for the current file
            all_data.append(pd.DataFrame(var_data))
        except Exception as e:
            # Deliberate best-effort: report the file and carry on.
            print(f"Failed to process {path}: {e}")

    if not all_data:
        return pd.DataFrame()

    # Combine data from all files
    return pd.concat(all_data, ignore_index=True).set_index('time')

# %%
def process_line_p_data(start_year=1969, end_year=2020):
    """
    Build yearly CSVs of CTD data located close to a Line P station.

    For every year in ``[start_year, end_year]`` the NetCDF casts matching
    ``*CastCTD_<year>*.nc`` are loaded with ``load_temperature_profiles``,
    each row is assigned to its nearest Line P station, and only rows closer
    than the station's threshold (25 km for P26, 10 km otherwise) are kept.
    Kept rows are written to ``LineP_ctds_<year>.csv`` and a map of kept and
    eliminated positions is saved to ``Figures/LineP_ctds_map_<year>.png``.

    Parameters:
        start_year (int): First year to process (inclusive).
        end_year (int): Last year to process (inclusive).

    Returns:
        None. Results are written to disk and progress is printed.
    """
    path0 = '/gpfs/fs7/dfo/hpcmc/pfm/amh001/DATA/OUTPUTS_LINEP/CTD/'
    outpath = '/gpfs/fs7/dfo/hpcmc/pfm/amh001/DATA/OUTPUTS_LINEP/'
    figures_dir = 'Figures'

    # Create figures directory if it doesn't exist
    os.makedirs(figures_dir, exist_ok=True)

    def latlon_to_xyz(lat, lon):
        """Convert lat/lon (in radians) to 3D Cartesian coordinates."""
        R = 6371  # Earth radius in km
        x = R * np.cos(lat) * np.cos(lon)
        y = R * np.cos(lat) * np.sin(lon)
        z = R * np.sin(lat)
        return np.column_stack((x, y, z))

    print("Loading Line P station coordinates...")
    linep_df = pd.read_csv("/gpfs/fs7/dfo/hpcmc/pfm/amh001/DATA/SD-Ocean/observations_LineP/LineP.csv")
    linep_coords_rad = np.radians(linep_df[['latitude', 'longitude']].values)

    print("Building spatial index...")
    linep_xyz = latlon_to_xyz(linep_coords_rad[:, 0], linep_coords_rad[:, 1])
    tree = cKDTree(linep_xyz)

    station_names = linep_df['station_name'].values
    distance_thresholds = np.where(station_names == 'P26', 25.0, 10.0)
    # Single source of truth for the threshold reported per station below.
    threshold_by_station = dict(zip(station_names, distance_thresholds))

    for year in range(start_year, end_year + 1):
        print(f"\nProcessing year {year}...")
        nc_files = glob.glob(os.path.join(path0, f"*CastCTD_{year}*.nc"))

        if not nc_files:
            print(f"  No files found for year {year}")
            continue

        df_profiles = load_temperature_profiles(nc_files)

        if df_profiles.empty:
            print(f"  No valid profiles for year {year}")
            continue

        # Casts without latitude/longitude arrive with NaN coordinates.
        # The KD-tree cannot be queried with non-finite points, so drop
        # those rows up front instead of letting one cast abort the year.
        coords_deg = df_profiles[['latitude', 'longitude']].to_numpy(dtype=float)
        has_position = np.isfinite(coords_deg).all(axis=1)
        if not has_position.all():
            print(f"  Dropping {np.count_nonzero(~has_position)} rows without a valid position")
            df_profiles = df_profiles[has_position].copy()
            coords_deg = coords_deg[has_position]
            if df_profiles.empty:
                print(f"  No valid profiles for year {year}")
                continue

        # Convert profile coordinates to xyz
        profile_coords_rad = np.radians(coords_deg)
        profile_xyz = latlon_to_xyz(profile_coords_rad[:, 0], profile_coords_rad[:, 1])

        # Query nearest stations. Distances are straight-line (chord) lengths
        # in km between points on the sphere; at the 10-25 km scale used here
        # they are negligibly shorter than the along-surface distance.
        distances, indices = tree.query(profile_xyz, k=1)

        df_profiles['closest_linep_station_name'] = station_names[indices]
        df_profiles['distance_to_closest_station_km'] = distances

        profile_thresholds = distance_thresholds[indices]
        kept_mask = distances < profile_thresholds
        kept_profiles = df_profiles[kept_mask]
        filtered_profiles = df_profiles[~kept_mask]

        # Save CSV
        if not kept_profiles.empty:
            output_filename = os.path.join(outpath, f'LineP_ctds_{year}.csv')
            kept_profiles.to_csv(output_filename, na_rep='NaN')

            station_counts = kept_profiles['closest_linep_station_name'].value_counts()
            print(f"  Saved {len(kept_profiles)} profiles for {year}:")
            for station, count in station_counts.items():
                threshold = threshold_by_station[station]
                print(f"    {station}: {count} profiles (within {threshold:g} km)")
        else:
            print(f"  No profiles within threshold distances for year {year}")

        # Generate map plot. df_profiles is non-empty at this point, so at
        # least one of the kept / eliminated sets has something to draw.
        fig, ax = plt.subplots(figsize=(10, 8), subplot_kw={'projection': ccrs.PlateCarree()})
        ax.coastlines()
        ax.gridlines(draw_labels=False)

        ax.scatter(linep_df['longitude'], linep_df['latitude'],
                   color='blue', marker='x', s=50, linewidth=0.5, zorder=100,
                   label='Line P Stations')

        if not kept_profiles.empty:
            ax.scatter(kept_profiles['longitude'], kept_profiles['latitude'],
                       facecolors='none', edgecolors='orange', s=20, alpha=0.7,
                       label=f'Kept Profiles ({len(kept_profiles)})')

        if not filtered_profiles.empty:
            ax.scatter(filtered_profiles['longitude'], filtered_profiles['latitude'],
                       facecolors='none', edgecolors='red', marker='^', s=20, alpha=0.7,
                       label=f'Eliminated Profiles ({len(filtered_profiles)})')

        ax.legend()
        ax.set_extent([-147, -123, 47, 52], crs=ccrs.PlateCarree())
        plot_filename = os.path.join(figures_dir, f'LineP_ctds_map_{year}.png')
        # Save and close this specific figure rather than relying on
        # pyplot's implicit "current figure" state.
        fig.savefig(plot_filename, dpi=300, bbox_inches='tight')
        plt.close(fig)
        print(f"  Saved plot to {plot_filename}")

# %%
# Process every year from 1997 through 2020 (both inclusive).
process_line_p_data(start_year=1997, end_year=2020)

Loading Line P station coordinates...
Building spatial index...

Processing year 1997...
  Saved 233980 profiles for 1997:
    P26: 20636 profiles (within 25 km)
    P8: 14588 profiles (within 10 km)
    P4: 14482 profiles (within 10 km)
    P12: 14010 profiles (within 10 km)
    P16: 13471 profiles (within 10 km)
    P10: 11320 profiles (within 10 km)
    P9: 10738 profiles (within 10 km)
    P6: 10733 profiles (within 10 km)
    P11: 10570 profiles (within 10 km)
    P5: 10409 profiles (within 10 km)
    P7: 10103 profiles (within 10 km)
    P3: 9260 profiles (within 10 km)
    P20: 9233 profiles (within 10 km)
    P15: 7211 profiles (within 10 km)
    P23: 7018 profiles (within 10 km)
    P17: 7017 profiles (within 10 km)
    P14: 7015 profiles (within 10 km)
    P24: 7014 profiles (within 10 km)
    P19: 7003 profiles (within 10 km)
    P13: 5528 profiles (within 10 km)
    P21: 5509 profiles (within 10 km)
    P18: 5503 profiles (within 10 km)
    P35: 4507 profiles (within 10 km)

In [8]:
# Run the processing for 2007 and 2008 only.
process_line_p_data(start_year=2007, end_year=2008)

Loading Line P station coordinates...
Building spatial index...
Processing year 2007...
Saved 194675 profiles for 2007:
  P26: 29539 profiles (within 25 km)
  P20: 16827 profiles (within 10 km)
  P16: 14554 profiles (within 10 km)
  P12: 14510 profiles (within 10 km)
  P18: 6012 profiles (within 10 km)
  P19: 6012 profiles (within 10 km)
  P17: 6009 profiles (within 10 km)
  P23: 6008 profiles (within 10 km)
  P35: 6005 profiles (within 10 km)
  P24: 6000 profiles (within 10 km)
  P22: 5999 profiles (within 10 km)
  P21: 5995 profiles (within 10 km)
  P9: 5917 profiles (within 10 km)
  P8: 5901 profiles (within 10 km)
  P11: 5876 profiles (within 10 km)
  P14: 5871 profiles (within 10 km)
  P4: 5590 profiles (within 10 km)
  P15: 5564 profiles (within 10 km)
  P5: 5550 profiles (within 10 km)
  P10: 5541 profiles (within 10 km)
  P6: 5346 profiles (within 10 km)
  P13: 5174 profiles (within 10 km)
  P7: 5163 profiles (within 10 km)
  P25: 4906 profiles (within 10 km)
  P3: 3128 profile