# Join Stations with Distance to Coast

This notebook joins the stations data with dist2coast data first, then joins the result back with the weather data.


In [1]:
# Import required libraries
import pandas as pd
import dask.dataframe as dd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path


In [2]:
# Step 1: Load the stations data (not the joined weather data yet)
print("Loading stations data...")
stations = dd.read_parquet('../weather_info/stations_dask.parquet')
print(f"Stations shape: {stations.shape}")
print(f"Columns: {list(stations.columns)}")
print("Sample of stations data:")
print(stations.head())


Loading stations data...
Stations shape: (Delayed('int-f42481e9-e6fe-44c2-880c-cfc85f7053b3'), 9)
Columns: ['station_id', 'latitude', 'longitude', 'elevation', 'state', 'name', 'gsn_flag', 'hcn_crn_flag', 'wmo_id']
Sample of stations data:
    station_id  latitude  longitude  elevation state                   name  \
0  ACW00011604   17.1167   -61.7833       10.1  None  ST JOHNS COOLIDGE FLD   
1  ACW00011647   17.1333   -61.7833       19.2  None               ST JOHNS   
2  AE000041196   25.3330    55.5170       34.0  None    SHARJAH INTER. AIRP   
3  AEM00041194   25.2550    55.3640       10.4  None             DUBAI INTL   
4  AEM00041217   24.4330    54.6510       26.8  None         ABU DHABI INTL   

  gsn_flag hcn_crn_flag   wmo_id  
0     None         None      NaN  
1     None         None      NaN  
2      GSN         None  41196.0  
3     None         None  41194.0  
4     None         None  41217.0  


In [3]:
# Step 2: Load the dist2coast data
print("Loading dist2coast data...")
dist2coast = dd.read_parquet('../../../weather_data/dist2coast.parquet')
print(f"Dist2coast shape: {dist2coast.shape}")
print("Dist2coast columns:", list(dist2coast.columns))
print("First few rows:")
print(dist2coast.head())


Loading dist2coast data...
Dist2coast shape: (Delayed('int-691548e1-334a-421b-8acf-b02534891340'), 3)
Dist2coast columns: ['lon', 'lat', 'dist']
First few rows:


      lon    lat     dist
0 -179.98  89.98  712.935
1 -179.94  89.98  712.934
2 -179.90  89.98  712.933
3 -179.86  89.98  712.932
4 -179.82  89.98  712.932


In [4]:
# Step 3: Prepare the dist2coast data for joining
print("Preparing dist2coast data for joining...")

# The dist2coast data has columns: lon, lat, dist
# We need to rename them to match the stations data format
dist2coast_renamed = dist2coast.rename(columns={
    'lon': 'longitude',
    'lat': 'latitude', 
    'dist': 'dist_to_coast'
})

print("Renamed dist2coast columns:", list(dist2coast_renamed.columns))
print("Sample of renamed data:")
print(dist2coast_renamed.head())


Preparing dist2coast data for joining...
Renamed dist2coast columns: ['longitude', 'latitude', 'dist_to_coast']
Sample of renamed data:


   longitude  latitude  dist_to_coast
0    -179.98     89.98        712.935
1    -179.94     89.98        712.934
2    -179.90     89.98        712.933
3    -179.86     89.98        712.932
4    -179.82     89.98        712.932


In [5]:
# Step 4: Join stations with dist2coast data
def find_closest_dist_to_coast(stations_df, dist2coast_df):
    """
    Find the closest distance-to-coast point for each weather station.
    This uses a spatial join based on latitude and longitude proximity.
    """
    # Convert to pandas for easier manipulation
    stations_pd = stations_df[['station_id', 'latitude', 'longitude']].compute()
    dist2coast_pd = dist2coast_df.compute()
    
    print(f"Processing {len(stations_pd)} stations...")
    
    # For each station, find the closest dist2coast point
    distances_to_coast = []
    
    # Round latitudes and longitudes in both dataframes to 0.1 resolution
    stations_pd['lat_round'] = stations_pd['latitude'].round(1)
    stations_pd['lon_round'] = stations_pd['longitude'].round(1)
    dist2coast_pd['lat_round'] = dist2coast_pd['latitude'].round(1)
    dist2coast_pd['lon_round'] = dist2coast_pd['longitude'].round(1)

    # Perform a left join on (lat_round, lon_round)
    merged = stations_pd.merge(
        dist2coast_pd[['lat_round', 'lon_round', 'dist_to_coast']],
        how='left',
        on=['lat_round', 'lon_round']
    )

    # If there are multiple matches, drop duplicates and keep first
    merged = merged.drop_duplicates(subset=['station_id'])

    # Some stations may not match; handle missing if you want (otherwise, will be NaN)
    merged = merged.drop(columns=['lat_round', 'lon_round'])
    return merged
# Apply the function to join stations with dist2coast
print("Finding closest distance-to-coast for each station...")
stations_with_dist = find_closest_dist_to_coast(stations, dist2coast_renamed)


Finding closest distance-to-coast for each station...


Processing 129658 stations...


In [6]:

n_nans = stations_with_dist['dist_to_coast'].isna().sum()
print(f"Number of NaNs in 'dist_to_coast' column: {n_nans}")


Number of NaNs in 'dist_to_coast' column: 0


In [7]:
# Step 5: Examine the stations with distance to coast
print("=== STATIONS WITH DISTANCE TO COAST ===")
print(f"Shape: {stations_with_dist.shape}")
print("\nSample of results:")
print(stations_with_dist.head(10))

print("\nDistance to coast statistics:")
print(stations_with_dist['dist_to_coast'].describe())


=== STATIONS WITH DISTANCE TO COAST ===
Shape: (129658, 4)

Sample of results:
     station_id  latitude  longitude  dist_to_coast
0   ACW00011604   17.1167   -61.7833       3.464260
6   ACW00011647   17.1333   -61.7833       3.464260
12  AE000041196   25.3330    55.5170       5.934160
21  AEM00041194   25.2550    55.3640       0.058871
27  AEM00041217   24.4330    54.6510       9.052790
33  AEM00041218   24.2620    55.6090      90.617700
39  AF000040930   35.3170    69.0170    1104.580000
45  AFM00040938   34.2100    62.2280     937.753000
49  AFM00040948   34.5660    69.2120    1031.730000
53  AFM00040990   31.5000    65.8500     658.224000

Distance to coast statistics:
count    129658.000000
mean        442.659708
std         431.940996
min           0.000153
25%          69.372525
50%         285.004000
75%         755.495000
max        2488.100000
Name: dist_to_coast, dtype: float64


In [11]:
!ls ../../../weather_data/joined_stations_weather.parquet

ls: cannot access '../../../weather_data/joined_stations_weather.parquet': No such file or directory


In [10]:
# Step 6: Now load the joined stations weather data
print("Loading joined stations weather data...")
stations_weather = dd.read_parquet('../../../weather_data/joined_stations_weather.parquet')
print(f"Stations weather shape: {stations_weather.shape}")
print(f"Columns: {list(stations_weather.columns)[:10]}...")  # Show first 10 columns

import psutil
print(f"Memory usage: {psutil.Process().memory_info().rss / 1024**2:.1f} MB")

Loading joined stations weather data...


FileNotFoundError: An error occurred while calling the read_parquet method registered to the pandas backend.
Original Message: /home/yfreund/dask-CSE255/weather/dist_2_coast/../../../weather_data/joined_stations_weather.parquet

In [None]:

# Additional analysis of stations_weather (optional)
print("\nPartitioning stations_weather by 'measurement_type' and 'year' and reporting partition sizes:")

# If Dask DataFrame, convert only necessary columns to pandas for groupby, else compute groupby using Dask.
measurement_type_col = 'measurement_type'
year_col = 'year'

# We'll use Dask for efficient groupby and counting
partition_counts = stations_weather.groupby([measurement_type_col, year_col]).size().compute()
partition_counts = partition_counts.reset_index().rename(columns={0: 'count'})

print("Number of rows per (measurement_type, year) partition:")
for _, row in partition_counts.iterrows():
    print(f"  {row[measurement_type_col]!r}, {row[year_col]!r}: {row['count']} rows")

Loading joined stations weather data...


FileNotFoundError: An error occurred while calling the read_parquet method registered to the pandas backend.
Original Message: /home/yfreund/dask-CSE255/weather/dist_2_coast/../../../weather_data/joined_stations_weather.parquet

In [None]:
# Step 7: Join the stations_with_dist back with the weather data
print("Joining stations with distance-to-coast data with weather data...")

# Convert stations_weather to pandas for the join
# stations_weather_pd = stations_weather.compute()

# Merge the distance-to-coast data
# Note: stations_weather uses 'ID' column, stations uses 'station_id'
stations_weather_with_dist = stations_weather.merge(
    stations_with_dist[['station_id', 'dist_to_coast']], 
    left_on='ID', 
    right_on='station_id', 
    how='left'
)


Joining stations with distance-to-coast data with weather data...


In [None]:
# Step 8: Save the enhanced dataset
print("Saving the enhanced dataset...")

# Save as parquet
output_file = '../../../weather_data/stations_weather_with_dist2coast.parquet'
import shutil
import os

# Remove the output_file parquet directory if it exists before saving new data
if os.path.exists(output_file):
    shutil.rmtree(output_file)

# Save as a Parquet directory (multiple files) using pyarrow
stations_weather_with_dist.to_parquet(output_file, write_index=False, engine='pyarrow')
print(f"Saved to: {output_file}")


print("Done!")


Saving the enhanced dataset...
Saved to: ../../../weather_data/stations_weather_with_dist2coast.parquet
Done!


In [None]:
stations_weather_with_dist.columns


Index(['station_id_x', 'latitude', 'longitude', 'elevation', 'state', 'name',
       'gsn_flag', 'hcn_crn_flag', 'wmo_id', 'ID',
       ...
       'day_358', 'day_359', 'day_360', 'day_361', 'day_362', 'day_363',
       'day_364', 'day_365', 'station_id_y', 'dist_to_coast'],
      dtype='object', length=379)