In [2]:
import geopandas as gpd
from shapely.geometry import Point
import pandas as pd
import numpy as np

In [3]:
def parse_dates(date_series):
    for fmt in ('%Y-%m-%d', '%d-%m-%Y', '%d/%m/%Y', '%Y/%m/%d'):
        try:
            return pd.to_datetime(date_series, format=fmt)
        except ValueError:
            continue
    # fallback: let pandas infer
    return pd.to_datetime(date_series, dayfirst=True)

In [4]:
def df_to_gdf(df):
    df["geometry"] = [Point(xy) for xy in zip(df["Longitude"], df["Latitude"])]
    return gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")

In [5]:
def landsat_map_station(stations_df, landsat_df):
    stations_gdf = df_to_gdf(stations_df)
    landsat_gdf = df_to_gdf(landsat_df)
    
    stations_m = stations_gdf.to_crs(epsg=3857)
    landsat_m  = landsat_gdf.to_crs(epsg=3857)
    
    landsat_to_station = gpd.sjoin_nearest(
        landsat_m, # LEFT: each landsat point
        stations_m,  # RIGHT: station candidates
        how="left",
        distance_col="dist_m"
    )
    
    landsat_to_station["dist_km"] = landsat_to_station["dist_m"] / 1000
    landsat_to_station = landsat_to_station.rename(columns= {'Latitude_left': 'Latitude', 'Longitude_left': 'Longitude', 'Latitude_right': 'Latitude_stations', 'Longitude_right': 'Longitude_stations'})
    
    landsat_to_station['Sample Date'] = parse_dates(landsat_to_station['Sample Date'])
    
    return landsat_to_station

# landsat_to_station.head()

In [6]:
# landsat_to_station.to_csv('data/landsat_to_station.csv', index=False)

## Adding hydrochem properties to the station data
This is the dataset with the date and the STAT_ID

In [7]:
def merge_all(glorich_df, landsat_to_station, name):

    glorich_with_date = glorich_df.dropna(subset=['date'])
    landsat_sorted  = landsat_to_station.sort_values('Sample Date').reset_index(drop=True)
    glorich_sorted  = glorich_with_date.sort_values('date').reset_index(drop=True)
    
    # Nearest-date join, matched within the same station ───────────────────
    #   • direction='nearest'  → picks the glorich row whose date is closest
    #     (before OR after) to the landsat observation date
    #   • by='STAT_ID'         → only considers glorich rows for the same station
    #   • how='left' is implicit in merge_asof; stations with no glorich data
    #     will get NaN for all glorich columns (your case 3) ───────────────────
    
    merged = pd.merge_asof(
        landsat_sorted,
        glorich_sorted,
        left_on='Sample Date',
        right_on='date',
        by='STAT_ID',
        direction='nearest',
        suffixes=('_landsat', '_glorich')
    )
    merged['date_diff_days'] = (
        merged['Sample Date'] - merged['date']
    ).dt.days.abs()
    merged.drop(columns=['index_right', 'Latitude_stations', 'Longitude_stations'], inplace=True)
    merged = merged.rename(columns={'Latitude_landsat': 'Latitude', 'Longitude_landsat': 'Longitude'})
    
    merged = (
        merged
        .sort_values('SpecCond25C_reliability', ascending=False)
        .drop_duplicates(subset=['Latitude', 'Longitude', 'Sample Date'], keep='first')
    )
    landsat_ids = set(landsat_sorted['STAT_ID'].unique())
    glorich_ids = set(glorich_with_date['STAT_ID'].unique())
    
    print("Landsat STAT_IDs:  ", len(landsat_ids))
    print("Glorich STAT_IDs:  ", len(glorich_ids))
    print("Overlapping IDs:   ", len(landsat_ids & glorich_ids))
    print("Merged dataset length:", len(merged))
    
    merged.to_csv(f'data/{name}_ALL.csv', index=False)

    return merged

## Main coding part

In [8]:
# name = specify training, validation, testing
name = 'testing'
stations_df = pd.read_csv('data/stations_with_conditions.csv')
glorich_df = pd.read_csv('data/final_imputed_hydrochem.csv')
glorich_df['date'] = pd.to_datetime(glorich_df['date'], format='%Y-%m-%d')

# just change this to validation / testing set
landsat_df = pd.read_csv('data/Submission+landsat.csv')

In [9]:
stations_df.head()

Unnamed: 0,STAT_ID,Latitude,Longitude,geometry,sc,ss,su,mt,va,vb,...,pi,GLC_Artificial,GLC_Managed,GLC_Water,GLC_Aquatic_Veg,GLC_PERC_COV,Popdens_00,Soil_pH,SOC,Soil_wetness
0,400001,-32.31,18.33,POINT (18.33 -32.31),0.0,0.38,0.23,0.0,0.0,0.0,...,0.0,0.0,0.25,0.005508,0.0,100.0,4.75,6.7,3.08,34.46
1,400002,-32.31,18.34,POINT (18.34 -32.31),0.0,0.38,0.23,0.0,0.0,0.0,...,0.0,0.0,0.25,0.005508,0.0,100.0,4.75,6.7,3.08,34.46
2,400003,-32.31,18.34,POINT (18.34 -32.31),0.0,0.38,0.23,0.0,0.0,0.0,...,0.0,0.0,0.25,0.005511,0.0,100.0,4.7,6.7,3.08,34.46
3,400004,-32.31,18.35,POINT (18.35 -32.31),0.0,0.38,0.23,0.0,0.0,0.0,...,0.0,0.0,0.25,0.005511,0.0,100.0,4.7,6.7,3.08,34.46
4,400005,-32.34,18.42,POINT (18.42 -32.34),0.0,0.38,0.22,0.0,0.0,0.0,...,0.0,0.0,0.25,0.00283,0.0,100.0,4.46,6.7,3.12,34.46


In [10]:
landsat_to_station = landsat_map_station(stations_df, landsat_df)

In [11]:
merged = merge_all(glorich_df, landsat_to_station, name=name)

Landsat STAT_IDs:   24
Glorich STAT_IDs:   876
Overlapping IDs:    21
Merged dataset length: 200


In [12]:
merged[merged['Cl'].isna()]

Unnamed: 0,Latitude,Longitude,Sample Date,Total Alkalinity,Electrical Conductance,Dissolved Reactive Phosphorus,pet,_merge_terra,nir,green,...,SO4,SpecCond25C,pH,Alkalinity_reliability,Cl_reliability,DIP_reliability,SO4_reliability,SpecCond25C_reliability,pH_reliability,date_diff_days
24,-33.18536,27.39075,2011-09-22,,,,161.40001,both,16004.0,10236.0,...,,35.058824,7.878235,,,1.0,,1.0,1.0,7.0
40,-33.18536,27.39075,2012-05-31,,,,136.40001,both,13943.0,9666.0,...,,35.058824,7.878235,,,1.0,,1.0,1.0,1.0
45,-33.18536,27.39075,2012-06-28,,,,136.40001,both,13564.0,8843.0,...,,35.058824,7.878235,,,1.0,,1.0,1.0,3.0
58,-33.18536,27.39075,2012-11-15,,,,153.40001,both,17771.0,9804.0,...,,35.058824,7.878235,,,1.0,,1.0,1.0,0.0
65,-33.18536,27.39075,2012-12-04,,,,153.40001,both,17771.0,9804.0,...,,35.058824,7.878235,,,1.0,,1.0,1.0,3.0
51,-33.18536,27.39075,2012-09-20,,,,153.40001,both,16193.0,9405.0,...,,35.058824,7.878235,,,1.0,,1.0,1.0,5.0
95,-33.18536,27.39075,2013-10-21,,,,165.2,both,18905.5,9320.0,...,,35.058824,7.878235,,,1.0,,1.0,1.0,6.0
212,-33.18536,27.39075,2015-11-18,,,,161.90001,both,20018.0,9304.0,...,,35.058824,7.878235,,,1.0,,1.0,1.0,3.0
207,-33.18536,27.39075,2015-10-29,,,,161.90001,both,20018.0,9304.0,...,,35.058824,7.878235,,,1.0,,1.0,1.0,3.0
202,-33.18536,27.39075,2015-09-28,,,,161.90001,both,15967.0,9733.0,...,,35.058824,7.878235,,,1.0,,1.0,1.0,3.0


In [13]:
test_df = pd.read_csv('data/testing_ALL.csv')

In [14]:
test_df.shape

(200, 55)