In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import json
import sys

from thorr.utils import read_config, Logger, validate_start_end_dates
from thorr.database import Connect as db_connect


# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import root_mean_squared_error, mean_squared_error, r2_score

# from sklearn.model_selection import KFold, ShuffleSplit, RepeatedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from joblib import dump, load

In [None]:
# Location of the THORR configuration (.ini) file handed to read_config and db_connect.
# NOTE(review): absolute, machine-specific path — must be edited before the notebook can run elsewhere.
config_file = Path("/Users/gdarkwah/Desktop/CRITFC_20251107/.env/v1.0.0_config.ini")
# Folder with the processed in-situ CSVs (looked up per station as "<station_ID>.csv");
# relative, so it resolves against the current working directory.
insitu_data_dir = Path("insitu_data/processed")

In [None]:
# Load the settings stored in the configuration file (read_config comes from thorr.utils).
config_dict = read_config(config_file)
# The project directory from the config is currently not used (line kept for reference);
# project-relative paths resolve against the current working directory instead.
# project_dir = Path(config_dict["project"]["project_dir"])
project_dir = Path("./")

In [None]:
# Connect to the project database described by the configuration file.
db = db_connect(config_file)

# Handles reused by the query cells: the open connection, the schema name that is
# interpolated into the SQL text, and a cursor on that connection.
connection = db.connection
schema = db.schema
cursor = connection.cursor()


In [None]:
# Fetch the region (basin) polygons from the database.
# Geometries are returned as WKB together with their SRID so the CRS can be restored.
basin_query = f"""
SELECT 
    "RegionID" as region_id,
    "Name",
    ST_AsBinary("geometry") AS geometry,
    ST_SRID("geometry") AS srid
FROM {schema}."Regions";
"""

cursor.execute(basin_query)
basin_columns = [description[0] for description in cursor.description]
basin_gdf = gpd.GeoDataFrame(cursor.fetchall(), columns=basin_columns)

# Decode the WKB payload into geometry objects and register it as the active geometry column.
basin_gdf["geometry"] = gpd.GeoSeries.from_wkb(basin_gdf["geometry"])
basin_gdf = gpd.GeoDataFrame(basin_gdf, geometry="geometry")

# The SRID of the first row is applied as the CRS of the whole layer.
basin_srid = basin_gdf["srid"].iloc[0]
basin_gdf = basin_gdf.set_crs(epsg=basin_srid)

In [None]:
# Fetch the reaches with their buffered geometry (as WKB) and SRID, ordered by ReachID.
query = f"""
SELECT
    "ReachID" AS reach_id,
    "Name" AS reach_name,
    "RiverID" AS river_id,
    "RKm",
    ST_AsBinary("buffered_geometry") AS geometry,
    ST_SRID("buffered_geometry") AS srid
FROM
    {schema}."Reaches"
ORDER By
    "ReachID";
"""

cursor.execute(query)
reach_columns = [description[0] for description in cursor.description]
reaches_gdf = pd.DataFrame(cursor.fetchall(), columns=reach_columns)

# Decode the WKB payload, then promote the table to a GeoDataFrame on that column.
reaches_gdf["geometry"] = gpd.GeoSeries.from_wkb(reaches_gdf["geometry"])
reaches_gdf = gpd.GeoDataFrame(reaches_gdf, geometry="geometry")

# The SRID of the first row is applied as the CRS of the whole layer.
reach_srid = reaches_gdf["srid"].iloc[0]
reaches_gdf = reaches_gdf.set_crs(epsg=reach_srid)

In [None]:

# Station metadata table -> point layer built from the longitude/latitude columns (EPSG:4326).
stations_metadata_path = project_dir / "insitu_data/metadata/stations.csv"
stations_df = pd.read_csv(stations_metadata_path)

station_points = gpd.points_from_xy(stations_df["longitude"], stations_df["latitude"])
stations_gdf = gpd.GeoDataFrame(stations_df, geometry=station_points, crs="EPSG:4326")

# Station layer read from the GeoPackage; `stations` (not `stations_gdf`) is what the
# plotting and spatial-join cells use.
stations_fn = "stations_gis/stations.gpkg"
stations = gpd.read_file(stations_fn, layer='Stations')




In [None]:
# Overview map: basin polygons, buffered reaches and every station, drawn in that order.
fig, ax = plt.subplots()
map_layers = (
    (basin_gdf, {"color": "lightgray"}),
    (reaches_gdf, {"color": "blue"}),
    (stations, {"color": "black", "markersize": 0.5}),
)
for layer, style in map_layers:
    layer.plot(ax=ax, **style)

In [None]:
# Keep only the stations that fall inside the union of all buffered reach polygons.
reach_footprint = reaches_gdf.unary_union
stations_in_reaches = stations[stations.within(reach_footprint)].copy()

# Remove index columns left behind by an earlier spatial join (e.g. when the cell is
# re-run) so that sjoin can create them again.
for frame in (reaches_gdf, stations_in_reaches):
    leftover_cols = [col for col in frame.columns if col in ('index_right', 'index_left')]
    frame.drop(columns=leftover_cols, inplace=True)

# Attach the attributes of the containing reach to each retained station.
stations_in_reaches = stations_in_reaches.sjoin(reaches_gdf, how='left', predicate='within')

In [None]:
# Same map, but showing only the stations that lie within the buffered reaches (red).
fig, ax = plt.subplots()
map_layers = (
    (basin_gdf, {"color": "lightgray"}),
    (reaches_gdf, {"color": "blue"}),
    (stations_in_reaches, {"color": "red", "markersize": 0.5}),
)
for layer, style in map_layers:
    layer.plot(ax=ax, **style)

In [None]:
# ## Retrieve the reach data necessary for training the model
# query1 = f"""
# SELECT
#     "ReachID",
#     "RKm",
#     "Date",
#     "LandTempC",
#     "WaterTempC",
#     "NDVI",
#     "Mission",
#     "WidthMin",
#     "WidthMean",
#     "WidthMax",
#     "Name",
#     "ClimateClass",
#     "EstTempC"
# FROM
#     {schema}."ReachData"
#     LEFT JOIN {schema}."Reaches" USING ("ReachID")
# WHERE
#     "LandTempC" IS NOT NULL
#     AND "NDVI" IS NOT NULL;
# """

# with connection.cursor() as cursor:
#     cursor.execute(query1)
#     lsat_data = pd.DataFrame(
#         cursor.fetchall(), columns=[desc[0] for desc in cursor.description]
#     )
#     lsat_data["Date"] = pd.to_datetime(lsat_data["Date"])

# Load the reach data from the project CSV (the database query above is disabled)
# and convert the Date column to datetime.
lsat_csv_path = project_dir / "thorr_data/lsat_data.csv"
lsat_data = pd.read_csv(lsat_csv_path).assign(
    Date=lambda frame: pd.to_datetime(frame["Date"])
)

In [None]:
# Add the day-of-year of each observation and replace missing mean widths with 15.
lsat_data = lsat_data.assign(
    DOY=lsat_data["Date"].dt.dayofyear,
    WidthMean=lsat_data["WidthMean"].fillna(15),
)

In [None]:
# Feature columns selected for modelling.
features = ["NDVI", "LandTempC", "ClimateClass", "DOY", "WidthMean"]


In [None]:
# Assemble the in-situ records of every station located inside a reach buffer.
# Each station's frame is collected in a list and concatenated once at the end,
# which avoids re-copying the accumulated data on every iteration.
insitu_frames = []

for station_loc in stations_in_reaches.itertuples():
    insitu_fn = insitu_data_dir / f"{station_loc.station_ID}.csv"
    if not insitu_fn.exists():
        # no processed file for this station; skip it
        continue

    insitu_data = pd.read_csv(insitu_fn, parse_dates=['date'])
    # drop the timezone (UTC) from the dates
    insitu_data['date'] = insitu_data['date'].dt.tz_localize(None)

    # discard rows with any missing value, then tag the rest with the station ID
    # and the river kilometre / name of the reach the station was joined to
    insitu_frames.append(
        insitu_data.dropna().assign(
            station_ID=station_loc.station_ID,
            RKm=station_loc.RKm,
            Name=station_loc.reach_name,
        )
    )

# pd.concat raises on an empty list, so fall back to an empty DataFrame when no
# station file was found.
combined_insitu = pd.concat(insitu_frames) if insitu_frames else pd.DataFrame()
# 'Date' matches the column name used by lsat_data, which the merge below keys on
combined_insitu = combined_insitu.rename(columns={'date': 'Date'})
# combined_insitu = combined_insitu.merge(lsat_data, on='Date', how='left')
# Data/insitu/conditions/processed
# combined_insitu

In [None]:
# Pair in-situ records with the reach data on date, reach name and river kilometre.
merge_keys = ['Date', 'Name', 'RKm']
width_cols = ['WidthMin', 'WidthMean', 'WidthMax']

# inner: only rows present in both sources; outer: every row from either source
insitu_lsat = combined_insitu.merge(lsat_data, on=merge_keys, how='inner')
all_data = lsat_data.merge(combined_insitu, on=merge_keys, how='outer')

# Keep matched rows whose temperatures are inside the accepted ranges (bounds inclusive):
# in-situ average within [0, 40] and WaterTempC within [-40, 80].
insitu_in_range = insitu_lsat['avg_temp(C)'].between(0, 40)
watertemp_in_range = insitu_lsat['WaterTempC'].between(-40, 80)
insitu_lsat_clean = insitu_lsat[insitu_in_range & watertemp_in_range].copy()

# Missing widths default to 15 in both frames.
insitu_lsat_clean[width_cols] = insitu_lsat_clean[width_cols].fillna(15)
all_data[width_cols] = all_data[width_cols].fillna(15)

# (Re)compute the day-of-year from the merged dates.
insitu_lsat_clean["DOY"] = insitu_lsat_clean["Date"].dt.dayofyear
all_data["DOY"] = all_data["Date"].dt.dayofyear

# insitu_lsat_clean.to_csv('ml_inputs_data.csv', index=False)

In [None]:
###
# NOTE(review): this rebinds `insitu_lsat_clean` to a copy of `all_data` (the outer
# merge), so the range-filtered frame built in the previous cell is discarded and every
# later use of `insitu_lsat_clean` (including the scaler fitting) sees the unfiltered
# rows, among them rows without a matching in-situ record. The `###` fences suggest a
# temporary override — confirm it is intended.
insitu_lsat_clean = all_data.copy()
# Write the frame to ml_inputs_data.csv in the current working directory, without the index.
insitu_lsat_clean.to_csv('ml_inputs_data.csv', index=False)
###

In [None]:
# Fit one scaler per model variable and persist each of them with joblib.

# DOY and ClimateClass are fitted on fixed value ranges instead of on the data.
# Series.dt.dayofyear yields 1..366 (366 occurs in leap years), so the DOY scaler
# must span that whole range; fitting on range(1, 365) stops at 364 and scales
# the last days of the year above 1.
# NOTE(review): a model trained with a DOY scaler fitted on the shorter range must be
# retrained (or keep using its own saved scaler) after this file is regenerated.
doy_scaler = MinMaxScaler(feature_range=(0, 1)).fit(pd.DataFrame({'DOY': range(1, 366 + 1)}))
climate_scaler = MinMaxScaler(feature_range=(0, 1)).fit(pd.DataFrame({'ClimateClass': range(1, 30 + 1)}))

# Temperatures and NDVI are standardised using the statistics of the data.
watertemp_scaler = StandardScaler().fit(insitu_lsat_clean[['WaterTempC']])
landtemp_scaler = StandardScaler().fit(insitu_lsat_clean[['LandTempC']])
NDVI_scaler = StandardScaler().fit(insitu_lsat_clean[['NDVI']])
avg_temp_scaler = StandardScaler().fit(insitu_lsat_clean[['avg_temp(C)']])

# Widths are rescaled to [0, 1] using the range observed in the data.
width_max_scaler = MinMaxScaler(feature_range=(0, 1)).fit(insitu_lsat_clean[['WidthMax']])
width_mean_scaler = MinMaxScaler(feature_range=(0, 1)).fit(insitu_lsat_clean[['WidthMean']])
width_min_scaler = MinMaxScaler(feature_range=(0, 1)).fit(insitu_lsat_clean[['WidthMin']])

# define a directory to save the scalers
scalers_dir = Path('scalers')
scalers_dir.mkdir(parents=True, exist_ok=True)

# save the scalers as scalers/<name>.joblib
fitted_scalers = {
    'doy_scaler': doy_scaler,
    'watertemp_scaler': watertemp_scaler,
    'landtemp_scaler': landtemp_scaler,
    'width_max_scaler': width_max_scaler,
    'width_mean_scaler': width_mean_scaler,
    'width_min_scaler': width_min_scaler,
    'NDVI_scaler': NDVI_scaler,
    'climate_scaler': climate_scaler,
    'avg_temp_scaler': avg_temp_scaler,
}
for scaler_name, fitted_scaler in fitted_scalers.items():
    dump(fitted_scaler, scalers_dir / f"{scaler_name}.joblib")

# # load the scalers
# doy_scaler = load('scalers/doy_scaler.joblib')
# watertemp_scaler = load('scalers/watertemp_scaler.joblib')
# landtemp_scaler = load('scalers/landtemp_scaler.joblib')
# width_max_scaler = load('scalers/width_max_scaler.joblib')
# width_mean_scaler = load('scalers/width_mean_scaler.joblib')
# width_min_scaler = load('scalers/width_min_scaler.joblib')
# NDVI_scaler = load('scalers/NDVI_scaler.joblib')
# climate_scaler = load('scalers/climate_scaler.joblib')
# avg_temp_scaler = load('scalers/avg_temp_scaler.joblib')


# # scale the data
# insitu_lsat_clean['DOY_scaled'] = doy_scaler.transform(insitu_lsat_clean[['DOY']])
# insitu_lsat_clean['WaterTempC_scaled'] = watertemp_scaler.transform(insitu_lsat_clean[['WaterTempC']])
# insitu_lsat_clean['LandTempC_scaled'] = landtemp_scaler.transform(insitu_lsat_clean[['LandTempC']])
# insitu_lsat_clean['WidthMax_scaled'] = width_max_scaler.transform(insitu_lsat_clean[['WidthMax']])
# insitu_lsat_clean['WidthMean_scaled'] = width_mean_scaler.transform(insitu_lsat_clean[['WidthMean']])
# insitu_lsat_clean['WidthMin_scaled'] = width_min_scaler.transform(insitu_lsat_clean[['WidthMin']])
# insitu_lsat_clean['NDVI_scaled'] = NDVI_scaler.transform(insitu_lsat_clean[['NDVI']])
# insitu_lsat_clean['ClimateClass_scaled'] = climate_scaler.transform(insitu_lsat_clean[['ClimateClass']])
# insitu_lsat_clean['avg_temp(C)_scaled'] = avg_temp_scaler.transform(insitu_lsat_clean[['avg_temp(C)']])

# # insitu_lsat_clean.to_csv('ml_input_data.csv', index=False)