In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import json
import sys

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, mean_squared_error, r2_score

from sklearn.model_selection import KFold, ShuffleSplit, RepeatedKFold, train_test_split, ParameterGrid
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import ElasticNetCV, ElasticNet

from joblib import dump, load

from permetrics.regression import RegressionMetric
import os



In [2]:
proj_dir = Path('../..')

In [3]:
geopackage_fn = proj_dir / 'Data/GIS/dams-temp-fish.gpkg'
thorr_packge_dir = proj_dir / "../01-THORR/src/THORR"
# landsat_data_fn = proj_dir / 'Data/landsat/columbia_clean.csv'
landsat_data_fn = proj_dir / 'Data/landsat/crb_landsat.csv'
insitu_data_dir = proj_dir / 'Data/insitu/conditions/processed'
all_data_fn = proj_dir / 'Methods/02-ML_development/input_data/ml_input_data_all.csv'

model_fn = proj_dir / 'Methods/02-ML_development/var1/RFR1.joblib'

In [4]:
utils = os.path.abspath(thorr_packge_dir/ 'utils')
sys.path.insert(0, utils)

from sql import connect # utility functions for connecting to MySQL

In [5]:
# Create a connection object to the MySQL database
# conn = connect.Connect(str(proj_dir / "Methods/2.Data/DBManagement/mysql_config.ini"))
conn = connect.Connect(str(proj_dir / ".env/mysql_config.ini"))
connection = conn.conn
cursor = conn.conn.cursor()

Connecting to MySQL database...
MySQL connection established.


In [6]:
lsat_data = pd.read_csv(landsat_data_fn, parse_dates=['Date'])
all_data_ = pd.read_csv(all_data_fn, parse_dates=['Date'])

  all_data_ = pd.read_csv(all_data_fn, parse_dates=['Date'])


In [7]:

temp_locs_in_buffer = gpd.read_file(geopackage_fn, layer='TempLocationsInBuffer')

In [8]:
combined_insitu = pd.DataFrame()

for temp_loc in temp_locs_in_buffer.itertuples():
    insitu_fn = insitu_data_dir / f"{temp_loc.station_ID}.csv"
    if insitu_fn.exists():
        insitu_data = pd.read_csv(insitu_fn, parse_dates=['date'])
        # print(insitu_data.columns)
        # drop utc in date
        insitu_data['date'] = insitu_data['date'].dt.tz_localize(None)

        # keep date and average temperature
        # insitu_data = insitu_data[['date', 'avg_temp(C)', ]]
        # replace -999999 with nan
        # insitu_data.replace(-999999, np.nan, inplace=True)
        insitu_data.dropna(inplace=True)
        insitu_data['station_ID'] = temp_loc.station_ID
        insitu_data['RKm'] = temp_loc.RKm
        insitu_data['Name'] = temp_loc.reach_id
        insitu_data['GNIS_Name'] = temp_loc.GNIS_Name
        combined_insitu = pd.concat([combined_insitu, insitu_data])
        # break

# combined_insitu = pd.concat(combined_insitu)

combined_insitu.rename(columns={'date': 'Date'}, inplace=True)
# combined_insitu = combined_insitu.merge(lsat_data, on='Date', how='left')
# Data/insitu/conditions/processed
combined_insitu

Unnamed: 0,Date,outflow(m3/d),inflow(m3/d),spill(m3/d),avg_temp(C),wse(m),station_ID,RKm,Name,GNIS_Name,wse_tail(m),max_temp(C),min_temp(C),avg discharge (cfs)
1945,2004-04-29,4.932288e+07,6.923797e+07,0.0,11.89,626.245128,DART_ALF,140,Pend_Oreille_River_15,Pend Oreille River,,,,
1946,2004-04-30,4.991006e+07,6.654675e+07,0.0,11.84,626.318280,DART_ALF,140,Pend_Oreille_River_15,Pend Oreille River,,,,
1947,2004-05-01,4.937181e+07,5.945169e+07,0.0,12.29,626.354856,DART_ALF,140,Pend_Oreille_River_15,Pend Oreille River,,,,
1948,2004-05-02,4.856444e+07,6.067497e+07,0.0,12.26,626.364000,DART_ALF,140,Pend_Oreille_River_15,Pend Oreille River,,,,
1949,2004-05-03,5.531698e+07,8.563000e+07,0.0,12.92,626.382288,DART_ALF,140,Pend_Oreille_River_15,Pend Oreille River,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
484,1998-09-11,,,,20.80,,USGS_460923123235800,60,Columbia_River_7,Columbia River,,20.9,20.7,
485,1998-09-12,,,,20.80,,USGS_460923123235800,60,Columbia_River_7,Columbia River,,20.9,20.6,
486,1998-09-13,,,,20.60,,USGS_460923123235800,60,Columbia_River_7,Columbia River,,20.7,20.4,
487,1998-09-14,,,,20.50,,USGS_460923123235800,60,Columbia_River_7,Columbia River,,20.6,20.4,


In [10]:
insitu_lsat = pd.merge(
    lsat_data, combined_insitu, on=["Date", "Name", "RKm"], how="outer"
)
nan_fill = -9
insitu_lsat.fillna(nan_fill, inplace=True)
insitu_lsat_clean = insitu_lsat[
    # (insitu_lsat['avg_temp(C)'] >=0) &
    (insitu_lsat["WaterTempC"] >= -40)
    & (insitu_lsat["WaterTempC"] <= 80)
    # & (insitu_lsat["avg_temp(C)"] <= 40)
].copy()
insitu_lsat_clean[["WidthMin", "WidthMean", "WidthMax"]] = insitu_lsat_clean[
    ["WidthMin", "WidthMean", "WidthMax"]
].fillna(15)
insitu_lsat_clean.replace(-9, np.nan, inplace=True)
insitu_lsat_clean["DOY"] = insitu_lsat_clean["Date"].dt.dayofyear

# insitu_lsat_clean.to_csv('ml_inputs_data.csv', index=False)

In [11]:
col_data = insitu_lsat_clean[insitu_lsat_clean["Name"].str.startswith("Columbia")].dropna(subset=['LandTempC', 'NDVI']).copy()

In [12]:
rfr = load(model_fn)

In [13]:
features = [
    "NDVI",
    "LandTempC",
    "ClimateClass",
    "DOY",
    # "WidthMin",
    "WidthMean",
    # "WidthMax",
    # "WaterTempC",
]

In [15]:
insitu_lsat_clean.replace(nan_fill, np.nan, inplace=True)
insitu_lsat_clean.dropna(subset=['LandTempC', 'NDVI'], inplace=True)
insitu_lsat_clean['est_temp'] = rfr.predict(insitu_lsat_clean[features])

In [16]:
insitu_lsat_clean.to_csv('reconstructed_data.csv', index=False)

In [15]:
insitu_lsat_clean.columns

Index(['ReachID', 'Date', 'LandTempC', 'WaterTempC', 'NDVI', 'Mission', 'Name',
       'ClimateClass', 'RKm', 'WidthMin', 'WidthMean', 'WidthMax',
       'outflow(m3/d)', 'inflow(m3/d)', 'spill(m3/d)', 'avg_temp(C)', 'wse(m)',
       'station_ID', 'GNIS_Name', 'wse_tail(m)', 'max_temp(C)', 'min_temp(C)',
       'avg discharge (cfs)', 'DOY', 'est_temp'],
      dtype='object')

In [19]:
insitu_lsat_clean[1000000:]

Unnamed: 0,ReachID,Date,LandTempC,WaterTempC,NDVI,Mission,Name,ClimateClass,RKm,WidthMin,...,avg_temp(C),wse(m),station_ID,GNIS_Name,wse_tail(m),max_temp(C),min_temp(C),avg discharge (cfs),DOY,est_temp
1017619,834.0,2014-10-13,20.02050,15.73320,0.127400,L7,Snake_River_48,7.0,470,30.0,...,,,,,,,,,286,14.571000
1017620,834.0,1999-07-25,34.54200,24.15670,0.174452,L7,Snake_River_48,7.0,470,30.0,...,,,,,,,,,206,22.565000
1017621,834.0,1999-07-16,34.94400,23.71910,0.177419,L7,Snake_River_48,7.0,470,30.0,...,,,,,,,,,197,21.791000
1017622,834.0,1999-07-09,35.43330,24.20780,0.201182,L7,Snake_River_48,7.0,470,30.0,...,,,,,,,,,190,20.213000
1017623,834.0,1999-06-30,36.55480,23.21670,0.216862,L7,Snake_River_48,7.0,470,30.0,...,,,,,,,,,181,19.823000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1290949,1060.0,2022-08-28,18.02820,,0.912089,L9,Youngs_River_5,9.0,40,,...,,,,,,,,,240,18.606017
1290950,1060.0,2022-09-06,25.39620,,0.919168,L9,Youngs_River_5,9.0,40,,...,,,,,,,,,249,21.054806
1290951,1060.0,2022-10-08,22.69580,,0.901385,L9,Youngs_River_5,9.0,40,,...,,,,,,,,,281,18.446828
1290952,1060.0,2022-10-15,23.26610,,0.896315,L9,Youngs_River_5,9.0,40,,...,,,,,,,,,288,17.254110


In [20]:
# Insert basin data into the table if the entry doesn't already exist
for i, row in insitu_lsat_clean.iterrows():
    if i % 10000 == 0:
        print(f"Processing row {i} of {len(insitu_lsat_clean)}")
    
    query = f"""
    UPDATE `ReachData` 
    SET 
        `EstTempC` = {round(row['est_temp'], 2)}
    WHERE
        (`ReachID` = (SELECT 
        ReachID
    FROM
        Reaches
    WHERE
        Name = '{row['Name']}'))
            AND (`Date` = '{row['Date']}');
    """

    cursor.execute(query)
    conn.conn.commit()

    # # Update the MajorRiverID column if the river exists in the Rivers table
    # query2 = f"""
    # UPDATE Basins
    # SET MajorRiverID = (SELECT RiverID FROM Rivers WHERE Name = '{row['MajorRiver']}')
    # WHERE Name = '{row['Name']}'
    # """

    # cursor.execute(query2)
    # conn.conn.commit()


Processing row 0 of 1270541
Processing row 10000 of 1270541
Processing row 20000 of 1270541
Processing row 30000 of 1270541
Processing row 40000 of 1270541
Processing row 50000 of 1270541
Processing row 60000 of 1270541
Processing row 70000 of 1270541
Processing row 80000 of 1270541
Processing row 90000 of 1270541
Processing row 100000 of 1270541
Processing row 110000 of 1270541
Processing row 120000 of 1270541
Processing row 130000 of 1270541
Processing row 140000 of 1270541
Processing row 150000 of 1270541
Processing row 160000 of 1270541
Processing row 170000 of 1270541
Processing row 180000 of 1270541
Processing row 190000 of 1270541
Processing row 200000 of 1270541
Processing row 210000 of 1270541
Processing row 220000 of 1270541
Processing row 230000 of 1270541
Processing row 240000 of 1270541
Processing row 250000 of 1270541
Processing row 260000 of 1270541
Processing row 270000 of 1270541
Processing row 280000 of 1270541
Processing row 290000 of 1270541
Processing row 300000 of