In [1]:
import sys
import geemap
import ee
from pathlib import Path
import pandas as pd
import geopandas as gpd
import numpy as np
import os
from pathlib import Path
import time
from random import randint
import json


from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import KFold, ShuffleSplit, RepeatedKFold, train_test_split, GridSearchCV
from sklearn.linear_model import ElasticNetCV, ElasticNet
from sklearn.ensemble import RandomForestRegressor

import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt

# import tensorflow as tf
import HydroErr as he
import pickle



In [2]:
# Notebook-level configuration: project root (relative to this notebook's
# working directory) and the identifier of the variation-1 model.
proj_dir = Path("..", "..", "..")
model_name = "RFR1"

In [3]:
# Put the project's `utils` directory at the front of the import path so the
# local `sql` package (MySQL connection helpers) can be imported.
utils = str(proj_dir.joinpath("utils"))
sys.path.insert(0, utils)
from sql import connect

In [4]:
# Open the MySQL connection using the credentials file kept under `.env/`.
# `conn` is the project wrapper object; `connection` is the underlying
# connection it exposes as `.conn`.
conn = connect.Connect(str(proj_dir.joinpath(".env", "mysql_config.ini")))
connection = conn.conn

Connecting to MySQL database...
Connection established.


In [5]:
# Build the model-input query.
#
# The inner SELECT joins Landsat water temperature, Landsat land temperature
# and NDVI on (date, ReachID), joins the Reaches table, and averages the values
# into semi-monthly bins: observations before the 15th are assigned day 1, the
# rest day 15. A missing Reaches.WidthMean is replaced by 30.
# The outer SELECT rebuilds a Date from Year/Month/DayOfMonth and rounds the
# averaged values to two decimals.
# Lines starting with `--` inside the string are SQL comments (disabled
# deviation / in-situ joins) and are sent to the server as part of the text.
query = f"""
SELECT 
    STR_TO_DATE(CONCAT(Year,
                    '-',
                    LPAD(Month, 2, '00'),
                    '-',
                    LPAD(DayOfMonth, 2, '00')),
            '%Y-%m-%d') AS Date,
    Month,
    DayOfMonth,
    ROUND(WaterTemp, 2) as WaterTemp,
    ROUND(LandTemp, 2) as LandTemp,
    ROUND(NDVI, 2) as NDVI,
    ClimateClass,
    --     ROUND(((watertemp - WaterTemperature) / WaterTemperature),
    --             2) AS PercentDeviation,
    --     ROUND((watertemp - WaterTemperature), 2) AS Deviation,
    Width,
    ReachID,
    ReachName
    -- ROUND(InsituTemp, 2) AS InsituTemp
FROM
    (SELECT 
        IF(DAY(ReachLandsatWaterTemp.date) < 15, 1, 15) AS DayOfMonth,
            MONTH(ReachLandsatWaterTemp.date) AS Month,
            YEAR(ReachLandsatWaterTemp.date) AS Year,
            AVG(ReachLandsatWaterTemp.Value) AS WaterTemp,
            AVG(ReachLandsatLandTemp.Value) AS LandTemp,
            AVG(ReachNDVI.Value) AS NDVI,
            IFNULL(Reaches.WidthMean, 30) AS Width,
            Reaches.ClimateClass AS ClimateClass,
            ReachLandsatWaterTemp.ReachID AS ReachID,
            Reaches.Name AS ReachName
    FROM
        ReachLandsatWaterTemp
    INNER JOIN ReachLandsatLandTemp USING (date , ReachID)
    INNER JOIN ReachNDVI USING (date , ReachID)
    INNER JOIN Reaches USING (ReachID)
    -- WHERE
    --        AND ReachLandsatWaterTemp.Value > 0
    GROUP BY DayOfMonth , Month , Year , ClimateClass , ReachID , Width) AS T
--     --         INNER JOIN
--     --     ReachLandsatLTMSemiMonthly USING (DayOfMonth , Month , ReachID)
--         LEFT JOIN
--     (SELECT 
--         IF(DAY(ReachInsituWaterTemp.date) < 15, 1, 15) AS DayOfMonth,
--             MONTH(ReachInsituWaterTemp.date) AS Month,
--             YEAR(ReachInsituWaterTemp.date) AS Year,
--             AVG(ReachInsituWaterTemp.Value) AS InsituTemp,
--             ReachInsituWaterTemp.ReachID AS ReachID
--     FROM
--         ReachInsituWaterTemp
--     INNER JOIN Reaches USING (ReachID)
--     WHERE
--         ReachInsituWaterTemp.Value > 0
--     GROUP BY DayOfMonth , Month , Year , ReachID) AS I USING (DayOfMonth , Month , Year , ReachID)
-- ORDER BY RAND();
""" 
# print(query)

# Run the query through the project helper.
# NOTE(review): presumably this fetches rows in batches of `chunksize` and
# returns a single DataFrame (the result is used with `.to_csv` below) —
# confirm against utils/sql/connect.
df = conn.query_with_fetchmany(query, chunksize=100)

# Persist the query result so the model input can be reloaded without the database.
df.to_csv(proj_dir / "Methods/3.WaterTempEst/5.HydrothermalHistory/ML_input_data.csv", index=False)

# Reload from the CSV just written; from here on `df` holds the values as
# parsed by pandas from the file rather than the objects returned by the driver.
df = pd.read_csv(proj_dir / "Methods/3.WaterTempEst/5.HydrothermalHistory/ML_input_data.csv")

In [6]:
# Parse the Date column and derive the day of the year from it.
df = df.assign(
    Date=lambda frame: pd.to_datetime(frame["Date"]),
    DayOfYear=lambda frame: frame["Date"].dt.dayofyear,
)

# Load the two reservoir tables (`rat_dels.csv` and `rat_sarea.csv`) with
# their Date columns parsed, ready to be merged on (GranD_ID, Date).
dels = pd.read_csv(proj_dir / "Methods/3.WaterTempEst/rat_dels.csv").assign(
    Date=lambda frame: pd.to_datetime(frame["Date"])
)
sarea = pd.read_csv(proj_dir / "Methods/3.WaterTempEst/rat_sarea.csv").assign(
    Date=lambda frame: pd.to_datetime(frame["Date"])
)

In [7]:
# Lookup table that is merged onto the model input by ReachName.
reaches_and_dams = pd.read_csv(
    proj_dir.joinpath("Methods/3.WaterTempEst/reaches_and_dams.csv")
)

In [8]:
# Attach the reach/dam lookup, then the per-dam time series, keeping every
# row of the model input (left joins), and shorten the verbose column names.
df = (
    df.merge(reaches_and_dams, how="left", on="ReachName")
    .merge(dels, how="left", on=["GranD_ID", "Date"])
    .merge(sarea, how="left", on=["GranD_ID", "Date"])
    .rename(
        columns={
            "dS (m3)": "dels",
            "area (km2)": "sarea",
            "Within_n*10km": "rel_dist",
        }
    )
)

# Snapshot of the column index before any scaled/estimate columns are added.
original_cols = df.columns

In [9]:
model_name = "RFR1"

# Read the pickled dictionary of fitted scalers.
with open(proj_dir / "Results/2.WaterTempEst/scalers.pkl", "rb") as f:
    scalers = pickle.load(f)

# Bind each scaler to its own name; lookups happen in the listed order.
(
    dayofmonth_scaler,
    month_scaler,
    watertemp_scaler,
    landtemp_scaler,
    width_scaler,
    NDVI_scaler,
    climate_scaler,
    dels_scaler,
    sarea_scaler,
    rel_dist_scaler,
) = (
    scalers[key]
    for key in (
        "dayofmonth_scaler",
        "month_scaler",
        "watertemp_scaler",
        "landtemp_scaler",
        "width_scaler",
        "NDVI_scaler",
        "climate_scaler",
        "dels_scaler",
        "sarea_scaler",
        "rel_dist_scaler",
    )
)

# Read the pickled final model for `model_name`.
with open(
    proj_dir / f"Results/2.WaterTempEst/2.RandomForestRegression/{model_name}_final_model.pkl",
    "rb",
) as f:
    model_final = pickle.load(f)

In [10]:
# Replace missing values of dels, sarea and rel_dist with 0.
# The fill is assigned back to `df` instead of calling
# `df[col].fillna(..., inplace=True)`: that form operates on a temporary
# Series and, under pandas Copy-on-Write, leaves `df` unchanged, which would
# let NaNs flow into the scalers below.
df = df.fillna({"dels": 0, "sarea": 0, "rel_dist": 0})

# Scale every feature with its matching pre-fitted scaler. Each scaler is
# given a single-column DataFrame and the result is stored as `<name>_scaled`.
df["DayOfMonth_scaled"] = dayofmonth_scaler.transform(df[["DayOfMonth"]])
df["Month_scaled"] = month_scaler.transform(df[["Month"]])
df["LandTemp_scaled"] = landtemp_scaler.transform(df[["LandTemp"]])
df["WaterTemp_scaled"] = watertemp_scaler.transform(df[["WaterTemp"]])
df["Width_scaled"] = width_scaler.transform(df[["Width"]])
df["NDVI_scaled"] = NDVI_scaler.transform(df[["NDVI"]])
df["ClimateClass_scaled"] = climate_scaler.transform(df[["ClimateClass"]])
df["dels_scaled"] = dels_scaler.transform(df[["dels"]])
df["sarea_scaled"] = sarea_scaler.transform(df[["sarea"]])
df["rel_dist_scaled"] = rel_dist_scaler.transform(df[["rel_dist"]])

## Using variation 1

In [11]:
# Feature matrix for variation 1. The three dam-related columns
# (dels_scaled, sarea_scaled, rel_dist_scaled) are deliberately left out.
X = df.loc[
    :,
    [
        "DayOfMonth_scaled",
        "Month_scaled",
        "LandTemp_scaled",
        "Width_scaled",
        "NDVI_scaled",
        "ClimateClass_scaled",
    ],
]

In [12]:
# Store the variation-1 predictions, then order the rows by reach and date.
df = df.assign(est1=model_final.predict(X)).sort_values(by=["ReachID", "Date"])

In [13]:
# Upsert the variation-1 estimates into ReachEstimatedWaterTemp under Tag "SM".
#
# For every (Date, ReachID) a row is inserted if none exists yet for that tag,
# and the Value is then (re)written unconditionally, so re-running this cell
# refreshes existing estimates.
#
# Values are passed as bound parameters instead of being formatted into the
# SQL text, and the same calendar date is used for both statements (the
# previous UPDATE compared against the full timestamp string).
# NOTE(review): assumes the driver behind `connection` uses the DB-API
# "format" paramstyle (%s), as mysql-connector-python and PyMySQL do — confirm.
insert_sql = """
    INSERT INTO ReachEstimatedWaterTemp (Date, ReachID, Value, Tag)
    SELECT %s, (SELECT ReachID FROM Reaches WHERE ReachID = %s), %s, "SM"
    WHERE NOT EXISTS (SELECT * FROM ReachEstimatedWaterTemp WHERE Date = %s AND ReachID = %s AND Tag = "SM");
"""
update_sql = """
    UPDATE ReachEstimatedWaterTemp
    SET Value = %s
    WHERE Date = %s AND ReachID = %s AND Tag = "SM";
"""

cursor = connection.cursor()
try:
    for date_value, reach_value, est_value in zip(df["Date"], df["ReachID"], df["est1"]):
        # Convert to plain Python types; database drivers generally cannot
        # serialise pandas/NumPy scalars.
        obs_date = date_value.date()
        reach_id = int(reach_value)
        estimate = float(est_value)

        cursor.execute(insert_sql, (obs_date, reach_id, estimate, obs_date, reach_id))
        cursor.execute(update_sql, (estimate, obs_date, reach_id))

    # One commit for the whole batch instead of two per row.
    connection.commit()
except Exception:
    # Leave the table untouched if any statement fails part-way through.
    connection.rollback()
    raise
finally:
    cursor.close()

## Using variation 2

In [14]:
# Variation 2 only applies to rows whose rel_dist is non-zero.
df2 = df.loc[df["rel_dist"].ne(0)].copy()

# Feature matrix for variation 2: the variation-1 features plus the three
# dam-related columns.
X2 = df2.loc[
    :,
    [
        "DayOfMonth_scaled",
        "Month_scaled",
        "LandTemp_scaled",
        "Width_scaled",
        "NDVI_scaled",
        "ClimateClass_scaled",
        "dels_scaled",
        "sarea_scaled",
        "rel_dist_scaled",
    ],
].copy()

In [15]:
# Display the variation-2 feature matrix to inspect the scaled inputs.
X2

Unnamed: 0,DayOfMonth_scaled,Month_scaled,LandTemp_scaled,Width_scaled,NDVI_scaled,ClimateClass_scaled,dels_scaled,sarea_scaled,rel_dist_scaled
96738,1.0,0.181818,-0.711351,0.000000,1.051956,0.586207,0.260432,0.034802,0.2
104807,1.0,0.272727,-0.449971,0.000000,0.995047,0.586207,0.260432,0.034802,0.2
114229,1.0,0.363636,-0.230413,0.000000,0.710501,0.586207,0.260432,0.034802,0.2
30437,0.0,0.454545,0.593307,0.000000,0.995047,0.586207,0.260432,0.034802,0.2
38999,0.0,0.545455,0.532069,0.000000,1.165775,0.586207,0.260432,0.034802,0.2
...,...,...,...,...,...,...,...,...,...
75453,0.0,0.818182,-0.187098,0.011905,0.369045,0.206897,0.260432,0.034802,1.0
170052,1.0,0.818182,-0.976465,0.011905,0.141408,0.206897,0.260432,0.034802,1.0
81669,0.0,0.909091,-1.324473,0.011905,-0.313866,0.206897,0.260432,0.034802,1.0
176712,1.0,0.909091,-1.089231,0.011905,-0.370775,0.206897,0.260432,0.034802,1.0


In [16]:
model_name2 = "RFR2_1"

# Load the variation-2 model from the project tree, mirroring how the
# variation-1 model is loaded, instead of a hard-coded absolute path on one
# user's machine. The `with` block also closes the file handle, which the
# previous bare `open(...)` never did.
# NOTE(review): assumes `proj_dir` resolves to the same project root as the
# former absolute path — confirm.
# Caution: pickle.load executes arbitrary code; only load trusted files.
with open(
    proj_dir / f"Results/2.WaterTempEst/2.RandomForestRegression/{model_name2}_final_model.pkl",
    "rb",
) as f:
    model_final2 = pickle.load(f)

In [17]:
# Display the loaded variation-2 model object.
model_final2

In [18]:
# Store the variation-2 predictions alongside the rows they were computed for.
df2 = df2.assign(est2=model_final2.predict(X2))

In [19]:
# Upsert the variation-2 estimates into ReachEstimatedWaterTemp under Tag "SM".
#
# The tag is the same one used for variation 1 and the UPDATE always runs, so
# for the rows in `df2` these values replace any estimate already stored for
# the same (Date, ReachID).
#
# Values are passed as bound parameters instead of being formatted into the
# SQL text, and the same calendar date is used for both statements (the
# previous UPDATE compared against the full timestamp string).
# NOTE(review): assumes the driver behind `connection` uses the DB-API
# "format" paramstyle (%s), as mysql-connector-python and PyMySQL do — confirm.
insert_sql = """
    INSERT INTO ReachEstimatedWaterTemp (Date, ReachID, Value, Tag)
    SELECT %s, (SELECT ReachID FROM Reaches WHERE ReachID = %s), %s, "SM"
    WHERE NOT EXISTS (SELECT * FROM ReachEstimatedWaterTemp WHERE Date = %s AND ReachID = %s AND Tag = "SM");
"""
update_sql = """
    UPDATE ReachEstimatedWaterTemp
    SET Value = %s
    WHERE Date = %s AND ReachID = %s AND Tag = "SM";
"""

cursor = connection.cursor()
try:
    for date_value, reach_value, est_value in zip(df2["Date"], df2["ReachID"], df2["est2"]):
        # Convert to plain Python types; database drivers generally cannot
        # serialise pandas/NumPy scalars.
        obs_date = date_value.date()
        reach_id = int(reach_value)
        estimate = float(est_value)

        cursor.execute(insert_sql, (obs_date, reach_id, estimate, obs_date, reach_id))
        cursor.execute(update_sql, (estimate, obs_date, reach_id))

    # One commit for the whole batch instead of two per row.
    connection.commit()
except Exception:
    # Leave the table untouched if any statement fails part-way through.
    connection.rollback()
    raise
finally:
    cursor.close()