In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import json
import sys

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, mean_squared_error, r2_score

from sklearn.model_selection import KFold, ShuffleSplit, RepeatedKFold, train_test_split, ParameterGrid
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import ElasticNetCV, ElasticNet

from joblib import dump, load

from permetrics.regression import RegressionMetric

In [None]:
# Project root, given relative to the notebook's working directory (two levels up).
proj_dir = Path('../..')

In [None]:
# Input locations, all resolved against the project root.
geopackage_fn = proj_dir.joinpath('data/gis/geopackages/columbia_river_basin.gpkg')
landsat_data_fn = proj_dir.joinpath('data/landsat/crb_landsat.csv')
insitu_data_dir = proj_dir.joinpath('data/insitu/conditions/processed')
all_data_fn = proj_dir.joinpath('methods/04-ml_development/input_data/ml_input_data_all.csv')

# Serialized model that is loaded further down and used for prediction.
model_fn = proj_dir.joinpath('methods/04-ml_development/var1/RFR1.joblib')

In [None]:
# Load the Landsat table and the combined ML input table; 'Date' is parsed
# as datetime in both.
lsat_data = pd.read_csv(landsat_data_fn, parse_dates=['Date'])
all_data_ = pd.read_csv(all_data_fn, parse_dates=['Date'])

In [None]:

# Read the 'TempLocationsInBuffer' layer from the basin GeoPackage.
temp_locs_in_buffer = gpd.read_file(geopackage_fn, layer='TempLocationsInBuffer')

In [None]:
# Build one long table of in-situ records covering every location in the
# buffer layer that has a processed CSV on disk.
insitu_frames = []

for temp_loc in temp_locs_in_buffer.itertuples():
    insitu_fn = insitu_data_dir / f"{temp_loc.station_ID}.csv"
    if not insitu_fn.exists():
        # No processed file for this location: nothing to add.
        continue

    insitu_data = pd.read_csv(insitu_fn, parse_dates=['date'])
    # Strip the timezone from 'date' (the timestamps become tz-naive).
    insitu_data['date'] = insitu_data['date'].dt.tz_localize(None)

    # Drop rows with any missing value, then tag every remaining row with the
    # location attributes taken from the GeoPackage layer.
    insitu_data = insitu_data.dropna()
    insitu_data['station_ID'] = temp_loc.station_ID
    insitu_data['RKm'] = temp_loc.RKm
    insitu_data['Name'] = temp_loc.reach_id
    insitu_data['GNIS_Name'] = temp_loc.GNIS_Name
    insitu_frames.append(insitu_data)

# Concatenate once at the end instead of growing a DataFrame inside the loop
# (which re-copies all accumulated rows on every iteration). Fall back to an
# empty frame when no file was found, because pd.concat rejects an empty list.
combined_insitu = pd.concat(insitu_frames) if insitu_frames else pd.DataFrame()
combined_insitu = combined_insitu.rename(columns={'date': 'Date'})
combined_insitu

In [None]:
# Outer-join the Landsat rows with the in-situ rows on date, reach name and
# river kilometre.
insitu_lsat = lsat_data.merge(combined_insitu, on=["Date", "Name", "RKm"], how="outer")

# Keep rows whose WaterTempC lies in [-40, 80]; rows with a missing
# WaterTempC fail both comparisons and are dropped as well. No bounds are
# applied to 'avg_temp(C)'.
water_temp = insitu_lsat["WaterTempC"]
water_temp_in_range = (water_temp >= -40) & (water_temp <= 80)
insitu_lsat_clean = insitu_lsat.loc[water_temp_in_range].copy()

# Missing width statistics are replaced by the constant 15.
width_cols = ["WidthMin", "WidthMean", "WidthMax"]
insitu_lsat_clean[width_cols] = insitu_lsat_clean[width_cols].fillna(15)

# Day of year derived from the date.
insitu_lsat_clean["DOY"] = insitu_lsat_clean["Date"].dt.dayofyear

In [None]:
# Rows whose reach name starts with "Columbia" and that have both LandTempC
# and NDVI present.
is_columbia_reach = insitu_lsat_clean["Name"].str.startswith("Columbia")
col_data = (
    insitu_lsat_clean[is_columbia_reach]
    .dropna(subset=['LandTempC', 'NDVI'])
    .copy()
)

In [None]:
# Load the serialized model from disk.
# NOTE: joblib.load unpickles the file, so only load model files you trust.
rfr = load(model_fn)

In [None]:
# Predictor columns handed to the model, in this order.
# WidthMin, WidthMax and WaterTempC are deliberately left out of the list.
features = ["NDVI", "LandTempC", "ClimateClass", "DOY", "WidthMean"]

In [None]:
# Store the model's prediction for every row of col_data in a new 'pred_temp' column.
col_data['pred_temp'] = rfr.predict(col_data[features])

In [None]:
# col_test_ltm = col_data.groupby('RKm')

# fig, ax = plt.subplots(figsize=(15, 4))
# col_test_ltm.plot(x='RKm', y='pred_temp', ax=ax, label='Predicted')
# col_test_ltm.plot.scatter(x='RKm', y='avg_temp(C)', ax=ax, )
# # col_test_ltm.plot(x='RKm', y='diff', ax=ax, label='Observed')

In [None]:
# Per-RKm summary statistics (mean, std, median, min, max) of LandTempC,
# pred_temp and ClimateClass, computed over every month of the year, then
# plotted as a profile along the river with one linear trend per river section.
co = col_data['Date'].dt.month.isin(list(range(1, 13)))
stat_names = ['mean', 'std', 'median', 'min', 'max']
col_stats = col_data[co].groupby('RKm').agg(
    {'LandTempC': stat_names, 'pred_temp': stat_names, 'ClimateClass': stat_names}
)
pred_stats = col_stats['pred_temp']

fig, ax = plt.subplots(figsize=(15, 4))
pred_stats[['mean', 'median']].plot(ax=ax)
ax.fill_between(
    col_stats.index,
    pred_stats['mean'] - pred_stats['std'],
    pred_stats['mean'] + pred_stats['std'],
    alpha=0.3,
)
# Vertical markers at the river kilometres that delimit the fitted sections.
ax.axvline(x=960, color='r', linestyle='--', label='Grand Coulee Dam')
ax.axvline(x=230, color='b', linestyle='--', label='Bonneville Dam')
ax.axvline(x=1640, color='g', linestyle='--', label='Kinbasket Lake')
ax.legend(loc='lower left')
ax.set_xlabel('River Kilometer (km)')
ax.set_ylabel('Temperature (C)')

ax.set_title('Landsat Water Temperature (All Months)')

# Fit and draw a least-squares line of best fit for each river section:
# lower (RKm <= 230), middle (230 < RKm <= 960), upper part 1
# (960 < RKm <= 1640) and upper part 2 (RKm > 1640).
# Each tuple is (exclusive lower bound, inclusive upper bound).
section_bounds = [(-np.inf, 230), (230, 960), (960, 1640), (1640, np.inf)]
section_slopes = []
for lower_rkm, upper_rkm in section_bounds:
    in_section = (col_stats.index > lower_rkm) & (col_stats.index <= upper_rkm)
    x_section = col_stats.index[in_section]
    y_section = pred_stats['mean'][in_section]
    slope, intercept = np.polyfit(x_section, y_section, 1)
    ax.plot(x_section, slope * x_section + intercept, color='k', linestyle='--')
    section_slopes.append(slope)

# Keep the per-section slopes available under their established names.
m_lower, m_mid, m_upper1, m_upper2 = section_slopes



In [None]:
# Per-RKm summary statistics (mean, std, median, min, max) of avg_temp(C),
# pred_temp and ClimateClass for the summer months only (June, July, August),
# plotted as a profile along the river with one linear trend per river section.
co = col_data["Date"].dt.month.isin([6, 7, 8])
stat_names = ["mean", "std", "median", "min", "max"]
col_stats = (
    col_data[co]
    .groupby("RKm")
    .agg(
        {
            "avg_temp(C)": stat_names,
            "pred_temp": stat_names,
            "ClimateClass": stat_names,
        }
    )
)
pred_mean = col_stats["pred_temp"]["mean"]
pred_std = col_stats["pred_temp"]["std"]

fig, ax = plt.subplots(figsize=(15, 4))
# Draw the line with Axes.plot so the legend entry really reads "Estimated";
# DataFrame.plot labels each line with its column name ("mean") instead.
ax.plot(col_stats.index, pred_mean, label="Estimated")
ax.fill_between(
    col_stats.index,
    pred_mean - pred_std,
    pred_mean + pred_std,
    alpha=0.3,
    label="±1 std",
)
ax.scatter(col_stats.index, col_stats["avg_temp(C)"]["mean"], label="Observed")
# Vertical markers at the river kilometres that delimit the fitted sections.
ax.axvline(x=960, color="r", linestyle="--", label="Grand Coulee Dam")
ax.axvline(x=230, color="b", linestyle="--", label="Bonneville Dam")
ax.axvline(x=1640, color="g", linestyle="--", label="Kinbasket Lake")
ax.legend(loc="lower left")
ax.set_xlabel("River Kilometer (km)")
ax.set_ylabel("Temperature (C)")

# The title names the months that were actually selected above.
ax.set_title("Landsat Water Temperature (Summer Months)")

# Fit and draw a least-squares line of best fit for each river section:
# lower (RKm <= 230), middle (230 < RKm <= 960), upper part 1
# (960 < RKm <= 1640) and upper part 2 (RKm > 1640).
# Each tuple is (exclusive lower bound, inclusive upper bound).
section_bounds = [(-np.inf, 230), (230, 960), (960, 1640), (1640, np.inf)]
section_slopes = []
for lower_rkm, upper_rkm in section_bounds:
    in_section = (col_stats.index > lower_rkm) & (col_stats.index <= upper_rkm)
    x_section = col_stats.index[in_section]
    y_section = pred_mean[in_section]
    slope, intercept = np.polyfit(x_section, y_section, 1)
    ax.plot(x_section, slope * x_section + intercept, color="k", linestyle="--")
    section_slopes.append(slope)

# Keep the per-section slopes available under their established names.
m_lower, m_mid, m_upper1, m_upper2 = section_slopes

In [None]:
# Seasonal profiles along the river: one panel per season showing the mean
# estimate (±1 std), the mean in-situ temperature, and a line of best fit
# for each of four river sections.
seasons = {
    "spring": [3, 4, 5],
    "summer": [6, 7, 8],
    "fall": [9, 10, 11],
    "winter": [12, 1, 2],
}

season_titles = {
    "spring": "a) Spring (March, April, May)",
    "summer": "b) Summer (June, July, August)",
    "fall": "c) Fall (September, October, November)",
    "winter": "d) Winter (December, January, February)",
}

fig, ax = plt.subplots(4, 1, figsize=(8, 9), sharex=True)

# Panels are assigned top to bottom in the order the seasons are declared.
season_ax = dict(zip(seasons, ax))

stat_names = ["mean", "std", "median", "min", "max"]
# (exclusive lower RKm bound, inclusive upper RKm bound) for the lower,
# middle, upper-1 and upper-2 sections, in that order.
section_bounds = [(-np.inf, 230), (230, 960), (960, 1640), (1640, np.inf)]

for season, months in seasons.items():
    panel = season_ax[season]

    co = col_data["Date"].dt.month.isin(months)
    col_stats = col_data[co].groupby("RKm").agg(
        {
            "avg_temp(C)": stat_names,
            "pred_temp": stat_names,
            "ClimateClass": stat_names,
        }
    )
    rkm = col_stats.index
    pred_mean = col_stats["pred_temp"]["mean"]
    pred_std = col_stats["pred_temp"]["std"]

    panel.plot(rkm, pred_mean, label="Estimated")
    panel.fill_between(
        rkm, pred_mean - pred_std, pred_mean + pred_std, alpha=0.3, label="±1 std"
    )
    panel.scatter(rkm, col_stats["avg_temp(C)"]["mean"], label="Observed")

    # Unlabelled vertical markers at the section boundaries.
    for marker_rkm, marker_color in ((960, "r"), (230, "b"), (1640, "g")):
        panel.axvline(x=marker_rkm, color=marker_color, linestyle="--")

    panel.set_ylabel("Temperature (°C)")
    panel.set_title(season_titles[season], loc="left")

    # Line of best fit per section, drawn as a dashed black line.
    section_slopes = []
    for lower_rkm, upper_rkm in section_bounds:
        in_section = (rkm > lower_rkm) & (rkm <= upper_rkm)
        x_section = rkm[in_section]
        y_section = pred_mean[in_section]
        slope, intercept = np.polyfit(x_section, y_section, 1)
        panel.plot(x_section, slope * x_section + intercept, color="k", linestyle="--")
        section_slopes.append(slope)
    m_lower, m_mid, m_upper1, m_upper2 = section_slopes

    print("Season:", season)
    print("Lower Columbia Slope:", m_lower)
    print("Middle Columbia Slope:", m_mid)
    print("Upper Columbia Slope:", m_upper1, m_upper2)

season_ax["winter"].set_xlabel("Distance from River Mouth - RKm (km)")
season_ax["spring"].legend()
fig.tight_layout()
fig.savefig("columbia_temp_estimation.png", dpi=300)

In [None]:
## Writing class assignment
# Same seasonal figure, with solid boundary markers in hex colours.
# NOTE(review): this writes "columbia_temp_estimation.png", the same file name
# the other seasonal-profile cell saves to, so the later run replaces the
# file. Confirm that is intended.

seasons = {
    "spring": [3, 4, 5],
    "summer": [6, 7, 8],
    "fall": [9, 10, 11],
    "winter": [12, 1, 2],
}

season_titles = {
    "spring": "a) Spring (March, April, May)",
    "summer": "b) Summer (June, July, August)",
    "fall": "c) Fall (September, October, November)",
    "winter": "d) Winter (December, January, February)",
}

fig, ax = plt.subplots(4, 1, figsize=(8, 9), sharex=True)

# Panels are assigned top to bottom in the order the seasons are declared.
season_ax = dict(zip(seasons, ax))

stat_names = ["mean", "std", "median", "min", "max"]
# (exclusive lower RKm bound, inclusive upper RKm bound) for the lower,
# middle, upper-1 and upper-2 sections, in that order.
section_bounds = [(-np.inf, 230), (230, 960), (960, 1640), (1640, np.inf)]
# River kilometre and colour of each vertical boundary marker.
boundary_markers = ((960, "#d95f02"), (230, "#1b9e77"), (1640, "#7570b3"))

for season, months in seasons.items():
    panel = season_ax[season]

    co = col_data["Date"].dt.month.isin(months)
    col_stats = col_data[co].groupby("RKm").agg(
        {
            "avg_temp(C)": stat_names,
            "pred_temp": stat_names,
            "ClimateClass": stat_names,
        }
    )
    rkm = col_stats.index
    pred_mean = col_stats["pred_temp"]["mean"]
    pred_std = col_stats["pred_temp"]["std"]

    panel.plot(rkm, pred_mean, label="Estimated")
    panel.fill_between(
        rkm, pred_mean - pred_std, pred_mean + pred_std, alpha=0.3, label="±1 std"
    )
    panel.scatter(rkm, col_stats["avg_temp(C)"]["mean"], label="Observed")

    for marker_rkm, marker_color in boundary_markers:
        panel.axvline(x=marker_rkm, color=marker_color, linestyle="-")

    panel.set_ylabel("Temperature (°C)")
    panel.set_title(season_titles[season], loc="left")

    # Line of best fit per section, drawn as a dashed black line.
    section_slopes = []
    for lower_rkm, upper_rkm in section_bounds:
        in_section = (rkm > lower_rkm) & (rkm <= upper_rkm)
        x_section = rkm[in_section]
        y_section = pred_mean[in_section]
        slope, intercept = np.polyfit(x_section, y_section, 1)
        panel.plot(x_section, slope * x_section + intercept, color="k", linestyle="--")
        section_slopes.append(slope)
    m_lower, m_mid, m_upper1, m_upper2 = section_slopes

    print("Season:", season)
    print("Lower Columbia Slope:", m_lower)
    print("Middle Columbia Slope:", m_mid)
    print("Upper Columbia Slope:", m_upper1, m_upper2)

season_ax["winter"].set_xlabel("Distance from River Mouth - RKm (km)")
season_ax["spring"].legend()
fig.tight_layout()
fig.savefig("columbia_temp_estimation.png", dpi=300)

In [None]:
# Zoomed seasonal panels (x-axis limited to RKm 900-1020). The per-section
# slopes are still fitted and printed, but the fitted lines are not drawn.
seasons = {
    "spring": [3, 4, 5],
    "summer": [6, 7, 8],
    "fall": [9, 10, 11],
    "winter": [12, 1, 2],
}

# Only the panel letters are shown as titles in this figure.
season_titles = {
    "spring": "e)",
    "summer": "f)",
    "fall": "g)",
    "winter": "h)",
}

fig, ax = plt.subplots(4, 1, figsize=(4, 9), sharex=True)

# Panels are assigned top to bottom in the order the seasons are declared.
season_ax = dict(zip(seasons, ax))

stat_names = ["mean", "std", "median", "min", "max"]
# (exclusive lower RKm bound, inclusive upper RKm bound) for the lower,
# middle, upper-1 and upper-2 sections, in that order.
section_bounds = [(-np.inf, 230), (230, 960), (960, 1640), (1640, np.inf)]
# (river kilometre, colour, line style, label) of each vertical marker.
boundary_markers = (
    (960, "#d95f02", "-", "Grand Coullee Dam,"),
    (230, "b", "--", "Bonneville Dam"),
    (1640, "g", "--", "Kinbasket Lake"),
)

for season, months in seasons.items():
    panel = season_ax[season]

    co = col_data["Date"].dt.month.isin(months)
    col_stats = col_data[co].groupby("RKm").agg(
        {
            "avg_temp(C)": stat_names,
            "pred_temp": stat_names,
            "ClimateClass": stat_names,
        }
    )
    rkm = col_stats.index
    pred_mean = col_stats["pred_temp"]["mean"]
    pred_std = col_stats["pred_temp"]["std"]

    panel.plot(rkm, pred_mean, label="Estimated")
    panel.fill_between(
        rkm, pred_mean - pred_std, pred_mean + pred_std, alpha=0.3, label="±1 std"
    )
    panel.scatter(rkm, col_stats["avg_temp(C)"]["mean"], label="Observed")

    for marker_rkm, marker_color, marker_style, marker_label in boundary_markers:
        panel.axvline(
            x=marker_rkm, color=marker_color, linestyle=marker_style, label=marker_label
        )

    # Hide the y tick labels on these panels.
    panel.set_yticklabels([])

    panel.set_title(season_titles[season], loc="left")

    # Slope of the line of best fit per section (computed only, not plotted).
    section_slopes = []
    for lower_rkm, upper_rkm in section_bounds:
        in_section = (rkm > lower_rkm) & (rkm <= upper_rkm)
        slope, _intercept = np.polyfit(rkm[in_section], pred_mean[in_section], 1)
        section_slopes.append(slope)
    m_lower, m_mid, m_upper1, m_upper2 = section_slopes

    panel.set_xlim(900, 1020)

    # Slopes are printed multiplied by 100, with two decimals.
    print("Season:", season)
    print(f"Lower Columbia Slope: {m_lower*1e2:.2f}")
    print(f"Middle Columbia Slope: {m_mid*1e2:.2f}")
    print(f"Upper Columbia Slope: {m_upper1*1e2:.2f}, {m_upper2*1e2:.2f}")

season_ax["winter"].set_xlabel("Distance from River Mouth - RKm (km)")
fig.tight_layout()
fig.savefig("columbia_temp_estimation_zoomed.png", dpi=300)

In [None]:
# Seasonal profiles along the river that additionally show the per-RKm mean
# of the WaterTempC column next to the model estimate and the in-situ mean.
seasons = {
    "spring": [3, 4, 5],
    "summer": [6, 7, 8],
    "fall": [9, 10, 11],
    "winter": [12, 1, 2],
}

season_titles = {
    "spring": "a) Spring (March, April, May)",
    "summer": "b) Summer (June, July, August)",
    "fall": "c) Fall (September, October, November)",
    "winter": "d) Winter (December, January, February)",
}

fig, ax = plt.subplots(4, 1, figsize=(8, 9), sharex=True)

# Panels are assigned top to bottom in the order the seasons are declared.
season_ax = dict(zip(seasons, ax))

stat_names = ["mean", "std", "median", "min", "max"]
# (exclusive lower RKm bound, inclusive upper RKm bound) for the lower,
# middle, upper-1 and upper-2 sections, in that order.
section_bounds = [(-np.inf, 230), (230, 960), (960, 1640), (1640, np.inf)]

for season, months in seasons.items():
    panel = season_ax[season]

    co = col_data["Date"].dt.month.isin(months)
    col_stats = col_data[co].groupby("RKm").agg(
        {
            "avg_temp(C)": stat_names,
            "pred_temp": stat_names,
            "WaterTempC": stat_names,
        }
    )
    rkm = col_stats.index
    pred_mean = col_stats["pred_temp"]["mean"]
    pred_std = col_stats["pred_temp"]["std"]

    panel.plot(rkm, pred_mean, label="Estimated")
    panel.fill_between(
        rkm, pred_mean - pred_std, pred_mean + pred_std, alpha=0.3, label="±1 std"
    )
    # This line and the scatter below were both labelled "Observed", which made
    # the legend ambiguous; the line is now labelled with the column it shows.
    panel.plot(rkm, col_stats["WaterTempC"]["mean"], label="WaterTempC")
    panel.scatter(rkm, col_stats["avg_temp(C)"]["mean"], label="Observed")

    # Unlabelled vertical markers at the section boundaries.
    for marker_rkm, marker_color in ((960, "r"), (230, "b"), (1640, "g")):
        panel.axvline(x=marker_rkm, color=marker_color, linestyle="--")

    panel.set_ylabel("Temperature (°C)")
    panel.set_title(season_titles[season], loc="left")

    # Line of best fit per section, drawn as a dashed black line.
    section_slopes = []
    for lower_rkm, upper_rkm in section_bounds:
        in_section = (rkm > lower_rkm) & (rkm <= upper_rkm)
        x_section = rkm[in_section]
        y_section = pred_mean[in_section]
        slope, intercept = np.polyfit(x_section, y_section, 1)
        panel.plot(x_section, slope * x_section + intercept, color="k", linestyle="--")
        section_slopes.append(slope)
    m_lower, m_mid, m_upper1, m_upper2 = section_slopes

    print("Season:", season)
    print("Lower Columbia Slope:", m_lower)
    print("Middle Columbia Slope:", m_mid)
    print("Upper Columbia Slope:", m_upper1, m_upper2)

season_ax["winter"].set_xlabel("Distance from River Mouth - RKm (km)")
season_ax["spring"].legend()
fig.tight_layout()

# Reconstruction all


In [None]:
# Work on a copy so the table loaded from disk (all_data_) is left unmodified.
all_data = all_data_.copy()

In [None]:
# Distinct reach names among the rows whose 'Mission' value is 'L5'.
all_data_[all_data_['Mission']=='L5']['Name'].unique()

In [None]:
# Keep only the columns used for prediction and for the comparisons below.
all_data = all_data[['Date', 'Name','RKm', 'LandTempC', 'DOY', 'NDVI', 'WaterTempC', 'ClimateClass', 'WidthMean', 'avg_temp(C)']].copy()
# Rows without LandTempC or NDVI are dropped before prediction.
all_data = all_data.dropna(subset=['LandTempC', 'NDVI'])
# Keep rows whose WaterTempC lies in [-40, 80]. Each comparison needs its own
# parentheses: `&` binds tighter than `>=`, so `s >= -40 & (s <= 80)` is parsed
# as `s >= (-40 & (s <= 80))` and does not apply the intended range check.
water_temp_in_range = (all_data['WaterTempC'] >= -40) & (all_data['WaterTempC'] <= 80)
all_data = all_data[water_temp_in_range].copy()
all_data['pred_temp'] = rfr.predict(all_data[features])

In [None]:
# Order the rows by reach name, then river kilometre, then date.
all_data = all_data.sort_values(by=['Name', 'RKm', 'Date'])

In [None]:
# Reach names of the four test reaches, keyed by test-reach number.
test_reaches = {
    1: "Columbia_River_96",
    2: "Kootenay_River_35",
    3: "Okanogan_River_13",
    4: "Willamette_River_20",
}

# In-situ file for each test reach (same keys as test_reaches). The files are
# resolved through insitu_data_dir so they use the same lower-case 'data/...'
# directory as every other input; a separately spelled 'Data/...' path only
# resolves on case-insensitive filesystems.
insitu_fp = {
    1: insitu_data_dir / 'USBR_GCGW.csv',
    2: insitu_data_dir / 'USGS_12301933.csv',
    3: insitu_data_dir / 'USGS_12439500.csv',
    4: insitu_data_dir / 'USGS_14174000.csv',
}

In [None]:
# Load the in-situ file of test reach 1, parsing 'date' as datetime.
Columbia_River_96_insitu = pd.read_csv(insitu_fp[1], parse_dates=['date'])

In [None]:
# Set negative avg_temp(C) values to NaN; all other values are left as they are.
insitu_temps = Columbia_River_96_insitu['avg_temp(C)']
Columbia_River_96_insitu['avg_temp(C)'] = insitu_temps.mask(insitu_temps < 0)
# NOTE(review): this writes the cleaned table back to insitu_fp[1], i.e. it
# overwrites the input file for test reach 1 in place.
Columbia_River_96_insitu.to_csv(insitu_fp[1], index=False)

In [None]:
# One panel per test reach: model estimates as red points drawn on top of the
# in-situ temperature line.
fig, axs = plt.subplots(4, 1, figsize=(12, 7), sharex=True)

for i, name in test_reaches.items():
    # Test-reach numbers start at 1, panel positions at 0.
    panel = axs[i - 1]

    pred_temp = all_data[all_data['Name'] == name]

    obs_temp = pd.read_csv(insitu_fp[i], parse_dates=['date'])
    # Strip the timezone from the in-situ timestamps before plotting.
    obs_dates = obs_temp['date'].dt.tz_localize(None)

    panel.scatter(pred_temp['Date'], pred_temp['pred_temp'], label='Estimated', s=1.5, color='r', zorder=2)
    panel.plot(obs_dates, obs_temp['avg_temp(C)'], label='In-situ', zorder=1)

# Panel titles, top to bottom; every panel gets the same y label.
panel_titles = [
    "a) Test Reach 1 - Columbia River, Below Grand Coulee Dam, WA",
    "b) Test Reach 2 - Kootenay River, Below Libby Dam, MT",
    "c) Test Reach 3 - Okanogan River, Below Osoyoos Lake, WA",
    "d) Test Reach 4 - Willamette River, At Albany, OR",
]
for panel, panel_title in zip(axs, panel_titles):
    panel.set_title(panel_title, loc='left')
    panel.set_ylabel("Temperature (°C)")

# Only the bottom panel carries the x label and the legend.
axs[3].set_xlabel("Date")
axs[3].legend(loc='lower left')

fig.tight_layout()
fig.savefig('test_reaches.png', dpi=300)

In [None]:
# Rows of the Snake_River_128 reach.
Snake_River_128 = all_data[all_data['Name'] == 'Snake_River_128']

# Daily calendar spanning the first to the last date of that reach.
snake_dates = Snake_River_128['Date']
date_range = pd.date_range(start=snake_dates.min(), end=snake_dates.max(), freq='D')

# Left-join the estimates onto the calendar; days without a matching row keep
# a missing pred_temp.
Snake_River_128_daily = pd.DataFrame({'Date': date_range}).merge(
    Snake_River_128[['Date', 'pred_temp']], on='Date', how='left'
)



In [None]:
# Scatter plot of the daily Snake_River_128 estimates against date.
Snake_River_128_daily.plot.scatter(x='Date', y='pred_temp')

In [None]:
# Line plot of the same daily series on a wide figure.
fig, ax = plt.subplots(figsize=(15, 4))
Snake_River_128_daily.plot(x='Date', y='pred_temp', ax=ax, label='Predicted')

In [None]:
# Coerce 'Date' to datetime.
# NOTE(review): 'Date' was already requested as datetime via parse_dates when
# all_data_ was read, so this looks redundant — confirm before removing.
all_data['Date'] = pd.to_datetime(all_data['Date'])

In [None]:
# Plot WaterTempC over time for Snake_River_128 (the avg_temp(C) column is
# dropped first; it is not used by the plot).
all_data[all_data['Name'] == 'Snake_River_128'].drop('avg_temp(C)', axis=1).plot(x='Date', y='WaterTempC', label='Observed',)

In [None]:
# Compare the daily-mean estimate with the raw estimate for Snake_River_128.
fig, ax = plt.subplots(figsize=(15, 2))
snake_rows = all_data[all_data['Name'] == 'Snake_River_128']

# Aggregate only pred_temp: taking the mean of the whole frame would also try
# to average the text column 'Name', which raises a TypeError in pandas >= 2.0
# (reductions no longer silently drop non-numeric columns).
snake_daily_mean = snake_rows.resample('D', on='Date')['pred_temp'].mean().reset_index()

# Distinct labels so the two lines can be told apart in the legend.
snake_daily_mean.plot(x='Date', y='pred_temp', ax=ax, label='Estimated (daily mean)')
snake_rows.plot(x='Date', y='pred_temp', ax=ax, label='Estimated')

In [None]:
# Display all_data as the cell output.
all_data

In [None]:
# Re-read the Landsat table from disk (same call as the initial load of lsat_data).
lsat_data = pd.read_csv(landsat_data_fn, parse_dates=['Date'])

In [None]:
# Distinct reach names among the Landsat rows whose 'Mission' value is 'L5'.
lsat_data[lsat_data['Mission']=='L5']['Name'].unique()