In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import json
import sys

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, mean_squared_error, r2_score

from sklearn.model_selection import KFold, ShuffleSplit, RepeatedKFold, train_test_split, ParameterGrid
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import ElasticNetCV, ElasticNet

from joblib import dump, load

from permetrics.regression import RegressionMetric

In [None]:
# Random seed reused by the CV splitter, the train/test split and every model below.
seed = 1993

# Base directory that input paths are joined onto; being relative, it is resolved
# against the kernel's current working directory.
proj_dir = Path('../../..')

In [None]:
# Load the table of model inputs (features plus the in-situ target column) from the project tree.
ml_input_path = proj_dir / 'methods/04-ml_development/input_data/ml_input_data.csv'
ml_input_data = pd.read_csv(ml_input_path)

In [None]:
# Shuffle the rows and drop samples that lack the satellite-derived predictors.
# The shuffle is seeded: the later train_test_split is seeded too, but its outcome
# depends on the incoming row order, so an unseeded shuffle here would make the
# dev/test partition differ on every run.
# The index is reset only after dropping rows so that it stays contiguous.
ml_input_data = (
    ml_input_data
    .sample(frac=1, random_state=seed)
    .dropna(subset=['LandTempC', 'NDVI'])
    .reset_index(drop=True)
)


In [None]:
# Predictor columns fed to the regression model, in the order they are passed to fit/predict.
features = ["NDVI", "LandTempC", "ClimateClass", "DOY", "WidthMean"]
# Candidate predictors currently left out: "WidthMin", "WidthMax", "WaterTempC".

# Name of the target column.
y_col = "avg_temp(C)"

In [None]:
# Prefix used for every artefact written by this notebook (model, params, result CSVs).
model_name = "LR1"

# 5-fold cross-validation repeated 10 times (50 folds per hyperparameter combination),
# seeded so that the fold assignment is reproducible.
cv_settings = {"n_splits": 5, "n_repeats": 10, "random_state": seed}
cv_splitter = RepeatedKFold(**cv_settings)

In [None]:
# Hold out 20% of the rows as the test set; the remainder is the development set.
dev_set, test_set = train_test_split(ml_input_data, test_size=0.2, random_state=seed)

# Handpicked reaches whose observations after 2020-01-01 must all end up in the test set.
handpicked_reaches = [
    "Okanogan_River_13",
    "Columbia_River_96",
    "Kootenay_River_35",
    "Willamette_River_20",
]
# NOTE(review): the date filter compares against a string, so it presumably relies on
# "Date" holding ISO-formatted date strings (or datetimes) -- confirm.
is_handpicked = dev_set["Name"].isin(handpicked_reaches) & (dev_set["Date"] > "2020-01-01")

# Append the handpicked rows to the test set BEFORE removing them from the dev set.
# Filtering dev_set first and then selecting from the filtered frame yields an empty
# selection, which silently drops these rows from both sets.
test_set = pd.concat([test_set, dev_set[is_handpicked]]).copy()
dev_set = dev_set[~is_handpicked].copy()

# Rows are only moved between the two sets, never lost.
assert len(dev_set) + len(test_set) == len(ml_input_data)

In [None]:
# Exhaustive grid search over ElasticNet hyperparameters.
#
# For every (alpha, l1_ratio) combination:
#   1. fit/evaluate on each cross-validation fold of dev_set -> one row per fold in dev_results
#   2. refit on the whole dev_set and evaluate on test_set    -> one row in test_results
#   3. if the test MSE is the lowest seen so far, persist the model, its parameters
#      and the test set with that model's predictions.
#
# NOTE(review): the winning combination is selected by its MSE on test_set, so the
# reported test metrics are optimistically biased; the cross-validated scores in
# dev_results are stored but not used for the selection.
hyperparmeters = {
    "l1_ratio": [0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.925, 0.95, 0.975, 1],
    "alpha": np.arange(0.01, 1, 0.01),
    # 'alpha': [0.01],
}

list_metrics = ["RMSE", "MAE", "NSE", "R2", "KGE", "MSE"]
dev_columns = ["parameters", "combination", "fold"] + list_metrics
test_columns = ["parameters", "combination"] + list_metrics

# Loop-invariant design matrices / targets, extracted once instead of per combination.
X_dev = dev_set[features]
y_dev = dev_set[y_col]
X_test = test_set[features]
y_test = test_set[y_col]

# cv_splitter is built with a fixed integer random_state, so split() produces the same
# folds on every call; materialise them once and reuse them for every combination.
cv_folds = list(cv_splitter.split(dev_set))

# Rows are collected in plain lists and turned into DataFrames once at the end.
# Growing a DataFrame with pd.concat inside the loop is quadratic in the number of rows
# and depends on how pandas treats the dtypes of the initial empty frame.
dev_rows = []
test_rows = []

# Lowest test MSE seen so far. The strict "<" below keeps the first combination on ties,
# which matches how the best row is looked up later via idxmin().
best_mse = np.inf

for i, params in enumerate(ParameterGrid(hyperparmeters)):
    l1_ratio = params["l1_ratio"]
    alpha = params["alpha"]

    # --- cross-validation on the development set ---
    for j, (train_idx, val_idx) in enumerate(cv_folds):
        X_train = X_dev.iloc[train_idx]
        y_train = y_dev.iloc[train_idx]

        X_val = X_dev.iloc[val_idx]
        y_val = y_dev.iloc[val_idx]

        model = ElasticNet(l1_ratio=l1_ratio, random_state=seed, alpha=alpha)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_val)

        evaluator = RegressionMetric(list(y_val), list(y_pred))
        dev_rows.append(
            [params, i, j]
            + list(evaluator.get_metrics_by_list_names(list_metrics).values())
        )

    # --- refit on the full development set and score on the held-out test set ---
    model = ElasticNet(l1_ratio=l1_ratio, random_state=seed, alpha=alpha)
    model.fit(X_dev, y_dev)

    y_pred = model.predict(X_test)

    evaluator = RegressionMetric(list(y_test), list(y_pred))
    test_rows.append(
        [params, i]
        + list(evaluator.get_metrics_by_list_names(list_metrics).values())
    )

    # Persist the model, parameters and test predictions whenever the test MSE improves.
    test_mse = evaluator.MSE()
    if test_mse < best_mse:
        best_mse = test_mse

        # Only the best combination's predictions are written to test_set, so after the
        # loop test_set["y_pred"] matches the saved model rather than the last combination.
        test_set["y_pred"] = y_pred

        dump(model, f"{model_name}_model.joblib")
        with open(f"{model_name}_params.json", "w") as f:
            json.dump(params, f)

        test_set.to_csv(f"{model_name}_test_set.csv", index=False)

dev_results = pd.DataFrame(dev_rows, columns=dev_columns)
test_results = pd.DataFrame(test_rows, columns=test_columns)

# Expose the individual hyperparameters as their own columns for easier filtering/plotting.
test_results['l1_ratio'] = test_results['parameters'].apply(lambda x: x['l1_ratio'])
test_results['alpha'] = test_results['parameters'].apply(lambda x: x['alpha'])
dev_results['l1_ratio'] = dev_results['parameters'].apply(lambda x: x['l1_ratio'])
dev_results['alpha'] = dev_results['parameters'].apply(lambda x: x['alpha'])

dev_results.to_csv(f"{model_name}_dev_results.csv", index=False)
test_results.to_csv(f"{model_name}_test_results.csv", index=False)

In [None]:
# Show the hyperparameter dict of the first grid combination (row label 0) as a quick sanity check.
test_results.loc[0, 'parameters']

In [None]:
# Scatter plot of estimated vs. in-situ temperature on the test set for the best
# (lowest test MSE) hyperparameter combination, annotated with that combination's metrics.
best_row = test_results[test_results["MSE"] == test_results["MSE"].min()].iloc[0]
mae, mse, rmse, r2, nse, kge = best_row[['MAE', 'MSE', 'RMSE', 'R2', 'NSE', 'KGE']]

# Recompute the predictions of the best combination instead of reading test_set['y_pred']:
# that column is rewritten inside the search loop and is not guaranteed to belong to the
# combination whose metrics are annotated below.
plot_params = best_row["parameters"]
plot_model = ElasticNet(
    l1_ratio=plot_params["l1_ratio"], random_state=seed, alpha=plot_params["alpha"]
)
plot_model.fit(dev_set[features], dev_set[y_col])
plot_data = test_set.assign(y_pred=plot_model.predict(test_set[features]))

fig, ax = plt.subplots(1, 1, figsize=(5, 5))
plot_data.plot.scatter(x='avg_temp(C)', y='y_pred', ax=ax, s=.75)
# 1:1 reference line
ax.plot([0, 30], [0, 30], color='k', linestyle='--')
ax.set_xlabel('In-situ Water Temperature (C)')
ax.set_ylabel('Estimated Water Temperature (C)')
ax.set_title('Linear Regression')
# ax.set_title('ElasticNet Linear Regression')

# One annotation per metric, stacked downwards from the top-left corner of the axes.
annotations = [
    ('MAE', mae, 0.9),
    ('MSE', mse, 0.85),
    ('RMSE', rmse, 0.8),
    ('R2', r2, 0.75),
    ('NSE', nse, 0.7),
    ('KGE', kge, 0.65),
]
for label, value, y_pos in annotations:
    ax.annotate(f'{label}: {value:.2f}', xy=(0.05, y_pos), xycoords='axes fraction')

In [None]:
# Refit an ElasticNet with the best hyperparameters (lowest test MSE) on every available
# row and persist it. The dump goes to the same path the search loop used for the best
# dev-trained model, so that file is replaced by the model trained on all the data.
best_row = test_results.loc[test_results["MSE"].idxmin()]
best_params = best_row["parameters"]
l1_ratio, alpha = best_params["l1_ratio"], best_params["alpha"]

X, y = ml_input_data[features], ml_input_data[y_col]

model = ElasticNet(l1_ratio=l1_ratio, random_state=seed, alpha=alpha).fit(X, y)

dump(model, f"{model_name}_model.joblib")