In [None]:
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import r2_score, mean_absolute_error,mean_squared_error
import pandas as pd
from datetime import date, timedelta
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math


In [None]:
# Constant subtracted from the raw "Date" column before it is handed to
# convert_to_datetime (see the feature-engineering cell).
fix_idio = 693963


def convert_to_datetime(days):
    """Return the calendar date that lies `days` days after 1 January 1900.

    Parameters
    ----------
    days : int or float
        Day offset from 1900-01-01; negative values yield earlier dates.

    Returns
    -------
    datetime.date
        The shifted date.
    """
    epoch = date(1900, 1, 1)
    return epoch + timedelta(days=days)


In [None]:
# Load the raw baseflow table; the path is relative to the notebook's working directory.
baseflow_df = pd.read_csv("RRCA_baseflow.csv")

In [None]:
# Shift the raw day counts and convert them to calendar dates exactly once.
# The subtraction overwrites "Date" in place, so without this guard a second
# run of the cell would subtract the offset again and push the dates outside
# the range `datetime.date` can represent. "DateTime" only exists after the
# first run, which makes it a convenient "already converted" marker.
if "DateTime" not in baseflow_df.columns:
    baseflow_df["Date"] = baseflow_df["Date"] - fix_idio
    baseflow_df["DateTime"] = baseflow_df["Date"].apply(convert_to_datetime)

baseflow_df["Segment_id"] = baseflow_df["Segment_id"].astype(str)

# Read month/year straight from the date objects instead of formatting them
# to strings and parsing the strings back to integers.
baseflow_df["month"] = baseflow_df["DateTime"].apply(lambda d: d.month).astype(int)
baseflow_df["year"] = baseflow_df["DateTime"].apply(lambda d: d.year).astype(int)

# Distance in months from July: 0 for July, up to 6 for January.
baseflow_df["month_shifted"] = (baseflow_df["month"] - 7).abs()

# Month distance mapped onto 30-day steps: 0 for January, rising to 180 for July.
baseflow_df["days_since_rec_start"] = (6 - baseflow_df["month_shifted"]) * 30


In [None]:
# Preview the first rows to check the derived date columns.
baseflow_df.head()

In [None]:
# baseflow_df.corr()
# baseflow_df.describe()

In [None]:
# One line plot per driver variable against days_since_rec_start.
# The first plot lands on the current figure; every later one opens a fresh
# figure first so the curves do not overlap.
for position, driver in enumerate(["Evapotranspiration", "Precipitation", "Irrigation_pumping"]):
    if position > 0:
        plt.figure()
    sns.lineplot(data=baseflow_df, x="days_since_rec_start", y=driver)
    plt.title(f"{driver} over Baseflow Recession")

### baseflow equation mess

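There is an equation used in hydrology to calculate the baseflow rate a given number of days after the start of the recession period: $Q_t = Q_0 e^{-a t}$, where $Q_0$ is the flow at the start of the recession, $a$ is the recession constant, and $t$ is the number of days elapsed. I thought this might be a way to estimate baseflow by: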

- calculating the slope of decline during the recession period for each segment
- using KNN to find the closest segments based on the data provided
- using the average of the closest segments' slopes
- calculating the sample's days since the start of the recession (based on month)
- plugging everything into the equation and hoping it estimates okay

(this has no regression model in it but I still thought it was kind of cool)

((it does not perform very well though))

In [None]:
# Restrict the analysis to the more recent records (year 1995 onwards).
# The cutoff is arbitrary: the author's note describes it as roughly a
# five-year window and no standard window length was identified.
# .copy() detaches the subset so later column edits do not touch baseflow_df.
current_data = baseflow_df.loc[baseflow_df["year"].ge(1995)].copy()
current_data.head()

In [None]:
# Observed baseflow against days_since_rec_start, one line per segment.
sns.lineplot(
    x="days_since_rec_start",
    y="Observed",
    hue="Segment_id",
    data=current_data,
)

# New figure so the regression plot is not drawn over the per-segment lines.
plt.figure()

# Scatter plus fitted linear trend of observed baseflow against month_shifted.
sns.regplot(x="month_shifted", y="Observed", data=current_data)


In [None]:
# Mean observed flow for every (month_shifted, segment) pair, together with
# the smallest days_since_rec_start seen in that group.
monthly_mean = (
    current_data
    .groupby(["month_shifted", "Segment_id"])
    .agg(
        Observed=("Observed", "mean"),
        days_since_rec_start=("days_since_rec_start", "min"),
    )
    .reset_index()
)

# Per segment: the largest and smallest of those monthly means.
equation = (
    monthly_mean
    .groupby("Segment_id")
    .agg(max_q=("Observed", "max"), min_q=("Observed", "min"))
    .reset_index()
)

# |min_q / max_q| is the fraction of the peak flow that remains at the minimum.
equation["division"] = (equation["min_q"] / equation["max_q"]).abs()

# Decay constant a that satisfies division = exp(-a * 180).
# Note: math.log raises ValueError when the ratio is exactly 0.
equation["neg_at"] = equation["division"].map(lambda ratio: -(1/180)*math.log(ratio))

equation

In [None]:
# Features used to measure similarity between rows.
columns = ["Date", "Evapotranspiration", "Precipitation", "y", "x", "Irrigation_pumping", "days_since_rec_start"]

N_TEST_ROWS = 30   # rows held out for evaluation
N_NEIGHBORS = 3    # neighbours whose segment parameters are averaged
SAMPLE_SEED = 42   # fixed seed so Restart & Run All reproduces the same hold-out

# Hold out a random evaluation sample.
sample_index = current_data.sample(N_TEST_ROWS, random_state=SAMPLE_SEED).index
sample = current_data.loc[sample_index, columns]
y_true = current_data.loc[sample_index, "Observed"]

# Everything else is the pool the neighbours are drawn from. Keep the full
# frame (`train`) next to the feature matrix: kneighbors() returns row
# positions within X, and Segment_id has to be read from a frame with exactly
# that row order. Looking the positions up in `current_data` would pick the
# wrong rows because the held-out rows are still present there.
train = current_data.drop(index=sample_index)
y = train["Observed"]
X = train[columns]

# Fit the neighbour index on the training features.
nbrs = NearestNeighbors(n_neighbors=N_NEIGHBORS, algorithm='ball_tree').fit(X)

# Per-segment recession parameters, indexed for direct lookup by Segment_id.
segment_params = equation.set_index("Segment_id")[["neg_at", "max_q"]]

# Query all held-out rows at once; row i of neighbor_positions belongs to
# sample_index[i].
_, neighbor_positions = nbrs.kneighbors(sample)

y_preds = []
for idx, positions in zip(sample_index, neighbor_positions):
    neighbor_segments = train.iloc[positions]["Segment_id"].values
    neighbor_params = segment_params.loc[neighbor_segments]

    # Average BOTH parameters over the neighbours. Summing max_q without
    # dividing would inflate the starting flow by a factor of N_NEIGHBORS.
    neg_at = neighbor_params["neg_at"].mean()
    q0 = neighbor_params["max_q"].mean()

    # Days since recession start for THIS held-out row (not the first one).
    d = sample.loc[idx, "days_since_rec_start"]

    # Recession estimate: Q = Q0 * exp(-a * t)
    y_preds.append(q0 * math.exp(-neg_at * d))


In [None]:
# The second metric is a root-mean-squared error, so label it as such.
# It is computed via math.sqrt rather than mean_squared_error(squared=False):
# that keyword is deprecated and has been removed from recent scikit-learn
# releases, while sqrt(MSE) works on every version.
rmse = math.sqrt(mean_squared_error(y_true.values, y_preds))

print("MAE: ", mean_absolute_error(y_true.values, y_preds))
print("RMSE: ", rmse)
print("R2: ", r2_score(y_true.values, y_preds))

# Observed vs. predicted; points on the diagonal would be perfect estimates.
ax = sns.scatterplot(x=y_true.values, y=y_preds)
ax.set(xlabel="Observed baseflow", ylabel="Predicted baseflow");