#### Building Baseline Models
Baseline models serve as a reference point for evaluating the performance of more complex models. They provide a simple, interpretable way to measure improvements and to assess whether an advanced model actually performs better than a naive approach. We start with Linear Regression, then progress to Random Forest, and finally to Gradient Boosting.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Base directory that the input CSV is read from.
# NOTE(review): looks like a Colab Google Drive mount point — confirm.
root = "/content/drive/MyDrive/SnowPackPredictionChallenge"

In [None]:
# Load the feature-engineered dataset from the project directory.
# os.path.join builds the path instead of string concatenation, so a trailing
# slash on `root` cannot produce a doubled separator.
df = pd.read_csv(os.path.join(root, "feature_engineered_data.csv"))
# Bare expression: a notebook renders the frame as the cell output.
df

In [None]:
# Names of the input columns, in the order they are passed to the models.
# The groups below are only a reading aid; the result is one flat list.
features = (
    # location / terrain columns
    ["Latitude", "Longitude", "Elevation", "Southness"]
    # un-suffixed weather columns
    + ["precip", "tmin", "tmax", "SPH", "SRAD", "Rmax", "Rmin", "windspeed"]
    # SWE columns with a _lag suffix
    + ["SWE_lag1", "SWE_lag3", "SWE_lag7"]
    # weather columns with a _lag1 suffix
    + ["precip_lag1", "tmin_lag1", "tmax_lag1", "SPH_lag1"]
    + ["SRAD_lag1", "Rmax_lag1", "Rmin_lag1", "windspeed_lag1"]
    # columns with a _roll suffix
    + ["SWE_roll3", "SWE_roll7", "precip_roll3", "tmin_roll3"]
)

# Name of the column the models are fitted to predict.
target = "SWE"

In [None]:
# Two-stage split: hold out 20% of the rows as the test set, then take 20% of
# the remainder as the validation set (roughly 64% / 16% / 20% overall).
# NOTE(review): train_test_split shuffles rows by default; with lag/rolling
# features, temporally adjacent rows may land in different subsets — confirm
# this is intended.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)
train_df, val_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
)

# Separate every subset into its feature matrix (X) and target vector (y).
(X_train, y_train), (X_val, y_val), (X_test, y_test) = [
    (subset[features], subset[target])
    for subset in (train_df, val_df, test_df)
]

In [None]:
# Candidate regressors keyed by display name, added in order of increasing
# model complexity. Seeds are fixed where the estimator accepts one.
models = {}
models["Linear Regression"] = LinearRegression()
models["Random Forest"] = RandomForestRegressor(n_estimators=100, random_state=42)
models["Gradient Boosting"] = HistGradientBoostingRegressor(random_state=42)

In [None]:
# Linear Regression baseline that the following cells fit and score.
# This is a separate instance from the one stored in `models`.
model = LinearRegression()

In [None]:
# Fit the baseline on the training split and predict the validation split.
y_pred_lr = model.fit(X_train, y_train).predict(X_val)

# Validation-set scores: root mean squared error and R².
rmse = np.sqrt(mean_squared_error(y_val, y_pred_lr))
r2 = r2_score(y_val, y_pred_lr)

model_results = dict(RMSE=rmse, R2=r2)
# Bare expression: a notebook renders the dict as the cell output.
model_results

#### Model Evaluation - Compute NSE, RMSE, R² Score, and Relative Bias

In [None]:
# Predict on the held-out test split. Every metric below is computed against
# y_test, so the predictions must come from X_test; the y_pred_lr left over
# from the validation step was computed on X_val and has a different length.
# From here on y_pred_lr holds the test-set predictions.
y_pred_lr = model.predict(X_test)

# Compute RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred_lr))

# Compute R² Score
r2 = r2_score(y_test, y_pred_lr)

# Compute Actual Error (Prediction - Observed), one value per test row
actual_error = y_pred_lr - y_test

# Compute Relative Bias (%): summed error as a percentage of summed observations
relative_bias = (np.sum(actual_error) / np.sum(y_test)) * 100

# Compute NSE (Nash-Sutcliffe Efficiency):
# 1 - (sum of squared errors) / (sum of squared deviations from the observed mean)
observed_mean = np.mean(y_test)
nse = 1 - (np.sum(actual_error ** 2) / np.sum((y_test - observed_mean) ** 2))

# Create a results DataFrame. Every row holds one scalar, so the per-row
# errors are summarised by their mean rather than stored as a whole Series.
evaluation_results = pd.DataFrame({
    "Metric": [
        "Nash-Sutcliffe Efficiency (NSE)",
        "Root Mean Square Error (RMSE)",
        "R² Score",
        "Relative Bias (%)",
        "Mean Prediction Error",
    ],
    "Value": [nse, rmse, r2, relative_bias, np.mean(actual_error)],
})

# Display evaluation metrics
print("\n📊 Model Evaluation Metrics:")
print(evaluation_results)


In [None]:
# Recompute the test-set predictions and errors here so the exported rows
# always line up with test_df, whatever y_pred_lr held before this cell ran.
y_pred_lr = model.predict(X_test)
actual_error = y_pred_lr - y_test

# Per-row predictions next to the observed values, keyed by date and location.
predictions_df = pd.DataFrame({
    "Date": test_df["Date"],
    "Latitude": test_df["Latitude"],
    "Longitude": test_df["Longitude"],
    "SWE_actual": y_test,
    "SWE_predicted": y_pred_lr,
})
predictions_df.to_csv("predictions.csv", index=False)

# One summary row for this model, built from a list of records.
# DataFrame.append was removed in pandas 2.0 and never modified the frame in
# place, so the summary is constructed directly instead.
# "Actual Error" stores the mean of the per-row errors so that every cell of
# the exported CSV is a scalar.
evaluation_df = pd.DataFrame([{
    "Model": "Linear Regression",
    "RMSE": np.sqrt(mean_squared_error(y_test, y_pred_lr)),
    "Actual Error": np.mean(actual_error),
    "R² Score": r2_score(y_test, y_pred_lr),
    "NSE": 1 - np.sum(actual_error ** 2) / np.sum((y_test - np.mean(y_test)) ** 2),
    "Relative Bias (%)": np.sum(actual_error) / np.sum(y_test) * 100,
}])
evaluation_df.to_csv("evaluation.csv", index=False)