In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
root = '/content/drive/MyDrive/SnowPackPredictionChallenge'

In [4]:
# parse swe_data
df = pd.read_csv(root + '/feature_engineered_data.csv')
df

Unnamed: 0,Station,Latitude,Longitude,Elevation,Southness,Date,SWE,nearest_md_latitude,nearest_md_longitude,precip,...,Rmin_roll3,Rmin_roll7,windspeed_roll3,windspeed_roll7,temp_range,snowfall,humidity_diff,day_of_year,humidity_temp_interaction,wind_humidity_interaction
0,Hannagan Meadows,33.65352,-109.30877,9027,0.888152,1991-01-08,279.40,33.65625,-109.28125,0.00,...,45.120000,43.608571,2.963333,4.830000,17.02,0.00,58.98,8,425.8356,155.1174
1,Hannagan Meadows,33.65352,-109.30877,9027,0.888152,1991-01-09,279.40,33.65625,-109.28125,0.35,...,37.123333,40.572857,3.130000,4.901429,15.22,0.00,70.67,9,446.6344,291.8671
2,Hannagan Meadows,33.65352,-109.30877,9027,0.888152,1991-01-10,281.94,33.65625,-109.28125,0.35,...,33.226667,40.642857,3.630000,4.955714,15.22,0.00,70.67,10,446.6344,291.8671
3,Hannagan Meadows,33.65352,-109.30877,9027,0.888152,1991-01-11,281.94,33.65625,-109.28125,0.00,...,28.913333,38.191429,4.096667,4.278571,17.62,0.00,71.92,11,403.4712,289.8376
4,Hannagan Meadows,33.65352,-109.30877,9027,0.888152,1991-01-12,281.94,33.65625,-109.28125,0.00,...,26.910000,35.060000,3.933333,3.545714,19.28,0.00,55.43,12,505.5216,201.7652
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1897995,Garver Creek,48.97523,-115.81915,4250,-0.927766,2016-12-27,73.66,48.96875,-115.84375,0.00,...,55.280000,45.525714,2.180000,2.528571,7.84,0.00,21.00,362,-85.4700,45.7800
1897996,Garver Creek,48.97523,-115.81915,4250,-0.927766,2016-12-28,81.28,48.96875,-115.84375,0.00,...,55.280000,47.964286,2.180000,2.441429,7.84,0.00,21.00,363,-85.4700,45.7800
1897997,Garver Creek,48.97523,-115.81915,4250,-0.927766,2016-12-29,83.82,48.96875,-115.84375,3.70,...,55.673333,50.571429,2.523333,2.501429,9.42,3.70,30.36,364,-17.9124,97.4556
1897998,Garver Creek,48.97523,-115.81915,4250,-0.927766,2016-12-30,83.82,48.96875,-115.84375,3.70,...,56.066667,53.178571,2.866667,2.561429,9.42,3.70,30.36,365,-17.9124,97.4556


In [5]:
# Extract features and target variable
features = ["Latitude", "Longitude", "Elevation", "Southness",
    "precip", "tmin", "tmax", "SPH", "SRAD", "Rmax", "Rmin", "windspeed",
    "SWE_lag1", "SWE_lag3", "SWE_lag7",
    "precip_lag1", "tmin_lag1", "tmax_lag1", "SPH_lag1",
    "SRAD_lag1", "Rmax_lag1", "Rmin_lag1", "windspeed_lag1",
    "SWE_roll3", "SWE_roll7", "precip_roll3", "tmin_roll3"]
target = "SWE"

In [7]:
# Perform Feature Importance Analysis using Random Forest

selected_features = [
    "Latitude", "Longitude", "Elevation", "Southness",
    "precip", "tmin", "tmax", "SPH", "SRAD", "Rmax", "Rmin", "windspeed",
    "SWE_lag1", "SWE_lag3", "SWE_lag7",
    "precip_lag1", "tmin_lag1", "tmax_lag1", "SPH_lag1",
    "SRAD_lag1", "Rmax_lag1", "Rmin_lag1", "windspeed_lag1",
    "SWE_roll3", "SWE_roll7", "precip_roll3", "tmin_roll3"
]

target = "SWE"

# Drop rows with missing values to ensure clean training data
df = df.dropna(subset=selected_features + [target])

# Splitting data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(df[selected_features], df[target], test_size=0.2, random_state=42)

# Train a Random Forest model for feature importance ranking
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Get feature importance scores
feature_importances = pd.DataFrame({
    "Feature": selected_features,
    "Importance": rf_model.feature_importances_
}).sort_values(by="Importance", ascending=False)

# Display feature importance results
# import ace_tools as tools
# tools.display_dataframe_to_user(name="Feature Importance Rankings", dataframe=feature_importances)

KeyboardInterrupt: 

In [8]:
feature_importances

Unnamed: 0,Feature,Importance
23,SWE_roll3,0.99899
13,SWE_lag3,0.00059
12,SWE_lag1,0.000241
14,SWE_lag7,1.5e-05
24,SWE_roll7,1.3e-05
6,tmax,1.1e-05
19,SRAD_lag1,1e-05
8,SRAD,9e-06
4,precip,9e-06
15,precip_lag1,9e-06


In [6]:
# Splitting data into training, validation, and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

X_train, y_train = train_df[features], train_df[target]
X_val, y_val = val_df[features], val_df[target]
X_test, y_test = test_df[features], test_df[target]

In [7]:
# Initialize models
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": HistGradientBoostingRegressor(random_state=42)
}

In [8]:
# Train and evaluate models
model_results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    r2 = r2_score(y_val, y_pred)
    model_results[name] = {"RMSE": rmse, "R2": r2}

# Display model results
# import ace_tools as tools
# model_results_df = pd.DataFrame(model_results).T
# tools.display_dataframe_to_user(name="Model Selection Results", dataframe=model_results_df)

In [9]:
model_results

{'Linear Regression': {'RMSE': 3.5136256384049993, 'R2': 0.9998104371788472},
 'Random Forest': {'RMSE': 3.4737683835999738, 'R2': 0.9998147134457677},
 'Gradient Boosting': {'RMSE': 13.11325243286824, 'R2': 0.9973596406878305}}

**Implementing LSTM for the Time Series Data**

In [10]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Drop missing values
df = df.dropna(subset=features + [target])

# Normalize the features using StandardScaler
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])

# Splitting data into training and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

X_train, y_train = train_df[features].values, train_df[target].values
X_val, y_val = val_df[features].values, val_df[target].values
X_test, y_test = test_df[features].values, test_df[target].values

# Reshape input for LSTM: (samples, time steps, features)
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_val = X_val.reshape((X_val.shape[0], 1, X_val.shape[1]))
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Build the LSTM model
model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(1, X_train.shape[2])),
    Dropout(0.2),
    LSTM(32, return_sequences=False),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dense(1)  # Output layer for SWE prediction
])

# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32,
                    validation_data=(X_val, y_val), verbose=1)

# Evaluate the model on test data
test_loss, test_mae = model.evaluate(X_test, y_test, verbose=0)
print(f"\n✅ Advanced LSTM Model - Test MAE: {test_mae:.4f}")

# Make predictions
y_pred = model.predict(X_test).flatten()

# Compute evaluation metrics
from sklearn.metrics import mean_squared_error, r2_score
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"\n📊 Advanced LSTM Model Performance:")
print(f"Root Mean Square Error (RMSE): {rmse:.4f}")
print(f"R² Score: {r2:.4f}")


  super().__init__(**kwargs)


Epoch 1/50
[1m37960/37960[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m168s[0m 4ms/step - loss: 9040.6035 - mae: 26.1305 - val_loss: 83.1135 - val_mae: 4.9374
Epoch 2/50
[1m37960/37960[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 4ms/step - loss: 618.8207 - mae: 11.6092 - val_loss: 76.1882 - val_mae: 4.3896
Epoch 3/50
[1m37960/37960[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m207s[0m 4ms/step - loss: 479.7762 - mae: 10.5127 - val_loss: 94.8718 - val_mae: 5.0437
Epoch 4/50
[1m37960/37960[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 4ms/step - loss: 376.8792 - mae: 9.3343 - val_loss: 72.5596 - val_mae: 3.7001
Epoch 5/50
[1m37960/37960[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 4ms/step - loss: 319.9100 - mae: 8.7464 - val_loss: 95.7505 - val_mae: 4.3054
Epoch 6/50
[1m37960/37960[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 4ms/step - loss: 285.3298 - mae: 8.3380 - val_loss: 51.4117 - val_mae: 3.6729
Epoch 7/50
[1m37960/37960[0m

## **Model Evaluation**

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# Placeholder: Load test set predictions (Update with actual file path)
test_results_path = "/path/to/test_predictions.csv"  # Update this path
test_results = pd.read_csv(test_results_path)

# Ensure required columns exist
required_columns = ["Date", "Latitude", "Longitude", "SWE_actual", "SWE_predicted"]
if not all(col in test_results.columns for col in required_columns):
    raise ValueError("Test results file must contain: 'Date', 'Latitude', 'Longitude', 'SWE_actual', 'SWE_predicted'.")

# Extract actual and predicted SWE values
y_test = test_results["SWE_actual"].values
y_pred = test_results["SWE_predicted"].values

# Compute RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Compute R² Score
r2 = r2_score(y_test, y_pred)

# Compute Relative Bias (%)
relative_bias = (np.sum(y_pred - y_test) / np.sum(y_test)) * 100

# Compute Actual Error (Prediction - Observed)
actual_error = y_pred - y_test

# Compute NSE (Nash-Sutcliffe Efficiency)
observed_mean = np.mean(y_test)
nse = 1 - (np.sum((y_pred - y_test) ** 2) / np.sum((y_test - observed_mean) ** 2))

# Create a results DataFrame
evaluation_results = pd.DataFrame({
    "Metric": ["Nash-Sutcliffe Efficiency (NSE)", "Root Mean Square Error (RMSE)", "R² Score", "Relative Bias (%)"],
    "Value": [nse, rmse, r2, relative_bias]
})

# Display evaluation metrics
print("\n📊 Model Evaluation Metrics:")
print(evaluation_results)

# Add actual error column to test results for further analysis
test_results["Prediction Error"] = actual_error

# Display test results with errors
print("\n📊 Test Predictions with Errors:")
print(test_results.head())  # Display first few rows for verification
