#### Feature Importance using Random Forest

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Read the feature-engineered CSV into `df` and display it.
# NOTE(review): `root` is not defined in any cell visible here — confirm it is
# set (as a string directory path) before this cell runs on a fresh kernel.
csv_path = root + '/feature_engineered_data.csv'
df = pd.read_csv(csv_path)
df

In [None]:
# Rank the candidate predictors of the target column by Random Forest
# feature importance.

target = "SWE"

# Candidate predictor columns. The order matters: it is the column order the
# model is trained on, and therefore the order of `feature_importances_`.
selected_features = [
    # location / terrain columns
    "Latitude", "Longitude", "Elevation", "Southness",
    # same-row weather columns
    "precip", "tmin", "tmax", "SPH", "SRAD", "Rmax", "Rmin", "windspeed",
    # lagged target columns
    "SWE_lag1", "SWE_lag3", "SWE_lag7",
    # `_lag1` versions of the weather columns
    "precip_lag1", "tmin_lag1", "tmax_lag1", "SPH_lag1",
    "SRAD_lag1", "Rmax_lag1", "Rmin_lag1", "windspeed_lag1",
    # `_roll3` / `_roll7` columns
    "SWE_roll3", "SWE_roll7", "precip_roll3", "tmin_roll3",
]

# Keep only rows where every predictor and the target are present.
required_columns = [*selected_features, target]
df = df.dropna(subset=required_columns)

# Hold out 20% of the rows; the fixed seed makes the split repeatable.
predictors = df[selected_features]
response = df[target]
X_train, X_val, y_train, y_val = train_test_split(
    predictors,
    response,
    test_size=0.2,
    random_state=42,
)

# Fit the forest on the training split only (`fit` returns the estimator).
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Pair each feature name with its importance score, most important first.
importance_table = pd.DataFrame({
    "Feature": selected_features,
    "Importance": rf_model.feature_importances_,
})
feature_importances = importance_table.sort_values(by="Importance", ascending=False)

In [None]:
# Display the feature-importance table (sorted by descending importance when it was built).
feature_importances

In [None]:
# Plot the Random Forest feature importances held in `feature_importances`
# (columns "Feature" and "Importance", built from the fitted `rf_model`).
#
# The chart uses the model's real scores rather than randomly generated
# placeholder values, so the figure matches its title, is reproducible, and
# `feature_importances` is not clobbered for any later cell.

# Re-sort defensively so the bar order does not depend on the table's current order.
ranked_importances = feature_importances.sort_values(by="Importance", ascending=False)

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(ranked_importances["Feature"], ranked_importances["Importance"], align='center')
ax.set_xlabel("Importance Score")
ax.set_ylabel("Feature")
ax.set_title("Feature Importance using Random Forest")
ax.invert_yaxis()  # Highest importance at the top
plt.show()