In [18]:
# Install required libraries (if needed)
!pip install pandas scikit-learn openpyxl matplotlib

# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Read the 'Scenario3' worksheet of the workbook into a DataFrame
data = pd.read_excel(
    './data/Dataset.xlsx',
    sheet_name='Scenario3',
)

# Header cells may carry leading/trailing whitespace; strip it so the
# column lookups below match exactly
stripped_headers = data.columns.str.strip()
data.columns = stripped_headers

# Echo the cleaned column names as a sanity check
print(data.columns)

# Subset of columns relevant to the prediction task
selected_columns = data.loc[:, [
    'INFLOW',
    'RESERVOIR STORAGE',
    'EVAPORATION',
    'RESERVOIR LEVEL',
    'RELEASE (OUTFLOW)',
]]

# Preview the first five rows of the full table to see its structure
print(data.head(5))


# Features (X) and target variable (y).
# The target column must NOT appear among the features: feeding
# 'RELEASE (OUTFLOW)' to the model as an input hands it the answer, which
# makes the evaluation metrics below meaningless (target leakage).
feature_columns = ['INFLOW', 'EVAPORATION', 'RESERVOIR LEVEL', 'RESERVOIR STORAGE']
target_column = 'RELEASE (OUTFLOW)'
assert target_column not in feature_columns, "target leaked into the feature set"

X = data[feature_columns]  # independent variables
y = data[target_column]    # dependent variable (target)

# Split the dataset into training and testing sets (80% train, 20% test).
# random_state is fixed so the split is reproducible across runs.
# NOTE(review): the rows look like a monthly time series; a random split mixes
# past and future months - confirm this is acceptable for the use case.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest Regressor model (100 trees, fixed seed)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model on the training split only
rf_model.fit(X_train, y_train)

# Make predictions on the held-out test data
y_pred = rf_model.predict(X_test)

# Evaluate the model performance on the held-out test data
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


# Optional diagnostic: rank the input features by the importance score the
# fitted forest assigned to each, largest first
feature_importances = pd.DataFrame(
    {"importance": rf_model.feature_importances_},
    index=X.columns,
).sort_values(by="importance", ascending=False)

print(feature_importances)




[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Index(['Year', 'Month', 'INFLOW', 'RESERVOIR STORAGE', 'EVAPORATION',
       'RESERVOIR LEVEL', 'RELEASE (OUTFLOW)'],
      dtype='object')
     Year     Month   INFLOW  RESERVOIR STORAGE  EVAPORATION  RESERVOIR LEVEL  \
0  1989.0   January   613.89               2.53          5.1           484.22   
1     NaN  February   640.20               1.84         -9.2           477.17   
2     NaN     March   785.00               0.79        -19.8           462.44   
3     NaN     April  1005.60               0.45        -19.5           455.66   
4     NaN       May  1256.70               0.67        -20.4           460.13   

   RELEASE (OUTFLOW)  
0              583.6  
1              589.3  
2              730.2  
3             1338.8  
4      