# Ohrid Water Demand - Model Experiments

Comprehensive model experimentation and comparison for hourly water demand prediction (target: `water_demand_m3_per_hour`).

## Models Evaluated
- Traditional Time Series: ARIMA, SARIMA, ETS
- Machine Learning: Random Forest, XGBoost, LightGBM
- Deep Learning: LSTM, Neural Networks
- Hybrid Ensemble: Combined approach

In [None]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score

# The project package lives in ../src; extend the path before importing it.
sys.path.append('../src')

from models.ohrid_predictor import OhridWaterDemandPredictor

# Notebook banner: the title followed by a 60-character rule.
print(
    "Model Experiments for Ohrid Water Demand Prediction",
    "=" * 60,
    sep="\n",
)

## 1. Load Engineered Features

In [None]:
# Read the engineered feature table. The first CSV column becomes the
# index, and pandas is asked to parse that index as dates.
df = pd.read_csv(
    '../data/features/ohrid_features_complete.csv',
    index_col=0,
    parse_dates=True,
)

# Report the table dimensions; the second entry is the column count.
shape = df.shape
print(f"Dataset shape: {shape}")
print(f"Features available: {shape[1]}")
print("Target variable: water_demand_m3_per_hour")

# Per-column count of missing cells, then the grand total across columns.
missing_data = df.isna().sum()
total_missing = missing_data.sum()
print(f"\nMissing values: {total_missing} total")

## 2. Data Preparation

In [None]:
# The predictor object drives every later step of the experiment.
predictor = OhridWaterDemandPredictor()

# Ask the predictor for train / validation / test inputs and targets.
# `features` is returned alongside the splits; only its length is used here.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    features,
) = predictor.prepare_data_for_modeling(df)

# One line per split, showing the shape of its input matrix.
for split_label, split_inputs in (
    ("Training set", X_train),
    ("Validation set", X_val),
    ("Test set", X_test),
):
    print(f"{split_label}: {split_inputs.shape}")
print(f"Features used: {len(features)}")

# Summary statistics of the training target only.
target_mean, target_std = y_train.mean(), y_train.std()
target_min, target_max = y_train.min(), y_train.max()
print("\nTarget variable statistics:")
print(f"Mean: {target_mean:.2f} m³/hour")
print(f"Std: {target_std:.2f} m³/hour")
print(f"Range: {target_min:.2f} - {target_max:.2f} m³/hour")

## 3. Traditional Time Series Models

In [None]:
print("Training Traditional Time Series Models...", "-" * 50, sep="\n")

# Both time-series families are fitted on the training target alone.
arima_models = predictor.fit_arima_models(y_train)
print(f"ARIMA models trained: {len(arima_models)}")

ets_models = predictor.fit_exponential_smoothing(y_train)
print(f"ETS models trained: {len(ets_models)}")

# Merge the two collections; on a name clash the ETS entry wins,
# because it is applied last.
ts_models = dict(arima_models)
ts_models.update(ets_models)
print(f"Total time series models: {len(ts_models)}")

## 4. Machine Learning Models

In [None]:
print("Training Machine Learning Models...", "-" * 50, sep="\n")

# Fit the ML models; the validation split is passed to the predictor too.
ml_models = predictor.fit_machine_learning_models(
    X_train, y_train, X_val, y_val
)
print(f"ML models trained: {len(ml_models)}")

# Show the ten largest Random Forest importances, if the predictor
# recorded any under the 'RandomForest' key.
if 'RandomForest' in predictor.feature_importance:
    rf_importance = predictor.feature_importance['RandomForest']

    # Sort (feature, importance) pairs by importance, largest first.
    ranked_pairs = sorted(
        rf_importance.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top_features = ranked_pairs[:10]

    print("\nTop 10 features (Random Forest):")
    for feature, importance in top_features:
        print(f"  {feature}: {importance:.4f}")

## 5. Deep Learning Models

In [None]:
print("Training Deep Learning Models...", "-" * 50, sep="\n")

# Fit the deep learning models with the same arguments as the ML models.
dl_models = predictor.fit_deep_learning_models(
    X_train, y_train, X_val, y_val
)
print(f"Deep learning models trained: {len(dl_models)}")

# Gather every model trained so far into one mapping. Later families
# overwrite earlier ones on a name clash (time series, then ML, then DL).
all_models = {}
for model_family in (ts_models, ml_models, dl_models):
    all_models.update(model_family)
print(f"\nTotal models trained: {len(all_models)}")

## 6. Hybrid Ensemble Model

In [None]:
print("Creating Hybrid Ensemble Model...", "-" * 50, sep="\n")

# Build the ensemble from the training split via the predictor.
ensemble_models = predictor.create_hybrid_ensemble(X_train, y_train)
ensemble_count = len(ensemble_models)
print(f"Ensemble models created: {ensemble_count}")

# Fold the ensemble entries into the overall model collection in place.
all_models.update(ensemble_models)
final_count = len(all_models)
print(f"Final model count: {final_count}")

## 7. Model Evaluation

In [None]:
print("Evaluating All Models...", "-" * 50, sep="\n")

# Score the fitted models on the held-out test split.
results = predictor.evaluate_models(X_test, y_test)

# Transpose so the outer keys of `results` become rows and the metrics
# become columns, then round to 4 decimals for display.
results_df = pd.DataFrame(results).T.round(4)

# Ascending MAE ordering puts the lowest-error model first.
results_df_sorted = results_df.sort_values(by='MAE')

print("\nModel Performance Comparison:")
print(results_df_sorted)

# The first row after sorting is the best model by MAE.
best_model = results_df_sorted.index[0]
best_mae = results_df_sorted.at[best_model, 'MAE']
best_r2 = results_df_sorted.at[best_model, 'R2']

print(f"\nBest Model: {best_model}")
print(f"Best MAE: {best_mae:.4f} m³/hour")
print(f"Best R²: {best_r2:.4f}")

## 8. Model Comparison Visualization

In [None]:
# Performance comparison plot: one bar chart per metric in a 2x2 grid.
# Bars follow the row order of results_df_sorted (ascending MAE) in every
# panel, so a given model sits at the same x position in all four charts.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (column in results_df_sorted, panel title, y-axis label), listed in
# row-major grid order to match the iteration order of axes.flat.
metric_panels = [
    ('MAE', 'Mean Absolute Error (MAE)', 'MAE (m³/hour)'),
    ('RMSE', 'Root Mean Square Error (RMSE)', 'RMSE (m³/hour)'),
    ('R2', 'R² Score', 'R² Score'),
    ('MAPE', 'Mean Absolute Percentage Error (MAPE)', 'MAPE (%)'),
]
bar_positions = range(len(results_df_sorted))

for ax, (metric, panel_title, y_label) in zip(axes.flat, metric_panels):
    ax.bar(bar_positions, results_df_sorted[metric])
    ax.set_title(panel_title)
    ax.set_ylabel(y_label)
    ax.set_xticks(bar_positions)
    # Right-anchor the rotated labels so each one ends under its own bar
    # instead of being centred across neighbouring bars.
    ax.set_xticklabels(results_df_sorted.index, rotation=45, ha='right')

plt.tight_layout()

# savefig does not create missing directories; make sure the target
# folder exists so the figure is not lost after a full training run.
figure_path = '../results/model_comparison.png'
os.makedirs(os.path.dirname(figure_path), exist_ok=True)
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

## 9. Save Experiment Results

In [None]:
# Persist the per-model metrics and a one-row summary of this run.
# Each path is defined once so the file written and the message printed
# cannot drift apart.
results_path = '../results/model_experiment_results.csv'
summary_path = '../results/experiment_summary.csv'

# to_csv fails if the directory is missing; create it up front so the
# metrics are not lost after the models have already been trained.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

# Save results (unsorted table, model names as the index column).
results_df.to_csv(results_path)
print(f"Results saved to: {results_path}")

# Create experiment summary
experiment_summary = {
    'experiment_date': pd.Timestamp.now(),
    'dataset_size': len(df),
    'features_count': len(features),
    'models_tested': len(all_models),
    'best_model': best_model,
    'best_mae': best_mae,
    'best_r2': best_r2,
    # NOTE(review): hard-coded; update if the input CSV is ever replaced
    # with non-synthetic data.
    'data_source': 'synthetic',
}

# A single-element list yields a one-row DataFrame keyed by the dict keys.
summary_df = pd.DataFrame([experiment_summary])
summary_df.to_csv(summary_path, index=False)
print(f"Summary saved to: {summary_path}")

print("\nExperiment completed successfully!")
print(f"Best performing model: {best_model}")
# NOTE(review): this message is printed unconditionally; it is not gated
# on any metric threshold.
print("Ready for production deployment.")