In [None]:
# ============================
# 1. Import Necessary Libraries
# ============================

import pandas as pd
import numpy as np
import random
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import joblib  # For saving models
from tqdm import tqdm  # For progress bars

# ============================
# 2. Set Random Seed
# ============================

def set_seed(seed):
    """Seed NumPy's global RNG and Python's ``random`` module with *seed*."""
    # Same order as before: NumPy first, then the standard library.
    for seed_rng in (np.random.seed, random.seed):
        seed_rng(seed)


set_seed(42)

# ============================
# 3. Load and Inspect Data
# ============================

# Read the dataset
data_path = 'path_to_your_data.csv'  # Replace with your actual file path
data_df = pd.read_csv(data_path)

# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("Dataset Information:")
data_df.info()

# Display the first few rows of the dataset
print("\nFirst 5 Rows of the Dataset:")
print(data_df.head())

# ============================
# 4. Data Preprocessing
# ============================

# Assume the last column is the target variable (adjust if necessary)
feature_cols = data_df.columns[:-1]
target_col = data_df.columns[-1]

X = data_df[feature_cols].values
y = data_df[target_col].values

# Check for missing values
if np.isnan(X).any() or np.isnan(y).any():
    print("Missing values detected. Performing imputation...")
    # Simple imputation: replace each NaN with the mean of its own column.
    # The mean must be taken along axis 0; np.nanmean(X) without an axis
    # collapses the whole matrix into one global mean, which would fill every
    # feature with the same (wrongly scaled) value.
    column_means = np.nanmean(X, axis=0)
    X = np.where(np.isnan(X), column_means, X)
    # y is one-dimensional, so its overall mean is its column mean.
    y = np.nan_to_num(y, nan=np.nanmean(y))

# Feature Scaling
# NOTE(review): the scaler (like the imputation above) is fit on the full
# dataset before cross-validation, so validation folds influence the scaling
# statistics. The pipeline-based cells in this notebook avoid that by scaling
# inside each fold.
scaler = StandardScaler()
X = scaler.fit_transform(X)

print("\nData Preprocessing Completed.")

# ============================
# 5. Define Model and Hyperparameter Search Space
# ============================

# Define the hyperparameter search space
search_space = {
    'model': ['LinearRegression', 'Ridge', 'Lasso', 'ElasticNet'],
    'alpha': [0.1, 1.0, 10.0],  # Applicable to Ridge, Lasso, ElasticNet
    'l1_ratio': [0.2, 0.5, 0.8],  # Applicable to ElasticNet
}

# Generate hyperparameter combinations.
# Each entry is one concrete candidate: a flat dict with a string 'model' name
# and scalar hyperparameter values. The training loop compares
# params['model'] against plain strings and passes params['alpha'] straight to
# the estimator, so list-valued entries would never match any model.
param_grid = []
for model in search_space['model']:
    if model == 'LinearRegression':
        # No tunable hyperparameters.
        param_grid.append({'model': model})
    elif model in ('Ridge', 'Lasso'):
        param_grid.extend(
            {'model': model, 'alpha': alpha}
            for alpha in search_space['alpha']
        )
    elif model == 'ElasticNet':
        param_grid.extend(
            {'model': model, 'alpha': alpha, 'l1_ratio': l1_ratio}
            for alpha in search_space['alpha']
            for l1_ratio in search_space['l1_ratio']
        )

# ============================
# 6. Five-Fold Cross-Validation and Model Training
# ============================

# Five shuffled folds with a fixed seed so the splits are repeatable
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One averaged-metrics record per hyperparameter combination
all_results = []

print("Starting five-fold cross-validation and hyperparameter search...\n")

# (model name, factory) pairs; the factory receives the current params dict
model_factories = (
    ('LinearRegression', lambda p: LinearRegression()),
    ('Ridge', lambda p: Ridge(alpha=p['alpha'])),
    ('Lasso', lambda p: Lasso(alpha=p['alpha'])),
    ('ElasticNet', lambda p: ElasticNet(alpha=p['alpha'], l1_ratio=p['l1_ratio'])),
)

for params in tqdm(param_grid, desc="Model Combinations"):
    # Per-fold scores, keyed '<split>_<metric>' (train_* first, then val_*)
    fold_metrics = {
        f'{split}_{metric}': []
        for split in ('train', 'val')
        for metric in ('mae', 'mse', 'rmse', 'r2')
    }

    for train_index, val_index in kf.split(X):
        X_train, y_train = X[train_index], y[train_index]
        X_val, y_val = X[val_index], y[val_index]

        # Instantiate the estimator requested by this combination
        for known_name, factory in model_factories:
            if params['model'] == known_name:
                model = factory(params)
                break
        else:
            raise ValueError(f"Unsupported model: {params['model']}")

        model.fit(X_train, y_train)

        # Score the fitted model on the training fold, then the held-out fold
        for split, features, targets in (
            ('train', X_train, y_train),
            ('val', X_val, y_val),
        ):
            predictions = model.predict(features)
            mse = mean_squared_error(targets, predictions)
            fold_metrics[f'{split}_mae'].append(mean_absolute_error(targets, predictions))
            fold_metrics[f'{split}_mse'].append(mse)
            fold_metrics[f'{split}_rmse'].append(np.sqrt(mse))
            fold_metrics[f'{split}_r2'].append(r2_score(targets, predictions))

    # Average every metric over the folds and remember which params produced it
    avg_metrics = {metric: np.mean(values) for metric, values in fold_metrics.items()}
    avg_metrics['params'] = params
    all_results.append(avg_metrics)

# ============================
# 7. Results Analysis and Saving
# ============================

# One row per hyperparameter combination
results_df = pd.DataFrame(all_results)

# Rank by mean validation R² (best first) and keep the ten strongest rows
ranked_results = results_df.sort_values(by='val_r2', ascending=False)
top_results = ranked_results.head(10)
print("\nTop 10 Hyperparameter Combinations Based on Average Validation R²:")
print(top_results[['params', 'val_r2']])

# Write the complete, unsorted table to Excel for later inspection
results_df.to_excel('LR_Model_AllResults.xlsx', index=False)
print("\nAll hyperparameter search results have been saved to 'LR_Model_AllResults.xlsx'")

# ============================
# 8. Extract Best Hyperparameter Combination
# ============================

# Locate the row with the highest validation R² and read its params dict
best_index = results_df['val_r2'].idxmax()
best_result = results_df.loc[best_index]
best_params = best_result['params']
print(f"\nBest Hyperparameter Combination: {best_params}")

# ============================
# 9. Train Final Model with Best Hyperparameters
# ============================

# Build the estimator named by the winning configuration.
# Factories are lazy so only the hyperparameters of the chosen model are read.
final_model_builders = (
    ('LinearRegression', lambda: LinearRegression()),
    ('Ridge', lambda: Ridge(alpha=best_params['alpha'])),
    ('Lasso', lambda: Lasso(alpha=best_params['alpha'])),
    ('ElasticNet', lambda: ElasticNet(alpha=best_params['alpha'],
                                      l1_ratio=best_params['l1_ratio'])),
)
for candidate_name, build_model in final_model_builders:
    if best_params['model'] == candidate_name:
        final_model = build_model()
        break
else:
    raise ValueError(f"Unsupported model: {best_params['model']}")

# Fit on every available row, then predict those same rows
final_model.fit(X, y)
y_pred = final_model.predict(X)

# In-sample error metrics of the final model
final_mse = mean_squared_error(y, y_pred)
final_rmse = np.sqrt(final_mse)
final_mae = mean_absolute_error(y, y_pred)
final_r2 = r2_score(y, y_pred)

print("\nFinal Model Performance on the Entire Dataset:")
print(f"MAE: {final_mae:.4f}")
print(f"MSE: {final_mse:.4f}")
print(f"RMSE: {final_rmse:.4f}")
print(f"R²: {final_r2:.4f}")

# ============================
# 10. Save the Best Model
# ============================

# Persist the fitted scaler first, then the fitted model, with joblib
for artifact, artifact_path in (
    (scaler, 'LR_Scaler.pkl'),
    (final_model, 'LR_FinalModel.pkl'),
):
    joblib.dump(artifact, artifact_path)
print("\nScaler and final model have been saved.")

# ============================
# 11. Save Best Hyperparameters and Final Results
# ============================

# One-row summary: the chosen hyperparameters plus the final metrics
results_to_save = {
    column: [value]
    for column, value in (
        ('Best Params', best_params),
        ('Final MAE', final_mae),
        ('Final MSE', final_mse),
        ('Final RMSE', final_rmse),
        ('Final R²', final_r2),
    )
}

# Turn the summary into a DataFrame and write it to Excel
results_summary = pd.DataFrame(results_to_save)
results_summary.to_excel('LR_Model_BestResults.xlsx', index=False)
print("\nBest model results have been saved to 'LR_Model_BestResults.xlsx'")


In [None]:
# ============================
# 1. Import Necessary Libraries
# ============================

import pandas as pd
import numpy as np
import random
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import joblib  # For saving models
from tqdm import tqdm  # For progress bars

# ============================
# 2. Set Random Seed for Reproducibility
# ============================

def set_seed(seed):
    """
    Seed NumPy's global RNG and Python's ``random`` module with the same value
    so that runs are repeatable.
    """
    # NumPy first, then the standard library, as before.
    for seed_rng in (np.random.seed, random.seed):
        seed_rng(seed)


set_seed(42)  # You can choose any seed value you prefer

# ============================
# 3. Load and Inspect Data
# ============================

# Define the path to your CSV data file
data_path = 'path_to_your_data.csv'  # <-- Replace with your actual file path

# Load the dataset into a Pandas DataFrame
try:
    data_df = pd.read_csv(data_path)
    print("Dataset loaded successfully.\n")
except FileNotFoundError:
    print(f"File not found at the specified path: {data_path}")
    # Stop with a non-zero status. exit() is a convenience injected by the
    # `site` module for interactive use and reports success (status 0);
    # raising SystemExit works everywhere and signals the failure.
    raise SystemExit(1)

# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("Dataset Information:")
data_df.info()

# Display the first five rows of the dataset
print("\nFirst 5 Rows of the Dataset:")
print(data_df.head())

# ============================
# 4. Data Preprocessing
# ============================

# Assume the last column is the target variable (adjust if necessary)
feature_cols = data_df.columns[:-1]
target_col = data_df.columns[-1]

# Extract features and target variable as NumPy arrays
X = data_df[feature_cols].values
y = data_df[target_col].values

# Check for missing values in features and target
if np.isnan(X).any() or np.isnan(y).any():
    print("\nMissing values detected. Performing imputation...")
    # Replace each NaN with the mean of its own column.
    # The mean must be taken along axis 0; np.nanmean(X) without an axis
    # collapses the whole matrix into one global mean, which would fill every
    # feature with the same (wrongly scaled) value.
    column_means = np.nanmean(X, axis=0)
    X = np.where(np.isnan(X), column_means, X)
    # y is one-dimensional, so its overall mean is its column mean.
    y = np.nan_to_num(y, nan=np.nanmean(y))
    print("Missing values imputed with column means.")
else:
    print("\nNo missing values detected.")

# ============================
# 5. Define Pipeline and Hyperparameter Grid
# ============================

# Create a pipeline that first scales the data and then applies SGDRegressor.
# Because the scaler is a pipeline step, GridSearchCV re-fits it on the
# training part of every fold.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('sgdregressor', SGDRegressor(
        max_iter=1000,
        tol=1e-3,
        learning_rate='constant',  # To use eta0 effectively
        random_state=42
    ))
])

# Define the hyperparameter grid for GridSearchCV.
# Only parameters that SGDRegressor actually exposes may appear here: it has
# no `momentum` argument, and GridSearchCV rejects unknown parameter names
# before fitting anything, so the grid is limited to the learning rate.
param_grid = {
    'sgdregressor__eta0': [0.01, 0.05, 0.1, 0.5],          # Learning rates
}

# ============================
# 6. Five-Fold Cross-Validation and Grid Search
# ============================

# Initialize K-Fold cross-validation with 5 splits
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize GridSearchCV with the pipeline and hyperparameter grid
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='r2',              # Using R² as the evaluation metric
    cv=kf,
    n_jobs=-1,                 # Utilize all available CPU cores
    verbose=2,                 # Verbosity level: 0, 1, or 2
    # Training scores are off by default; the results table below reads
    # mean_train_score / std_train_score, so they must be requested.
    return_train_score=True
)

print("Starting five-fold cross-validation and grid search...\n")

# Fit GridSearchCV to the data
grid_search.fit(X, y)

print("\nGrid search completed.")

# ============================
# 7. Results Analysis and Saving
# ============================

# Extract all grid search results into a DataFrame
results_df = pd.DataFrame(grid_search.cv_results_)

# Select relevant columns for clarity (one param_* column per grid entry)
selected_columns = [
    'param_sgdregressor__eta0',
    'mean_test_score',
    'std_test_score',
    'mean_train_score',
    'std_train_score'
]
results_selected = results_df[selected_columns]

# Rename columns for better readability
results_selected = results_selected.rename(columns={
    'param_sgdregressor__eta0': 'eta0',
    'mean_test_score': 'mean_val_r2',
    'std_test_score': 'std_val_r2',
    'mean_train_score': 'mean_train_r2',
    'std_train_score': 'std_train_r2'
})

# Sort the results by mean validation R² in descending order and select top 10
top_results = results_selected.sort_values(by='mean_val_r2', ascending=False).head(10)
print("\nTop 10 Hyperparameter Combinations Based on Average Validation R²:")
print(top_results)

# Save all grid search results to an Excel file for further analysis
results_selected.to_excel('SGD_Model_AllResults.xlsx', index=False)
print("\nAll hyperparameter search results have been saved to 'SGD_Model_AllResults.xlsx'.")

# ============================
# 8. Extract Best Hyperparameter Combination
# ============================

# Get the best hyperparameters
best_params = grid_search.best_params_
print("\nBest Hyperparameter Combination:")
print(best_params)

# Get the best validation R² score
best_score = grid_search.best_score_
print(f"Best Validation R² Score: {best_score:.4f}")

# ============================
# 9. Train Final Model with Best Hyperparameters
# ============================

# GridSearchCV uses refit=True by default, so best_estimator_ is already the
# pipeline re-trained on the entire dataset with the best hyperparameters.
# Fitting it again would only repeat that work.
final_pipeline = grid_search.best_estimator_

# Make predictions on the entire dataset
y_pred = final_pipeline.predict(X)

# Calculate performance metrics.
# These are in-sample figures (same rows the model was trained on);
# best_score above is the cross-validated estimate.
final_mae = mean_absolute_error(y, y_pred)
final_mse = mean_squared_error(y, y_pred)
final_rmse = np.sqrt(final_mse)
final_r2 = r2_score(y, y_pred)

print("\nFinal Model Performance on the Entire Dataset:")
print(f"MAE: {final_mae:.4f}")
print(f"MSE: {final_mse:.4f}")
print(f"RMSE: {final_rmse:.4f}")
print(f"R²: {final_r2:.4f}")

# ============================
# 10. Save the Best Model and Scaler
# ============================

# Persist the fitted pipeline (scaler + regressor in one object) with joblib
model_filename = 'SGD_FinalModel_Pipeline.pkl'
joblib.dump(final_pipeline, model_filename)
print(f"\nFinal model pipeline (including scaler) has been saved as '{model_filename}'.")

# ============================
# 11. Save Best Hyperparameters and Final Results
# ============================

summary_filename = 'SGD_Model_BestResults.xlsx'

# One-row summary: the chosen hyperparameters plus the final metrics
results_to_save = {
    column: [value]
    for column, value in (
        ('Best Params', best_params),
        ('Final MAE', final_mae),
        ('Final MSE', final_mse),
        ('Final RMSE', final_rmse),
        ('Final R²', final_r2),
    )
}

# Turn the summary into a DataFrame and write it to Excel
results_summary = pd.DataFrame(results_to_save)
results_summary.to_excel(summary_filename, index=False)
print(f"\nBest model results have been saved to '{summary_filename}'.")

# ============================
# 12. Conclusion
# ============================

print("\nScript execution completed successfully.")


In [None]:
# ============================
# 1. Import Necessary Libraries
# ============================

import pandas as pd
import numpy as np
import random
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import joblib  # For saving models
from tqdm import tqdm  # For progress bars

# ============================
# 2. Set Random Seed for Reproducibility
# ============================

def set_seed(seed):
    """
    Seed NumPy's global RNG and Python's ``random`` module with the same value
    so that runs are repeatable.
    """
    # NumPy first, then the standard library, as before.
    for seed_rng in (np.random.seed, random.seed):
        seed_rng(seed)


set_seed(42)  # You can choose any seed value you prefer

# ============================
# 3. Load and Inspect Data
# ============================

# Define the path to your CSV data file
data_path = 'path_to_your_data.csv'  # <-- Replace with your actual file path

# Load the dataset into a Pandas DataFrame
try:
    data_df = pd.read_csv(data_path)
    print("Dataset loaded successfully.\n")
except FileNotFoundError:
    print(f"File not found at the specified path: {data_path}")
    # Stop with a non-zero status. exit() is a convenience injected by the
    # `site` module for interactive use and reports success (status 0);
    # raising SystemExit works everywhere and signals the failure.
    raise SystemExit(1)

# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("Dataset Information:")
data_df.info()

# Display the first five rows of the dataset
print("\nFirst 5 Rows of the Dataset:")
print(data_df.head())

# ============================
# 4. Data Preprocessing
# ============================

# Assume the last column is the target variable (adjust if necessary)
feature_cols = data_df.columns[:-1]
target_col = data_df.columns[-1]

# Extract features and target variable as NumPy arrays
X = data_df[feature_cols].values
y = data_df[target_col].values

# Check for missing values in features and target
if np.isnan(X).any() or np.isnan(y).any():
    print("\nMissing values detected. Performing imputation...")
    # Replace each NaN with the mean of its own column.
    # The mean must be taken along axis 0; np.nanmean(X) without an axis
    # collapses the whole matrix into one global mean, which would fill every
    # feature with the same (wrongly scaled) value.
    column_means = np.nanmean(X, axis=0)
    X = np.where(np.isnan(X), column_means, X)
    # y is one-dimensional, so its overall mean is its column mean.
    y = np.nan_to_num(y, nan=np.nanmean(y))
    print("Missing values imputed with column means.")
else:
    print("\nNo missing values detected.")

# ============================
# 5. Define Pipeline and Hyperparameter Grid
# ============================

# Create a pipeline that first scales the data and then applies RandomForestRegressor
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('randomforestregressor', RandomForestRegressor(random_state=42))
])

# Define the hyperparameter grid for GridSearchCV
param_grid = {
    'randomforestregressor__n_estimators': [100, 200, 300, 500],       # Number of trees
    'randomforestregressor__max_depth': [None, 10, 20, 30],           # Maximum depth of the tree
    'randomforestregressor__min_samples_split': [2, 5, 10],           # Minimum number of samples required to split an internal node
    'randomforestregressor__min_samples_leaf': [1, 2, 4],             # Minimum number of samples required to be at a leaf node
    # Number of features to consider when looking for the best split.
    # 1.0 means "all features", which is what 'auto' stood for on regressors;
    # the 'auto' string was removed in scikit-learn 1.3 and is rejected there.
    'randomforestregressor__max_features': [1.0, 'sqrt', 'log2']
}

# ============================
# 6. Five-Fold Cross-Validation and Grid Search
# ============================

# Initialize K-Fold cross-validation with 5 splits
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize GridSearchCV with the pipeline and hyperparameter grid
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='r2',              # Using R² as the evaluation metric
    cv=kf,
    n_jobs=-1,                 # Utilize all available CPU cores
    verbose=2,                 # Verbosity level: 0, 1, or 2
    # Training scores are off by default; the results table built afterwards
    # reads mean_train_score / std_train_score, so they must be requested.
    return_train_score=True
)

print("Starting five-fold cross-validation and grid search...\n")

# Fit GridSearchCV to the data
grid_search.fit(X, y)

print("\nGrid search completed.")

# ============================
# 7. Results Analysis and Saving
# ============================

# Load the raw cross-validation table produced by the grid search
results_df = pd.DataFrame(grid_search.cv_results_)

# cv_results_ column name -> short, readable label (order defines the output)
column_labels = {
    'param_randomforestregressor__n_estimators': 'n_estimators',
    'param_randomforestregressor__max_depth': 'max_depth',
    'param_randomforestregressor__min_samples_split': 'min_samples_split',
    'param_randomforestregressor__min_samples_leaf': 'min_samples_leaf',
    'param_randomforestregressor__max_features': 'max_features',
    'mean_test_score': 'mean_val_r2',
    'std_test_score': 'std_val_r2',
    'mean_train_score': 'mean_train_r2',
    'std_train_score': 'std_train_r2',
}

# Keep only the columns of interest, then apply the readable labels
selected_columns = list(column_labels)
results_selected = results_df[selected_columns].rename(columns=column_labels)

# Rank by mean validation R² (best first) and keep the ten strongest rows
ranked_results = results_selected.sort_values(by='mean_val_r2', ascending=False)
top_results = ranked_results.head(10)
print("\nTop 10 Hyperparameter Combinations Based on Average Validation R²:")
print(top_results)

# Write the complete, unsorted table to Excel for later inspection
results_selected.to_excel('RF_Model_AllResults.xlsx', index=False)
print("\nAll hyperparameter search results have been saved to 'RF_Model_AllResults.xlsx'.")

# ============================
# 8. Extract Best Hyperparameter Combination
# ============================

# Get the best hyperparameters
best_params = grid_search.best_params_
print("\nBest Hyperparameter Combination:")
print(best_params)

# Get the best validation R² score
best_score = grid_search.best_score_
print(f"Best Validation R² Score: {best_score:.4f}")

# ============================
# 9. Train Final Model with Best Hyperparameters
# ============================

# GridSearchCV uses refit=True by default, so best_estimator_ is already the
# pipeline re-trained on the entire dataset with the best hyperparameters.
# Fitting it again would only rebuild the same forest a second time.
final_pipeline = grid_search.best_estimator_

# Make predictions on the entire dataset
y_pred = final_pipeline.predict(X)

# Calculate performance metrics.
# These are in-sample figures (same rows the model was trained on);
# best_score above is the cross-validated estimate.
final_mae = mean_absolute_error(y, y_pred)
final_mse = mean_squared_error(y, y_pred)
final_rmse = np.sqrt(final_mse)
final_r2 = r2_score(y, y_pred)

print("\nFinal Model Performance on the Entire Dataset:")
print(f"MAE: {final_mae:.4f}")
print(f"MSE: {final_mse:.4f}")
print(f"RMSE: {final_rmse:.4f}")
print(f"R²: {final_r2:.4f}")

# ============================
# 10. Save the Best Model and Scaler
# ============================

# Persist the fitted pipeline (scaler + regressor in one object) with joblib
model_filename = 'RF_FinalModel_Pipeline.pkl'
joblib.dump(final_pipeline, model_filename)
print(f"\nFinal model pipeline (including scaler) has been saved as '{model_filename}'.")

# ============================
# 11. Save Best Hyperparameters and Final Results
# ============================

summary_filename = 'RF_Model_BestResults.xlsx'

# One-row summary: the chosen hyperparameters plus the final metrics
results_to_save = {
    column: [value]
    for column, value in (
        ('Best Params', best_params),
        ('Final MAE', final_mae),
        ('Final MSE', final_mse),
        ('Final RMSE', final_rmse),
        ('Final R²', final_r2),
    )
}

# Turn the summary into a DataFrame and write it to Excel
results_summary = pd.DataFrame(results_to_save)
results_summary.to_excel(summary_filename, index=False)
print(f"\nBest model results have been saved to '{summary_filename}'.")

# ============================
# 12. Conclusion
# ============================

print("\nScript execution completed successfully.")


In [None]:
# ============================
# 1. Import Necessary Libraries
# ============================

import pandas as pd
import numpy as np
import random
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import joblib  # For saving models
from tqdm import tqdm  # For progress bars

# Import LightGBM's LGBMRegressor, installing the package on first use
try:
    from lightgbm import LGBMRegressor
except ImportError:
    print("LightGBM is not installed. Installing now...")
    import subprocess
    import sys
    # Run pip through the interpreter executing this code. A bare "pip" on
    # PATH may belong to a different Python installation, in which case the
    # install would succeed but the import below would still fail.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "lightgbm"])
    from lightgbm import LGBMRegressor

# ============================
# 2. Set Random Seed for Reproducibility
# ============================

def set_seed(seed):
    """
    Seed NumPy's global RNG and Python's ``random`` module with the same value
    so that runs are repeatable.
    """
    # NumPy first, then the standard library, as before.
    for seed_rng in (np.random.seed, random.seed):
        seed_rng(seed)


set_seed(42)  # You can choose any seed value you prefer

# ============================
# 3. Load and Inspect Data
# ============================

# Define the path to your CSV data file
data_path = 'path_to_your_data.csv'  # <-- Replace with your actual file path

# Load the dataset into a Pandas DataFrame
try:
    data_df = pd.read_csv(data_path)
    print("Dataset loaded successfully.\n")
except FileNotFoundError:
    print(f"File not found at the specified path: {data_path}")
    # Stop with a non-zero status. exit() is a convenience injected by the
    # `site` module for interactive use and reports success (status 0);
    # raising SystemExit works everywhere and signals the failure.
    raise SystemExit(1)

# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("Dataset Information:")
data_df.info()

# Display the first five rows of the dataset
print("\nFirst 5 Rows of the Dataset:")
print(data_df.head())

# ============================
# 4. Data Preprocessing
# ============================

# Assume the last column is the target variable (adjust if necessary)
feature_cols = data_df.columns[:-1]
target_col = data_df.columns[-1]

# Extract features and target variable as NumPy arrays
X = data_df[feature_cols].values
y = data_df[target_col].values

# Check for missing values in features and target
if np.isnan(X).any() or np.isnan(y).any():
    print("\nMissing values detected. Performing imputation...")
    # Replace each NaN with the mean of its own column.
    # The mean must be taken along axis 0; np.nanmean(X) without an axis
    # collapses the whole matrix into one global mean, which would fill every
    # feature with the same (wrongly scaled) value.
    column_means = np.nanmean(X, axis=0)
    X = np.where(np.isnan(X), column_means, X)
    # y is one-dimensional, so its overall mean is its column mean.
    y = np.nan_to_num(y, nan=np.nanmean(y))
    print("Missing values imputed with column means.")
else:
    print("\nNo missing values detected.")

# ============================
# 5. Define Pipeline and Hyperparameter Grid
# ============================

# Create a pipeline that first scales the data and then applies LGBMRegressor
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('lgbmregressor', LGBMRegressor(
        random_state=42,
        # Row subsampling is only performed when subsample_freq > 0; with the
        # default of 0 the `subsample` values searched below would be ignored.
        subsample_freq=1
    ))
])

# Define the hyperparameter grid for GridSearchCV.
# NOTE(review): this grid has 4*4*3*3*3*3*4*4 = 20,736 combinations, i.e.
# 103,680 fits with five folds — consider trimming it or switching to a
# randomized search if runtime is a concern.
param_grid = {
    'lgbmregressor__num_leaves': [31, 64, 128, 256],          # Number of leaves in full trees
    'lgbmregressor__max_depth': [10, 20, 30, 40],            # Maximum depth of the tree
    'lgbmregressor__n_estimators': [100, 200, 300],          # Number of boosting iterations
    'lgbmregressor__learning_rate': [0.01, 0.1, 0.5],        # Boosting learning rate
    'lgbmregressor__subsample': [0.6, 0.8, 1.0],             # Fraction of samples to be used for fitting the individual base learners
    # Fraction of features sampled for each tree. LightGBM expects a float in
    # (0, 1]; the 'auto' / 'sqrt' / 'log2' strings belong to scikit-learn's
    # max_features and are not accepted here.
    'lgbmregressor__colsample_bytree': [0.6, 0.8, 1.0],
    'lgbmregressor__reg_alpha': [0.0, 0.1, 0.5, 1.0],        # L1 regularization term on weights
    'lgbmregressor__reg_lambda': [0.0, 0.1, 0.5, 1.0]        # L2 regularization term on weights
}

# ============================
# 6. Five-Fold Cross-Validation and Grid Search
# ============================

# Initialize K-Fold cross-validation with 5 splits
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize GridSearchCV with the pipeline and hyperparameter grid
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='r2',              # Using R² as the evaluation metric
    cv=kf,
    n_jobs=-1,                 # Utilize all available CPU cores
    verbose=2,                 # Verbosity level: 0, 1, or 2
    # Training scores are off by default; the results table built afterwards
    # reads mean_train_score / std_train_score, so they must be requested.
    return_train_score=True
)

print("Starting five-fold cross-validation and grid search...\n")

# Fit GridSearchCV to the data
grid_search.fit(X, y)

print("\nGrid search completed.")

# ============================
# 7. Results Analysis and Saving
# ============================

# Load the raw cross-validation table produced by the grid search
results_df = pd.DataFrame(grid_search.cv_results_)

# cv_results_ column name -> short, readable label (order defines the output)
column_labels = {
    'param_lgbmregressor__num_leaves': 'num_leaves',
    'param_lgbmregressor__max_depth': 'max_depth',
    'param_lgbmregressor__n_estimators': 'n_estimators',
    'param_lgbmregressor__learning_rate': 'learning_rate',
    'param_lgbmregressor__subsample': 'subsample',
    'param_lgbmregressor__colsample_bytree': 'colsample_bytree',
    'param_lgbmregressor__reg_alpha': 'reg_alpha',
    'param_lgbmregressor__reg_lambda': 'reg_lambda',
    'mean_test_score': 'mean_val_r2',
    'std_test_score': 'std_val_r2',
    'mean_train_score': 'mean_train_r2',
    'std_train_score': 'std_train_r2',
}

# Keep only the columns of interest, then apply the readable labels
selected_columns = list(column_labels)
results_selected = results_df[selected_columns].rename(columns=column_labels)

# Rank by mean validation R² (best first) and keep the ten strongest rows
ranked_results = results_selected.sort_values(by='mean_val_r2', ascending=False)
top_results = ranked_results.head(10)
print("\nTop 10 Hyperparameter Combinations Based on Average Validation R²:")
print(top_results)

# Write the complete, unsorted table to Excel for later inspection
results_selected.to_excel('LGBM_Model_AllResults.xlsx', index=False)
print("\nAll hyperparameter search results have been saved to 'LGBM_Model_AllResults.xlsx'.")

# ============================
# 8. Extract Best Hyperparameter Combination
# ============================

# Get the best hyperparameters
best_params = grid_search.best_params_
print("\nBest Hyperparameter Combination:")
print(best_params)

# Get the best validation R² score
best_score = grid_search.best_score_
print(f"Best Validation R² Score: {best_score:.4f}")

# ============================
# 9. Train Final Model with Best Hyperparameters
# ============================

# GridSearchCV uses refit=True by default, so best_estimator_ is already the
# pipeline re-trained on the entire dataset with the best hyperparameters.
# Fitting it again would only repeat that training run.
final_pipeline = grid_search.best_estimator_

# Make predictions on the entire dataset
y_pred = final_pipeline.predict(X)

# Calculate performance metrics.
# These are in-sample figures (same rows the model was trained on);
# best_score above is the cross-validated estimate.
final_mae = mean_absolute_error(y, y_pred)
final_mse = mean_squared_error(y, y_pred)
final_rmse = np.sqrt(final_mse)
final_r2 = r2_score(y, y_pred)

print("\nFinal Model Performance on the Entire Dataset:")
print(f"MAE: {final_mae:.4f}")
print(f"MSE: {final_mse:.4f}")
print(f"RMSE: {final_rmse:.4f}")
print(f"R²: {final_r2:.4f}")

# ============================
# 10. Save the Best Model and Scaler
# ============================

# Persist the fitted pipeline (scaler + regressor in one object) with joblib
model_filename = 'LGBM_FinalModel_Pipeline.pkl'
joblib.dump(final_pipeline, model_filename)
print(f"\nFinal model pipeline (including scaler) has been saved as '{model_filename}'.")

# ============================
# 11. Save Best Hyperparameters and Final Results
# ============================

summary_filename = 'LGBM_Model_BestResults.xlsx'

# One-row summary: the chosen hyperparameters plus the final metrics
results_to_save = {
    column: [value]
    for column, value in (
        ('Best Params', best_params),
        ('Final MAE', final_mae),
        ('Final MSE', final_mse),
        ('Final RMSE', final_rmse),
        ('Final R²', final_r2),
    )
}

# Turn the summary into a DataFrame and write it to Excel
results_summary = pd.DataFrame(results_to_save)
results_summary.to_excel(summary_filename, index=False)
print(f"\nBest model results have been saved to '{summary_filename}'.")

# ============================
# 12. Conclusion
# ============================

print("\nScript execution completed successfully.")


In [None]:
# ============================
# 1. Import Necessary Libraries
# ============================

import pandas as pd
import numpy as np
import random
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import joblib  # For saving models
from tqdm import tqdm  # For progress bars

# ============================
# 2. Set Random Seed for Reproducibility
# ============================

def set_seed(seed):
    """
    Seed Python's built-in `random` module and NumPy's global RNG with `seed`
    so that runs using these generators are repeatable.
    """
    for seeder in (np.random.seed, random.seed):
        seeder(seed)

set_seed(42)  # You can choose any seed value you prefer

# ============================
# 3. Load and Inspect Data
# ============================

# Define the path to your CSV data file
data_path = 'path_to_your_data.csv'  # <-- Replace with your actual file path

# Load the dataset into a Pandas DataFrame. A missing file stops the script
# with a non-zero exit status and the message on stderr; `exit()` is an
# interactive helper injected by `site` and would report success (status 0).
try:
    data_df = pd.read_csv(data_path)
except FileNotFoundError:
    raise SystemExit(f"File not found at the specified path: {data_path}") from None
print("Dataset loaded successfully.\n")

# Display basic information about the dataset.
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would emit a stray "None" line).
print("Dataset Information:")
data_df.info()

# Display the first five rows of the dataset
print("\nFirst 5 Rows of the Dataset:")
print(data_df.head())

# ============================
# 4. Data Preprocessing
# ============================

# Assume the last column is the target variable (adjust if necessary)
feature_cols = data_df.columns[:-1]
target_col = data_df.columns[-1]

# Extract features and target variable as NumPy arrays
X = data_df[feature_cols].values
y = data_df[target_col].values

# Check for missing values in features and target
x_missing = np.isnan(X)
y_missing = np.isnan(y)
if x_missing.any() or y_missing.any():
    print("\nMissing values detected. Performing imputation...")
    if x_missing.any():
        # Per-column means (axis=0). np.nanmean(X) without an axis is the mean
        # of the whole matrix and would mix the scales of unrelated features.
        column_means = np.nanmean(X, axis=0)
        # A column with no observed value has no mean of its own; fall back to
        # the overall mean so that no NaN survives the imputation.
        column_means = np.where(np.isnan(column_means), np.nanmean(X), column_means)
        X = np.where(x_missing, column_means, X)
    if y_missing.any():
        y = np.where(y_missing, np.nanmean(y), y)
    print("Missing values imputed with column means.")
else:
    print("\nNo missing values detected.")

# ============================
# 5. Define Pipeline and Hyperparameter Grid
# ============================

# Create a pipeline that first scales the data and then applies MLPRegressor.
# The step names ('scaler', 'mlpregressor') are the prefixes used by the
# 'mlpregressor__*' keys of the hyperparameter grid below. That grid also lists
# 'mlpregressor__max_iter', so max_iter=1000 here is only a starting value.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('mlpregressor', MLPRegressor(random_state=42, max_iter=1000))
])

# Define the hyperparameter grid for GridSearchCV.
# Each solver gets its own sub-grid that lists only the hyperparameters the
# solver actually reads. Varying a parameter a solver ignores (e.g. batch_size
# or learning_rate_init with 'lbfgs') refits the very same model many times
# without exploring anything new.

# Hyperparameters every solver uses
_mlp_common_grid = {
    'mlpregressor__hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100), (50, 100, 50)],
    'mlpregressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
    'mlpregressor__alpha': [0.0001, 0.001, 0.01, 0.1],
    'mlpregressor__max_iter': [200, 300, 500],
}

# Hyperparameters shared by the stochastic solvers ('sgd' and 'adam')
_mlp_stochastic_grid = {
    'mlpregressor__learning_rate_init': [0.001, 0.01, 0.1],
    'mlpregressor__batch_size': [32, 64, 128, 256],
    'mlpregressor__early_stopping': [True, False],
    'mlpregressor__validation_fraction': [0.1, 0.2, 0.3],
}

param_grid = [
    {
        **_mlp_common_grid,
        'mlpregressor__solver': ['lbfgs'],
        # 'lbfgs' is a full-batch solver: learning-rate, batch-size, momentum,
        # Adam and early-stopping settings do not apply to it
    },
    {
        **_mlp_common_grid,
        **_mlp_stochastic_grid,
        'mlpregressor__solver': ['sgd'],
        'mlpregressor__learning_rate': ['constant', 'invscaling', 'adaptive'],
        'mlpregressor__momentum': [0.0, 0.5, 0.9],
        # 'beta_1', 'beta_2', 'epsilon' are not applicable to 'sgd'
    },
    {
        **_mlp_common_grid,
        **_mlp_stochastic_grid,
        'mlpregressor__solver': ['adam'],
        'mlpregressor__beta_1': [0.9, 0.95, 0.99],
        'mlpregressor__beta_2': [0.999, 0.995, 0.99],
        'mlpregressor__epsilon': [1e-8, 1e-7, 1e-6],
        # 'momentum' and the 'learning_rate' schedule are not applicable to 'adam'
    },
]

# ============================
# 6. Five-Fold Cross-Validation and Grid Search
# ============================

# Initialize K-Fold cross-validation with 5 splits
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize GridSearchCV with the pipeline and hyperparameter grid.
# return_train_score defaults to False; it is enabled here because the results
# analysis below reads 'mean_train_score' and 'std_train_score', which are only
# present in cv_results_ when training scores are requested.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='r2',          # Using R² as the evaluation metric
    cv=kf,
    n_jobs=-1,             # Utilize all available CPU cores
    verbose=2,             # Verbosity level: 0, 1, or 2
    return_train_score=True
)

print("Starting five-fold cross-validation and grid search...\n")

# Fit GridSearchCV to the data
grid_search.fit(X, y)

print("\nGrid search completed.")

# ============================
# 7. Results Analysis and Saving
# ============================

# Extract all grid search results into a DataFrame
results_df = pd.DataFrame(grid_search.cv_results_)

# cv_results_ column -> readable name. The key order is the column order of
# the reported table, so the selection list is derived from this mapping
# instead of being maintained as a second, parallel list.
column_labels = {
    'param_mlpregressor__hidden_layer_sizes': 'hidden_layer_sizes',
    'param_mlpregressor__activation': 'activation',
    'param_mlpregressor__solver': 'solver',
    'param_mlpregressor__alpha': 'alpha',
    'param_mlpregressor__learning_rate': 'learning_rate',
    'param_mlpregressor__learning_rate_init': 'learning_rate_init',
    'param_mlpregressor__max_iter': 'max_iter',
    'param_mlpregressor__batch_size': 'batch_size',
    'param_mlpregressor__momentum': 'momentum',
    'param_mlpregressor__beta_1': 'beta_1',
    'param_mlpregressor__beta_2': 'beta_2',
    'param_mlpregressor__epsilon': 'epsilon',
    'param_mlpregressor__early_stopping': 'early_stopping',
    'param_mlpregressor__validation_fraction': 'validation_fraction',
    'mean_test_score': 'mean_val_r2',
    'std_test_score': 'std_val_r2',
    'mean_train_score': 'mean_train_r2',
    'std_train_score': 'std_train_r2'
}
selected_columns = list(column_labels)

# The train-score columns exist only when the search was created with
# return_train_score=True. Report what is missing and keep the table shape
# (missing columns become NaN) instead of failing with a KeyError.
missing_columns = [col for col in selected_columns if col not in results_df.columns]
if missing_columns:
    print(f"\nWarning: columns missing from the grid search results (filled with NaN): {missing_columns}")
results_selected = results_df.reindex(columns=selected_columns).rename(columns=column_labels)

# Sort the results by mean validation R² in descending order and select top 10
top_results = results_selected.sort_values(by='mean_val_r2', ascending=False).head(10)
print("\nTop 10 Hyperparameter Combinations Based on Average Validation R²:")
print(top_results)

# Save all grid search results to an Excel file for further analysis.
# hidden_layer_sizes holds tuples, which cannot be written to an Excel cell,
# so that column is stored as text in the exported copy.
excel_ready = results_selected.assign(
    hidden_layer_sizes=results_selected['hidden_layer_sizes'].astype(str)
)
excel_ready.to_excel('ANN_Model_AllResults.xlsx', index=False)
print("\nAll hyperparameter search results have been saved to 'ANN_Model_AllResults.xlsx'.")

# ============================
# 8. Extract Best Hyperparameter Combination
# ============================

# Get the best hyperparameters
best_params = grid_search.best_params_
print(f"\nBest Hyperparameter Combination:")
print(best_params)

# Get the best validation R² score
best_score = grid_search.best_score_
print(f"Best Validation R² Score: {best_score:.4f}")

# ============================
# 9. Train Final Model with Best Hyperparameters
# ============================

# Pipeline configured with the best hyperparameters
final_pipeline = grid_search.best_estimator_

# Train the final model on the entire dataset
final_pipeline.fit(X, y)

# Predict on the same rows the model was fitted on: the metrics below are
# in-sample (training) scores, not an estimate of generalization error.
y_pred = final_pipeline.predict(X)

# Calculate performance metrics
final_mae = mean_absolute_error(y, y_pred)
final_mse = mean_squared_error(y, y_pred)
final_rmse = np.sqrt(final_mse)
final_r2 = r2_score(y, y_pred)

print("\nFinal Model Performance on the Entire Dataset:")
print(f"MAE: {final_mae:.4f}")
print(f"MSE: {final_mse:.4f}")
print(f"RMSE: {final_rmse:.4f}")
print(f"R²: {final_r2:.4f}")

# ============================
# 10. Save the Best Model and Scaler
# ============================

# Save the pipeline (which includes both scaler and model) using joblib
model_filename = 'ANN_FinalModel_Pipeline.pkl'
joblib.dump(final_pipeline, model_filename)
print(f"\nFinal model pipeline (including scaler) has been saved as '{model_filename}'.")

# ============================
# 11. Save Best Hyperparameters and Final Results
# ============================

# One-row summary of the best hyperparameters and the final metrics.
# best_params is a dict; an Excel cell cannot hold a dict and the writer raises
# when given one, so the hyperparameters are stored in their text form.
results_to_save = {
    'Best Params': [str(best_params)],
    'Final MAE': [final_mae],
    'Final MSE': [final_mse],
    'Final RMSE': [final_rmse],
    'Final R²': [final_r2]
}

# Convert the dictionary to a DataFrame
results_summary = pd.DataFrame(results_to_save)

# Save the summary to an Excel file
summary_filename = 'ANN_Model_BestResults.xlsx'
results_summary.to_excel(summary_filename, index=False)
print(f"\nBest model results have been saved to '{summary_filename}'.")

# ============================
# 12. Conclusion
# ============================

print("\nScript execution completed successfully.")


In [None]:
# ============================
# 1. Import Necessary Libraries
# ============================

import pandas as pd
import numpy as np
import random
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.base import BaseEstimator, TransformerMixin
import joblib  # For saving models
from sklearn.neural_network import BernoulliRBM  # RBM implementation
from sklearn.exceptions import NotFittedError

# ============================
# 2. Set Random Seed for Reproducibility
# ============================

def set_seed(seed):
    """
    Make runs repeatable by seeding both Python's `random` module and
    NumPy's global random number generator with the same value.
    """
    random.seed(seed)
    np.random.seed(seed)

set_seed(42)  # You can choose any seed value you prefer

# ============================
# 3. Load and Inspect Data
# ============================

# Define the path to your CSV data file
data_path = 'path_to_your_data.csv'  # <-- Replace with your actual file path

# Load the dataset into a Pandas DataFrame. A missing file stops the script
# with a non-zero exit status and the message on stderr; `exit()` is an
# interactive helper injected by `site` and would report success (status 0).
try:
    data_df = pd.read_csv(data_path)
except FileNotFoundError:
    raise SystemExit(f"File not found at the specified path: {data_path}") from None
print("Dataset loaded successfully.\n")

# Display basic information about the dataset.
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would emit a stray "None" line).
print("Dataset Information:")
data_df.info()

# Display the first five rows of the dataset
print("\nFirst 5 Rows of the Dataset:")
print(data_df.head())

# ============================
# 4. Data Preprocessing
# ============================

# Assume the last column is the target variable (adjust if necessary)
feature_cols = data_df.columns[:-1]
target_col = data_df.columns[-1]

# Extract features and target variable as NumPy arrays
X = data_df[feature_cols].values
y = data_df[target_col].values

# Check for missing values in features and target
x_missing = np.isnan(X)
y_missing = np.isnan(y)
if x_missing.any() or y_missing.any():
    print("\nMissing values detected. Performing imputation...")
    if x_missing.any():
        # Per-column means (axis=0). np.nanmean(X) without an axis is the mean
        # of the whole matrix and would mix the scales of unrelated features.
        column_means = np.nanmean(X, axis=0)
        # A column with no observed value has no mean of its own; fall back to
        # the overall mean so that no NaN survives the imputation.
        column_means = np.where(np.isnan(column_means), np.nanmean(X), column_means)
        X = np.where(x_missing, column_means, X)
    if y_missing.any():
        y = np.where(y_missing, np.nanmean(y), y)
    print("Missing values imputed with column means.")
else:
    print("\nNo missing values detected.")

# ============================
# 5. Define Pipeline and Hyperparameter Grid
# ============================

# Create a pipeline that scales the data, applies the RBM, and then uses MLPRegressor.
# BernoulliRBM treats every input as a binary value or a probability in [0, 1].
# Standardized features are unbounded and partly negative, which violates that
# assumption, so the features are min-max scaled into [0, 1] instead.
from sklearn.preprocessing import MinMaxScaler

pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('rbm', BernoulliRBM(random_state=42)),
    ('mlpregressor', MLPRegressor(random_state=42, max_iter=1000))
])

# Define the hyperparameter grid for GridSearchCV.
# Note: 'momentum', 'regularization', and 'init_weight' are NOT applicable to
# BernoulliRBM in scikit-learn, so they are not part of the RBM grid.

# RBM hyperparameters. 'rbm__verbose' only controls logging and does not change
# the fitted model, so a single value is searched; listing both 0 and 1 would
# double the number of fits for identical results.
_rbm_grid = {
    'rbm__n_components': [64, 128, 256, 512],
    'rbm__learning_rate': [0.001, 0.01, 0.1],
    'rbm__batch_size': [10, 50, 100, 200],
    'rbm__n_iter': [10, 50, 100, 200],
    'rbm__verbose': [0],
}

# MLPRegressor hyperparameters every solver uses
_mlp_common_grid = {
    'mlpregressor__hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100), (50, 100, 50)],
    'mlpregressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
    'mlpregressor__alpha': [0.0001, 0.001, 0.01, 0.1],
    'mlpregressor__max_iter': [200, 300, 500],
}

# MLPRegressor hyperparameters shared by the stochastic solvers ('sgd', 'adam')
_mlp_stochastic_grid = {
    'mlpregressor__learning_rate_init': [0.001, 0.01, 0.1],
    'mlpregressor__batch_size': [32, 64, 128, 256],
    'mlpregressor__early_stopping': [True, False],
    'mlpregressor__validation_fraction': [0.1, 0.2, 0.3],
}

# One sub-grid per solver: solver-specific settings (momentum for 'sgd',
# beta_1/beta_2/epsilon for 'adam') are only crossed with the solver that reads
# them instead of multiplying the grid for every solver.
param_grid = [
    {
        **_rbm_grid,
        **_mlp_common_grid,
        'mlpregressor__solver': ['lbfgs'],
    },
    {
        **_rbm_grid,
        **_mlp_common_grid,
        **_mlp_stochastic_grid,
        'mlpregressor__solver': ['sgd'],
        'mlpregressor__learning_rate': ['constant', 'invscaling', 'adaptive'],
        'mlpregressor__momentum': [0.0, 0.5, 0.9],
    },
    {
        **_rbm_grid,
        **_mlp_common_grid,
        **_mlp_stochastic_grid,
        'mlpregressor__solver': ['adam'],
        'mlpregressor__beta_1': [0.9, 0.95, 0.99],
        'mlpregressor__beta_2': [0.999, 0.995, 0.99],
        'mlpregressor__epsilon': [1e-8, 1e-7, 1e-6],
    },
]

# ============================
# 6. Five-Fold Cross-Validation and Grid Search
# ============================

# Initialize K-Fold cross-validation with 5 splits
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize GridSearchCV with the pipeline and hyperparameter grid.
# return_train_score defaults to False; it is enabled here because the results
# analysis below reads 'mean_train_score' and 'std_train_score', which are only
# present in cv_results_ when training scores are requested.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='r2',          # Using R² as the evaluation metric
    cv=kf,
    n_jobs=-1,             # Utilize all available CPU cores
    verbose=2,             # Verbosity level: 0, 1, or 2
    return_train_score=True
)

print("Starting five-fold cross-validation and grid search...\n")

# Fit GridSearchCV to the data
grid_search.fit(X, y)

print("\nGrid search completed.")

# ============================
# 7. Results Analysis and Saving
# ============================

# Extract all grid search results into a DataFrame
results_df = pd.DataFrame(grid_search.cv_results_)

# cv_results_ column -> readable name. The key order is the column order of
# the reported table, so the selection list is derived from this mapping
# instead of being maintained as a second, parallel list.
column_labels = {
    'param_rbm__n_components': 'n_components',
    'param_rbm__learning_rate': 'rbm_learning_rate',
    'param_rbm__batch_size': 'rbm_batch_size',
    'param_rbm__n_iter': 'rbm_n_iter',
    'param_rbm__verbose': 'rbm_verbose',
    'param_mlpregressor__hidden_layer_sizes': 'hidden_layer_sizes',
    'param_mlpregressor__activation': 'activation',
    'param_mlpregressor__solver': 'solver',
    'param_mlpregressor__alpha': 'alpha',
    'param_mlpregressor__learning_rate': 'mlp_learning_rate',
    'param_mlpregressor__learning_rate_init': 'learning_rate_init',
    'param_mlpregressor__max_iter': 'mlp_max_iter',
    'param_mlpregressor__batch_size': 'mlp_batch_size',
    'param_mlpregressor__momentum': 'momentum',
    'param_mlpregressor__early_stopping': 'early_stopping',
    'param_mlpregressor__validation_fraction': 'validation_fraction',
    'param_mlpregressor__beta_1': 'beta_1',
    'param_mlpregressor__beta_2': 'beta_2',
    'param_mlpregressor__epsilon': 'epsilon',
    'mean_test_score': 'mean_val_r2',
    'std_test_score': 'std_val_r2',
    'mean_train_score': 'mean_train_r2',
    'std_train_score': 'std_train_r2'
}
selected_columns = list(column_labels)

# The train-score columns exist only when the search was created with
# return_train_score=True. Report what is missing and keep the table shape
# (missing columns become NaN) instead of failing with a KeyError.
missing_columns = [col for col in selected_columns if col not in results_df.columns]
if missing_columns:
    print(f"\nWarning: columns missing from the grid search results (filled with NaN): {missing_columns}")
results_selected = results_df.reindex(columns=selected_columns).rename(columns=column_labels)

# Sort the results by mean validation R² in descending order and select top 10
top_results = results_selected.sort_values(by='mean_val_r2', ascending=False).head(10)
print("\nTop 10 Hyperparameter Combinations Based on Average Validation R²:")
print(top_results)

# Save all grid search results to an Excel file for further analysis.
# hidden_layer_sizes holds tuples, which cannot be written to an Excel cell,
# so that column is stored as text in the exported copy.
# NOTE(review): this is the same filename the plain-MLP cell writes; running
# both cells in one directory overwrites the earlier file. Confirm intent.
excel_ready = results_selected.assign(
    hidden_layer_sizes=results_selected['hidden_layer_sizes'].astype(str)
)
excel_ready.to_excel('ANN_Model_AllResults.xlsx', index=False)
print("\nAll hyperparameter search results have been saved to 'ANN_Model_AllResults.xlsx'.")

# ============================
# 8. Extract Best Hyperparameter Combination
# ============================

# Get the best hyperparameters
best_params = grid_search.best_params_
print(f"\nBest Hyperparameter Combination:")
print(best_params)

# Get the best validation R² score
best_score = grid_search.best_score_
print(f"Best Validation R² Score: {best_score:.4f}")

# ============================
# 9. Train Final Model with Best Hyperparameters
# ============================

# Pipeline configured with the best hyperparameters
final_pipeline = grid_search.best_estimator_

# Train the final model on the entire dataset
final_pipeline.fit(X, y)

# Predict on the same rows the model was fitted on: the metrics below are
# in-sample (training) scores, not an estimate of generalization error.
y_pred = final_pipeline.predict(X)

# Calculate performance metrics
final_mae = mean_absolute_error(y, y_pred)
final_mse = mean_squared_error(y, y_pred)
final_rmse = np.sqrt(final_mse)
final_r2 = r2_score(y, y_pred)

print("\nFinal Model Performance on the Entire Dataset:")
print(f"MAE: {final_mae:.4f}")
print(f"MSE: {final_mse:.4f}")
print(f"RMSE: {final_rmse:.4f}")
print(f"R²: {final_r2:.4f}")

# ============================
# 10. Save the Best Model and Scaler
# ============================

# Save the pipeline (scaler, RBM, and MLPRegressor together) using joblib.
# NOTE(review): the model and summary filenames below are the same ones the
# plain-MLP cell writes; running both in one directory overwrites those files.
model_filename = 'ANN_FinalModel_Pipeline.pkl'
joblib.dump(final_pipeline, model_filename)
print(f"\nFinal model pipeline (including scaler and RBM) has been saved as '{model_filename}'.")

# ============================
# 11. Save Best Hyperparameters and Final Results
# ============================

# One-row summary of the best hyperparameters and the final metrics.
# best_params is a dict; an Excel cell cannot hold a dict and the writer raises
# when given one, so the hyperparameters are stored in their text form.
results_to_save = {
    'Best Params': [str(best_params)],
    'Final MAE': [final_mae],
    'Final MSE': [final_mse],
    'Final RMSE': [final_rmse],
    'Final R²': [final_r2]
}

# Convert the dictionary to a DataFrame
results_summary = pd.DataFrame(results_to_save)

# Save the summary to an Excel file
summary_filename = 'ANN_Model_BestResults.xlsx'
results_summary.to_excel(summary_filename, index=False)
print(f"\nBest model results have been saved to '{summary_filename}'.")

# ============================
# 12. Conclusion
# ============================

print("\nScript execution completed successfully.")


In [None]:
import random
from itertools import product  # enumerates (source, target) node pairs for edge_index

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold, ParameterSampler
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GATConv, global_add_pool, global_max_pool, global_mean_pool
# torch_geometric.nn names its sum pooling `global_add_pool`; there is no
# `global_sum_pool` to import, so the old name is kept as an alias.
from torch_geometric.nn import global_add_pool as global_sum_pool
from tqdm import tqdm

# ============================
# 1. Setup and Configuration
# ============================

# Set random seeds for reproducibility
def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch (plus CUDA when available)."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)

set_seed(42)

# Define the hyperparameter search space.
# Each key maps to the list of candidate values ParameterSampler draws from.
# NOTE(review): 'batch_size' is sampled here but is not read by the
# cross-validation loop visible in this section — confirm whether it is needed.
search_space = {
    'hidden_layer_sizes': [(64,), (128,), (64, 64), (128, 128), (64, 128, 64)],
    'learning_rate': [0.001, 0.01, 0.1],
    'num_epochs': [50, 100, 200],
    'batch_size': [16, 32, 64, 128],
    'dropout_rate': [0.0, 0.2, 0.5],
    'weight_decay': [0.0, 0.0001, 0.001, 0.01],
    'activation': ['relu', 'tanh', 'sigmoid'],
    'num_layers': [2, 3, 4],
    'aggregation_type': ['mean', 'sum', 'max'],
    'optimizer': ['adam', 'sgd', 'rmsprop'],
    'learning_rate_scheduler': ['constant', 'step', 'cosine']
}

# Define the number of random samples from the search space
n_iter = 100  # Adjust based on computational resources

# Generate random hyperparameter combinations (fixed random_state, so the same
# n_iter combinations are drawn on every run)
param_list = list(ParameterSampler(search_space, n_iter=n_iter, random_state=42))

# Number of folds for cross-validation
num_folds = 5

# Device configuration: use the GPU when CUDA is available, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# ============================
# 2. Data Loading and Processing
# ============================

# Read the single dataset file; rows containing any missing value are dropped
data_path = 'path_to_your_data.csv'  # Replace with your actual file path
data_df = pd.read_csv(data_path).dropna()

# Convert DataFrame to a float32 tensor (every column must be numeric)
data_tensor = torch.tensor(data_df.values, dtype=torch.float32)

# Define feature names and mapping (ensure these match your CSV columns)
# node_mapping assigns each feature name its position in the list.
features = ['ORP', 'V', 'DO', 'pH', 'SF', 'Spro', 'Sac', 'Sh', 'SSO4', 'SH2S', 'XS', 'SCH4']
node_mapping = {feat: idx for idx, feat in enumerate(features)}

# ============================
# 3. Generate Fully Connected Directed Edge Index
# ============================

# Generate all possible edges excluding self-connections for a fully connected directed graph
# NOTE(review): the node count is the number of feature names (12), so the
# edges only reference node ids 0..11. The code below uses the rows of
# data_tensor as graph nodes; if so, these edges connect only the first 12
# rows and fail for fewer rows — confirm what a node is meant to be.
num_nodes = len(features)
edges = [(i, j) for i, j in product(range(num_nodes), repeat=2) if i != j]

# Convert to tensor format and transpose to [2, num_edges]
edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()

# ============================
# 4. Model Definition
# ============================

class FlexibleGNN(torch.nn.Module):
    """Stack of GATConv layers, global pooling, and a linear regression head.

    Args:
        input_dim: Size of each node's input feature vector.
        hidden_layers: Non-empty sequence of per-head output widths, one per
            GAT layer. When it is shorter than ``num_layers`` its last entry is
            reused for the remaining layers.
        dropout_rate: Dropout probability passed to every GATConv and applied
            to each layer's activated output.
        activation: 'relu', 'tanh' or 'sigmoid'.
        num_layers: Number of GATConv layers.
        aggregation_type: 'mean', 'sum' or 'max'; selects the global pooling
            that reduces node embeddings to one vector per graph.
        heads: Attention heads per GATConv layer (defaults to 8, the value
            that used to be hard-coded).

    Raises:
        ValueError: If ``activation`` or ``aggregation_type`` is not supported,
            or if ``hidden_layers`` is empty.
    """

    def __init__(self, input_dim, hidden_layers, dropout_rate, activation, num_layers, aggregation_type, heads=8):
        super().__init__()
        self.num_layers = num_layers
        self.aggregation_type = aggregation_type
        self.activation = activation
        self.dropout_rate = dropout_rate

        # Activation function
        activations = {
            'relu': torch.relu,
            'tanh': torch.tanh,
            'sigmoid': torch.sigmoid,
        }
        if activation not in activations:
            raise ValueError(f"Unsupported activation: {activation}")
        self.activation_fn = activations[activation]

        # Aggregation function. Sum pooling is `global_add_pool` in
        # torch_geometric.nn (there is no `global_sum_pool`).
        poolings = {
            'mean': global_mean_pool,
            'sum': global_add_pool,
            'max': global_max_pool,
        }
        if aggregation_type not in poolings:
            raise ValueError(f"Unsupported aggregation type: {aggregation_type}")
        self.agg_fn = poolings[aggregation_type]

        if len(hidden_layers) == 0:
            raise ValueError("hidden_layers must contain at least one layer size")

        # Define GAT layers
        self.gat_layers = torch.nn.ModuleList()
        prev_dim = input_dim
        for i in range(num_layers):
            out_dim = hidden_layers[i] if i < len(hidden_layers) else hidden_layers[-1]
            # Heads are concatenated in every layer except the last one, where
            # they are averaged so the final embedding width is out_dim.
            concat = i < num_layers - 1
            self.gat_layers.append(GATConv(prev_dim, out_dim, heads=heads, concat=concat, dropout=dropout_rate))
            prev_dim = out_dim * heads if concat else out_dim

        # Define a fully connected layer for regression
        self.fc = torch.nn.Linear(prev_dim, 1)

        # Dropout layer
        self.dropout = torch.nn.Dropout(p=dropout_rate)

    def forward(self, x, edge_index, batch):
        """Return one prediction per graph.

        Node features go through every GAT layer (each followed by the
        activation and dropout), are pooled per graph according to ``batch``,
        and are mapped to a single value by the linear head. The result has
        one row per graph id in ``batch``, not one row per node.
        """
        for gat in self.gat_layers:
            x = gat(x, edge_index)
            x = self.activation_fn(x)
            x = self.dropout(x)
        x = self.agg_fn(x, batch)
        x = self.fc(x)
        return x

# ============================
# 5. Dataset Preparation
# ============================

class GraphDataset(torch.utils.data.Dataset):
    """Dataset that exposes the whole table as a single graph.

    Args:
        data_tensor: 2-D tensor; one column is the target, the remaining
            columns are the node features.
        edge_index: Edge index stored as-is and attached to the graph.
        target_col: Column index of the target (negative indices allowed,
            default: last column).
    """

    def __init__(self, data_tensor, edge_index, target_col=-1):
        super().__init__()
        self.y = data_tensor[:, target_col].unsqueeze(1)
        # Features are every column except the target. Previously the last
        # column was always dropped, so a non-default target_col left the
        # target inside x and discarded a real feature.
        num_cols = data_tensor.size(1)
        target = target_col % num_cols
        self.x = torch.cat([data_tensor[:, :target], data_tensor[:, target + 1:]], dim=1)
        self.edge_index = edge_index

    def __len__(self):
        return 1  # Single graph

    def __getitem__(self, idx):
        # Only one item exists. Raising IndexError for any other index lets
        # plain iteration over the dataset terminate instead of looping forever.
        if idx not in (0, -1):
            raise IndexError(f"GraphDataset holds a single graph; index {idx} is out of range")
        data = Data(x=self.x, edge_index=self.edge_index, y=self.y)
        return data

dataset = GraphDataset(data_tensor, edge_index)

# ============================
# 6. Evaluation Function
# ============================

def evaluate(model, loader, criterion, device):
    """Run `model` over every batch of `loader` and score the predictions.

    The model is switched to eval mode and gradients are disabled. Each batch
    is moved to `device`; predictions and targets are collected on the CPU and
    stacked before scoring. `criterion` is accepted but not used.

    Returns:
        Tuple ``(mae, mse, rmse, r2)``.
    """
    model.eval()
    batch_outputs, batch_targets = [], []
    with torch.no_grad():
        for graph_batch in loader:
            graph_batch = graph_batch.to(device)
            prediction = model(graph_batch.x, graph_batch.edge_index, graph_batch.batch)
            batch_outputs.append(prediction.cpu().numpy())
            batch_targets.append(graph_batch.y.cpu().numpy())

    y_pred = np.vstack(batch_outputs)
    y_true = np.vstack(batch_targets)

    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return mae, mse, np.sqrt(mse), r2

# ============================
# 7. Cross-Validation and Hyperparameter Search
# ============================

# Initialize KFold
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Initialize variables to store the best results
best_result = None
best_val_r2 = -np.inf

# Initialize a list to store all results
all_results = []

print("Starting randomized search with five-fold cross-validation...\n")
# For every sampled hyperparameter set: train a fresh model on each fold,
# record train/validation metrics, average them over the folds, and remember
# the set with the highest average validation R².
for idx, params in enumerate(tqdm(param_list, desc="Hyperparameter combinations")):
    # Per-fold metric values for this hyperparameter set
    fold_metrics = {
        'train_mae': [],
        'train_mse': [],
        'train_rmse': [],
        'train_r2': [],
        'val_mae': [],
        'val_mse': [],
        'val_rmse': [],
        'val_r2': []
    }

    # Convert the entire dataset to a NumPy array for indexing
    # NOTE(review): X and y do not depend on `params`; they are recomputed for
    # every hyperparameter set and could be built once before the loop.
    X = data_tensor[:, :-1].numpy()
    y = data_tensor[:, -1].numpy()

    # Perform K-Fold cross-validation
    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        # Split data
        # NOTE(review): train_x/train_y/val_x/val_y are never read afterwards;
        # the fold split is applied through the boolean masks below.
        train_x = torch.tensor(X[train_idx], dtype=torch.float32)
        train_y = torch.tensor(y[train_idx], dtype=torch.float32).unsqueeze(1)
        val_x = torch.tensor(X[val_idx], dtype=torch.float32)
        val_y = torch.tensor(y[val_idx], dtype=torch.float32).unsqueeze(1)

        # Create masks (one entry per row of data_tensor; rows act as nodes here)
        num_nodes = data_tensor.size(0)
        train_mask = torch.zeros(num_nodes, dtype=torch.bool)
        val_mask = torch.zeros(num_nodes, dtype=torch.bool)
        train_mask[train_idx] = True
        val_mask[val_idx] = True

        # Create a single Data object with masks
        data = Data(x=data_tensor[:, :-1], edge_index=edge_index, y=data_tensor[:, -1].unsqueeze(1))
        data.train_mask = train_mask
        data.val_mask = val_mask
        data = data.to(device)

        # Create DataLoader for the entire graph
        # NOTE(review): `loader` is not used below; training and evaluation
        # work on `data` directly.
        loader = DataLoader([data], batch_size=1, shuffle=False)

        # Initialize the model
        model = FlexibleGNN(
            input_dim=X.shape[1],
            hidden_layers=params['hidden_layer_sizes'],
            dropout_rate=params['dropout_rate'],
            activation=params['activation'],
            num_layers=params['num_layers'],
            aggregation_type=params['aggregation_type']
        ).to(device)

        # Define optimizer
        if params['optimizer'] == 'adam':
            optimizer = torch.optim.Adam(model.parameters(), lr=params['learning_rate'], weight_decay=params['weight_decay'])
        elif params['optimizer'] == 'sgd':
            optimizer = torch.optim.SGD(model.parameters(), lr=params['learning_rate'], weight_decay=params['weight_decay'])
        elif params['optimizer'] == 'rmsprop':
            optimizer = torch.optim.RMSprop(model.parameters(), lr=params['learning_rate'], weight_decay=params['weight_decay'])
        else:
            raise ValueError(f"Unsupported optimizer: {params['optimizer']}")

        # Define learning rate scheduler (None keeps the learning rate constant)
        if params['learning_rate_scheduler'] == 'constant':
            scheduler = None
        elif params['learning_rate_scheduler'] == 'step':
            scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1)
        elif params['learning_rate_scheduler'] == 'cosine':
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=params['num_epochs'])
        else:
            raise ValueError(f"Unsupported learning rate scheduler: {params['learning_rate_scheduler']}")

        # Define loss function
        criterion = torch.nn.MSELoss()

        # Training loop: one full-graph forward/backward pass per epoch
        # NOTE(review): FlexibleGNN.forward pools node embeddings per graph, so
        # with an all-zero batch vector `out` appears to have a single row,
        # while train_mask/val_mask have one entry per data row. Indexing
        # `out[data.train_mask]` then looks like a shape mismatch — confirm
        # whether the model should predict per node or per graph.
        model.train()
        for epoch in range(params['num_epochs']):
            optimizer.zero_grad()
            out = model(data.x, data.edge_index, torch.zeros(data.x.size(0), dtype=torch.long).to(device))  # Batch can be zeros since it's a single graph
            loss = criterion(out[data.train_mask], data.y[data.train_mask])
            loss.backward()
            optimizer.step()
            if scheduler:
                scheduler.step()

        # Evaluate on training fold
        # NOTE(review): `out` is the output of the last training forward pass:
        # it was computed in train mode (dropout active) and before the final
        # optimizer step. No separate eval-mode pass is made for these metrics.
        train_preds = out[data.train_mask].detach().cpu().numpy()
        train_targets = data.y[data.train_mask].detach().cpu().numpy()
        train_mae = mean_absolute_error(train_targets, train_preds)
        train_mse = mean_squared_error(train_targets, train_preds)
        train_rmse = np.sqrt(train_mse)
        train_r2 = r2_score(train_targets, train_preds)
        fold_metrics['train_mae'].append(train_mae)
        fold_metrics['train_mse'].append(train_mse)
        fold_metrics['train_rmse'].append(train_rmse)
        fold_metrics['train_r2'].append(train_r2)

        # Evaluate on validation fold (same `out` tensor, validation rows only)
        val_preds = out[data.val_mask].detach().cpu().numpy()
        val_targets = data.y[data.val_mask].detach().cpu().numpy()
        val_mae = mean_absolute_error(val_targets, val_preds)
        val_mse = mean_squared_error(val_targets, val_preds)
        val_rmse = np.sqrt(val_mse)
        val_r2 = r2_score(val_targets, val_preds)
        fold_metrics['val_mae'].append(val_mae)
        fold_metrics['val_mse'].append(val_mse)
        fold_metrics['val_rmse'].append(val_rmse)
        fold_metrics['val_r2'].append(val_r2)

    # Aggregate metrics across folds (mean of each metric) and keep the
    # hyperparameters alongside them
    avg_metrics = {metric: np.mean(values) for metric, values in fold_metrics.items()}
    avg_metrics['params'] = params
    all_results.append(avg_metrics)

    # Update best result based on validation R2
    # (a NaN average never compares greater, so such a result is never selected)
    if avg_metrics['val_r2'] > best_val_r2:
        best_val_r2 = avg_metrics['val_r2']
        best_result = avg_metrics

# ============================
# 8. Results Analysis
# ============================

# Collect every hyperparameter trial into one table (one row per trial).
results_df = pd.DataFrame(all_results)

# Rank the trials by mean validation R² (best first) and show the ten leaders.
ranked_results = results_df.sort_values(by='val_r2', ascending=False)
top_results = ranked_results.head(10)
print("\nTop hyperparameter combinations based on average validation R²:")
print(top_results[['params', 'val_r2']])

# Persist the complete search log for inspection outside the notebook.
all_results_file = 'EcoGNNfull_Model_Model_AllResults.xlsx'
results_df.to_excel(all_results_file, index=False)
print(f"\nAll hyperparameter search results saved to '{all_results_file}'")

# Hyperparameters of the trial that achieved the highest mean validation R².
best_params = best_result['params']
print(f"\nBest Hyperparameters: {best_params}")

# ============================
# 9. Final Model Training (Optional)
# ============================

# Optionally, retrain the model on the entire dataset using the best hyperparameters
# Note: This step is optional and depends on whether you need a final model for deployment

# Initialize the final model
# Build the final network from the winning configuration.
final_model = FlexibleGNN(
    input_dim=data_tensor.size(1) - 1,  # every column except the target
    hidden_layers=best_params['hidden_layer_sizes'],
    dropout_rate=best_params['dropout_rate'],
    activation=best_params['activation'],
    num_layers=best_params['num_layers'],
    aggregation_type=best_params['aggregation_type']
)
final_model = final_model.to(device)

# Resolve the optimizer class by name; all of them share lr / weight_decay.
optimizer_name = best_params['optimizer']
optimizer_classes = {
    'adam': torch.optim.Adam,
    'sgd': torch.optim.SGD,
    'rmsprop': torch.optim.RMSprop,
}
if optimizer_name not in optimizer_classes:
    raise ValueError(f"Unsupported optimizer: {best_params['optimizer']}")
optimizer = optimizer_classes[optimizer_name](
    final_model.parameters(),
    lr=best_params['learning_rate'],
    weight_decay=best_params['weight_decay'],
)

# Resolve the learning-rate schedule; 'constant' means no scheduler at all.
scheduler_name = best_params['learning_rate_scheduler']
if scheduler_name not in ('constant', 'step', 'cosine'):
    raise ValueError(f"Unsupported learning rate scheduler: {best_params['learning_rate_scheduler']}")
scheduler = None
if scheduler_name == 'step':
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1)
elif scheduler_name == 'cosine':
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=best_params['num_epochs'])

# Mean-squared-error objective for the regression target.
criterion = torch.nn.MSELoss()

# One Data object holding all rows: features are every column but the last,
# the target is the last column reshaped to (rows, 1).
full_data = Data(x=data_tensor[:, :-1], edge_index=edge_index, y=data_tensor[:, -1].unsqueeze(1)).to(device)

# Loader over the single graph (the training loop below uses full_data directly).
final_loader = DataLoader([full_data], batch_size=1, shuffle=True)

# All rows belong to graph 0, so the batch vector is constant across epochs.
single_graph_batch = torch.zeros(full_data.x.size(0), dtype=torch.long).to(device)

# Full-batch training for the selected number of epochs.
final_model.train()
for epoch in range(best_params['num_epochs']):
    optimizer.zero_grad()
    out = final_model(full_data.x, full_data.edge_index, single_graph_batch)
    loss = criterion(out, full_data.y)
    loss.backward()
    optimizer.step()
    if scheduler is not None:
        scheduler.step()
    # Report the first epoch and then every tenth one.
    is_report_epoch = epoch == 0 or (epoch + 1) % 10 == 0
    if is_report_epoch:
        print(f"Epoch {epoch+1}/{best_params['num_epochs']}, Loss: {loss.item():.4f}")

# ============================
# 10. Final Evaluation
# ============================

# Cross-validation above already estimated how well the chosen hyperparameters
# generalise, so it is not repeated here.
#
# The metrics below are computed on the same data the final model was trained
# on; they describe goodness of fit, not generalisation performance.

# Switch to inference behaviour and score the final model on the full graph.
final_model.eval()
with torch.no_grad():
    whole_graph_batch = torch.zeros(full_data.x.size(0), dtype=torch.long).to(device)
    out = final_model(full_data.x, full_data.edge_index, whole_graph_batch)

# Move predictions and targets to NumPy for the scikit-learn metrics.
preds = out.cpu().numpy()
targets = full_data.y.cpu().numpy()

final_mse = mean_squared_error(targets, preds)
final_rmse = np.sqrt(final_mse)
final_mae = mean_absolute_error(targets, preds)
final_r2 = r2_score(targets, preds)

print("\nFinal Model Performance on Entire Dataset:")
print(f"MAE: {final_mae:.4f}, MSE: {final_mse:.4f}, RMSE: {final_rmse:.4f}, R²: {final_r2:.4f}")

# ============================
# 11. Save Best Hyperparameters and Results
# ============================

# Save the best hyperparameters and performance metrics
# One-row summary: the winning hyperparameters plus the final-model metrics.
results_to_save = {
    'Best Params': [best_params],
    'Final MAE': [final_mae],
    'Final MSE': [final_mse],
    'Final RMSE': [final_rmse],
    'Final R²': [final_r2]
}

# Convert to DataFrame for saving
results_summary = pd.DataFrame(results_to_save)

# Use one variable for both the write and the log line so the message always
# names the file that was really written (they previously disagreed).
# NOTE(review): the all-results file of this cell carries an 'Eco' prefix;
# rename here if the same prefix was intended for this file.
best_results_file = 'GNNfull_Model_BestResults.xlsx'
results_summary.to_excel(best_results_file, index=False)
print(f"\nBest model results saved to '{best_results_file}'")


In [None]:
import torch
import pandas as pd
import numpy as np
import random
from torch_geometric.nn import GATConv, global_mean_pool, global_sum_pool, global_max_pool
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold, ParameterSampler
from torch_geometric.data import Data, DataLoader
from tqdm import tqdm

# ============================
# 1. Setup and Configuration
# ============================

# Set random seeds for reproducibility
def set_seed(seed):
    """Seed the PyTorch, NumPy and Python ``random`` generators with ``seed``.

    When CUDA is available the current CUDA device's generator is seeded too.
    """
    for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)


set_seed(42)

# Define the hyperparameter search space
# Candidate values for every hyperparameter of the random search.
# 'batch_size' is sampled as well, although the training code visible in this
# cell never reads it.
search_space = dict(
    hidden_layer_sizes=[(64,), (128,), (64, 64), (128, 128), (64, 128, 64)],
    learning_rate=[0.001, 0.01, 0.1],
    num_epochs=[50, 100, 200],
    batch_size=[16, 32, 64, 128],
    dropout_rate=[0.0, 0.2, 0.5],
    weight_decay=[0.0, 0.0001, 0.001, 0.01],
    activation=['relu', 'tanh', 'sigmoid'],
    num_layers=[2, 3, 4],
    aggregation_type=['mean', 'sum', 'max'],
    optimizer=['adam', 'sgd', 'rmsprop'],
    learning_rate_scheduler=['constant', 'step', 'cosine'],
)

# Number of configurations to draw; adjust to the available compute budget.
n_iter = 100

# Materialise the sampled configurations once so every run of the loop sees
# the same list (the sampler is seeded with 42).
sampler = ParameterSampler(search_space, n_iter=n_iter, random_state=42)
param_list = list(sampler)

# Number of folds for cross-validation
num_folds = 5

# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# ============================
# 2. Data Loading and Processing
# ============================

# Read the single dataset file
# Load the single CSV file and drop every row that contains a missing value.
data_path = 'path_to_your_data.csv'  # Replace with your actual file path
data_df = pd.read_csv(data_path).dropna()

# All columns of the cleaned table as one float32 tensor.
data_tensor = torch.tensor(data_df.values, dtype=torch.float32)

# Variable names and their integer ids (ensure these match your CSV columns).
features = [
    'ORP', 'V', 'DO', 'pH',
    'SF', 'Spro', 'Sac', 'Sh', 'SSO4', 'SH2S', 'XS', 'SCH4',
]
node_mapping = dict(zip(features, range(len(features))))

# Relationships to keep (links among the water-quality variables).
relationships_keep = [
    ('SF', 'Spro'), ('SF', 'Sac'), ('Sac', 'SH2S'), ('SSO4', 'SH2S'),
    ('Sh', 'SH2S'), ('XS', 'SF'), ('SH2S', 'SCH4'), ('Sac', 'SCH4'),
    ('Sh', 'SCH4'), ('SF', 'Sh')
]

# The two variable groups used to build environmental -> water-quality links.
environmental_features = ['ORP', 'V', 'DO', 'pH']
water_quality_features = ['SF', 'Spro', 'Sac', 'Sh', 'SSO4', 'SH2S', 'XS', 'SCH4']

# Number of environmental-to-water-quality links to generate.
num_env_wq_links = 23  # Same as the number of links to remove

# Every (environmental, water-quality) pair, environmental variable outermost.
all_possible_env_wq = []
for env in environmental_features:
    all_possible_env_wq.extend((env, wq) for wq in water_quality_features)

# relationships_keep only contains water-quality -> water-quality pairs, so
# nothing has to be filtered out of the candidate list before sampling.

# Draw the random environmental -> water-quality links (without replacement).
random_env_wq = random.sample(all_possible_env_wq, num_env_wq_links)

# Final edge list: the kept links followed by the randomly drawn ones.
relationships = [*relationships_keep, *random_env_wq]

# Translate names to ids and lay the edges out as a (2, num_edges) long tensor.
edge_pairs = [(node_mapping[src], node_mapping[dst]) for src, dst in relationships]
edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()

# ============================
# 3. Model Definition
# ============================

class FlexibleGNN(torch.nn.Module):
    """Configurable stack of ``GATConv`` layers, a global pooling step and a linear head.

    Args:
        input_dim: Width of the feature vectors fed to the first layer.
        hidden_layers: Sequence of per-layer output widths. When it is shorter
            than ``num_layers`` its last entry is reused for the remaining
            layers.
        dropout_rate: Dropout probability, passed to every ``GATConv`` and
            also applied after every activation.
        activation: One of ``'relu'``, ``'tanh'``, ``'sigmoid'``.
        num_layers: Number of ``GATConv`` layers to stack.
        aggregation_type: One of ``'mean'``, ``'sum'``, ``'max'``; selects the
            global pooling function applied before the linear head.
        heads: Attention heads per ``GATConv`` layer. Defaults to 8, the value
            that was previously hard-coded.

    Raises:
        ValueError: If ``activation`` or ``aggregation_type`` is not supported.
    """

    def __init__(self, input_dim, hidden_layers, dropout_rate, activation, num_layers, aggregation_type, heads=8):
        super().__init__()
        self.num_layers = num_layers
        self.aggregation_type = aggregation_type
        self.activation = activation
        self.dropout_rate = dropout_rate
        self.heads = heads

        # Element-wise non-linearity, selected by name.
        activation_fns = {
            'relu': torch.relu,
            'tanh': torch.tanh,
            'sigmoid': torch.sigmoid,
        }
        if activation not in activation_fns:
            raise ValueError(f"Unsupported activation: {activation}")
        self.activation_fn = activation_fns[activation]

        # Global pooling function, selected by name.
        # NOTE(review): PyG documents the sum readout as `global_add_pool`;
        # confirm `global_sum_pool` is importable in the installed version.
        if aggregation_type == 'mean':
            self.agg_fn = global_mean_pool
        elif aggregation_type == 'sum':
            self.agg_fn = global_sum_pool
        elif aggregation_type == 'max':
            self.agg_fn = global_max_pool
        else:
            raise ValueError(f"Unsupported aggregation type: {aggregation_type}")

        # Attention layers. All but the last concatenate their heads, so the
        # next layer's input width is out_dim * heads; the last layer uses
        # concat=False and therefore keeps the plain out_dim width.
        self.gat_layers = torch.nn.ModuleList()
        prev_dim = input_dim
        for i in range(num_layers):
            out_dim = hidden_layers[i] if i < len(hidden_layers) else hidden_layers[-1]
            concat = i < num_layers - 1
            self.gat_layers.append(GATConv(prev_dim, out_dim, heads=heads, concat=concat, dropout=dropout_rate))
            prev_dim = out_dim * heads if concat else out_dim

        # Linear regression head producing a single value.
        self.fc = torch.nn.Linear(prev_dim, 1)

        # Dropout applied after each activation in forward().
        self.dropout = torch.nn.Dropout(p=dropout_rate)

    def forward(self, x, edge_index, batch):
        """Apply the GAT stack, pool with ``agg_fn`` over ``batch`` and return the head output."""
        for gat in self.gat_layers:
            x = self.dropout(self.activation_fn(gat(x, edge_index)))
        pooled = self.agg_fn(x, batch)
        return self.fc(pooled)

# ============================
# 4. Dataset Preparation
# ============================

class GraphDataset(torch.utils.data.Dataset):
    """Single-item dataset exposing one feature/target table as a PyG ``Data`` object.

    Args:
        data_tensor: 2-D tensor whose columns hold the features and the target.
        edge_index: Edge index tensor stored unchanged on every returned item.
        target_col: Column index of the target (default ``-1``, the last
            column). All remaining columns become the feature matrix ``x``.

    Raises:
        IndexError: If ``target_col`` is out of range for ``data_tensor``.
    """

    def __init__(self, data_tensor, edge_index, target_col=-1):
        super().__init__()
        # Index the target first so an out-of-range target_col fails with the
        # tensor's own IndexError before anything else is computed.
        self.y = data_tensor[:, target_col].unsqueeze(1)

        num_cols = data_tensor.size(1)
        target_idx = target_col % num_cols  # normalise negative indices
        if target_idx == num_cols - 1:
            # Default layout (target last): keep the plain slice view.
            self.x = data_tensor[:, :-1]
        else:
            # Drop exactly the target column. Previously the last column was
            # always dropped, which left the target inside the features.
            self.x = torch.cat(
                (data_tensor[:, :target_idx], data_tensor[:, target_idx + 1:]),
                dim=1,
            )
        self.edge_index = edge_index

    def __len__(self):
        return 1  # Single graph

    def __getitem__(self, idx):
        # idx is ignored: the dataset always returns the same whole-table graph.
        return Data(x=self.x, edge_index=self.edge_index, y=self.y)

# Module-level dataset wrapping the whole table with the default target column
# (the last one). The code visible in this cell does not reference it again.
dataset = GraphDataset(data_tensor, edge_index)

# ============================
# 5. Evaluation Function
# ============================

def evaluate(model, loader, criterion, device):
    """Score ``model`` on every batch of ``loader`` and return regression metrics.

    Args:
        model: Module called as ``model(x, edge_index, batch)``.
        loader: Iterable of batches providing ``x``, ``edge_index``, ``batch``
            and ``y`` and supporting ``.to(device)``.
        criterion: Accepted for signature compatibility; not used here.
        device: Device each batch is moved to before the forward pass.

    Returns:
        Tuple ``(mae, mse, rmse, r2)`` over the stacked predictions of all
        batches.
    """
    model.eval()
    pred_chunks, target_chunks = [], []
    with torch.no_grad():
        for batch_data in loader:
            batch_data = batch_data.to(device)
            scores = model(batch_data.x, batch_data.edge_index, batch_data.batch)
            pred_chunks.append(scores.cpu().numpy())
            target_chunks.append(batch_data.y.cpu().numpy())

    # Stack the per-batch arrays row-wise before computing the metrics.
    y_pred = np.vstack(pred_chunks)
    y_true = np.vstack(target_chunks)

    mse = mean_squared_error(y_true, y_pred)
    return (
        mean_absolute_error(y_true, y_pred),
        mse,
        np.sqrt(mse),
        r2_score(y_true, y_pred),
    )

# ============================
# 6. Cross-Validation and Hyperparameter Search
# ============================

# Initialize KFold
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Initialize variables to store the best results
best_result = None
best_val_r2 = -np.inf

# Initialize a list to store all results
all_results = []

# Loop-invariant objects, built once instead of once per trial / fold / epoch.
X = data_tensor[:, :-1].numpy()  # used for fold splitting and the input width
num_nodes = data_tensor.size(0)
single_graph_batch = torch.zeros(num_nodes, dtype=torch.long, device=device)  # every row is in graph 0
criterion = torch.nn.MSELoss()
data = Data(x=data_tensor[:, :-1], edge_index=edge_index, y=data_tensor[:, -1].unsqueeze(1)).to(device)


def _make_optimizer(name, parameters, lr, weight_decay):
    """Return the torch optimizer selected by ``name``; raise ValueError if unknown."""
    optimizer_classes = {
        'adam': torch.optim.Adam,
        'sgd': torch.optim.SGD,
        'rmsprop': torch.optim.RMSprop,
    }
    if name not in optimizer_classes:
        raise ValueError(f"Unsupported optimizer: {name}")
    return optimizer_classes[name](parameters, lr=lr, weight_decay=weight_decay)


def _make_scheduler(name, optimizer, num_epochs):
    """Return the LR scheduler selected by ``name`` (None for 'constant'); raise ValueError if unknown."""
    if name == 'constant':
        return None
    if name == 'step':
        return torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1)
    if name == 'cosine':
        return torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)
    raise ValueError(f"Unsupported learning rate scheduler: {name}")


def _regression_metrics(targets, preds):
    """Return MAE, MSE, RMSE and R² for ``preds`` against ``targets`` as a dict."""
    mse = mean_squared_error(targets, preds)
    return {
        'mae': mean_absolute_error(targets, preds),
        'mse': mse,
        'rmse': np.sqrt(mse),
        'r2': r2_score(targets, preds),
    }


print("Starting randomized search with five-fold cross-validation...\n")
for idx, params in enumerate(tqdm(param_list, desc="Hyperparameter combinations")):
    # Per-fold scores, keyed '<split>_<metric>' (train_* first, then val_*).
    fold_metrics = {
        f'{split}_{metric}': []
        for split in ('train', 'val')
        for metric in ('mae', 'mse', 'rmse', 'r2')
    }

    # Perform K-Fold cross-validation
    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        # Boolean row masks for this fold.
        train_mask = torch.zeros(num_nodes, dtype=torch.bool)
        val_mask = torch.zeros(num_nodes, dtype=torch.bool)
        train_mask[train_idx] = True
        val_mask[val_idx] = True
        data.train_mask = train_mask.to(device)
        data.val_mask = val_mask.to(device)

        # Fresh model, optimizer and scheduler for every fold.
        model = FlexibleGNN(
            input_dim=X.shape[1],
            hidden_layers=params['hidden_layer_sizes'],
            dropout_rate=params['dropout_rate'],
            activation=params['activation'],
            num_layers=params['num_layers'],
            aggregation_type=params['aggregation_type']
        ).to(device)
        optimizer = _make_optimizer(
            params['optimizer'],
            model.parameters(),
            lr=params['learning_rate'],
            weight_decay=params['weight_decay'],
        )
        scheduler = _make_scheduler(params['learning_rate_scheduler'], optimizer, params['num_epochs'])

        # Training loop: full-batch forward pass, loss on the training rows only.
        # NOTE(review): the masks have one entry per data row, so `out` must
        # have one row per data row as well; confirm this matches what
        # FlexibleGNN returns after its global pooling step.
        model.train()
        for epoch in range(params['num_epochs']):
            optimizer.zero_grad()
            out = model(data.x, data.edge_index, single_graph_batch)
            loss = criterion(out[data.train_mask], data.y[data.train_mask])
            loss.backward()
            optimizer.step()
            if scheduler is not None:
                scheduler.step()

        # Score with a dedicated inference pass. The last training-loop output
        # was produced with dropout active and before the final optimizer
        # step, so it does not reflect the trained model.
        model.eval()
        with torch.no_grad():
            out = model(data.x, data.edge_index, single_graph_batch)

        for split, mask in (('train', data.train_mask), ('val', data.val_mask)):
            scores = _regression_metrics(
                data.y[mask].cpu().numpy(),
                out[mask].cpu().numpy(),
            )
            for metric, value in scores.items():
                fold_metrics[f'{split}_{metric}'].append(value)

    # Aggregate metrics across folds
    avg_metrics = {metric: np.mean(values) for metric, values in fold_metrics.items()}
    avg_metrics['params'] = params
    all_results.append(avg_metrics)

    # Update best result based on validation R2
    if avg_metrics['val_r2'] > best_val_r2:
        best_val_r2 = avg_metrics['val_r2']
        best_result = avg_metrics

# ============================
# 7. Results Analysis
# ============================

# Collect every hyperparameter trial into one table (one row per trial).
results_df = pd.DataFrame(all_results)

# Rank the trials by mean validation R² (best first) and show the ten leaders.
ranked_results = results_df.sort_values(by='val_r2', ascending=False)
top_results = ranked_results.head(10)
print("\nTop hyperparameter combinations based on average validation R²:")
print(top_results[['params', 'val_r2']])

# Persist the complete search log for inspection outside the notebook.
all_results_file = 'EcoGNNknowledge_Model_Model_AllResults.xlsx'
results_df.to_excel(all_results_file, index=False)
print(f"\nAll hyperparameter search results saved to '{all_results_file}'")

# ============================
# 8. Final Model Training (Optional)
# ============================

# Optionally, retrain the model on the entire dataset using the best hyperparameters
# Note: This step is optional and depends on whether you need a final model for deployment

# Initialize the final model
# Winning configuration, bound once to keep the code below readable.
final_cfg = best_result['params']

# Build the final network from the winning configuration.
final_model = FlexibleGNN(
    input_dim=data_tensor.size(1) - 1,  # every column except the target
    hidden_layers=final_cfg['hidden_layer_sizes'],
    dropout_rate=final_cfg['dropout_rate'],
    activation=final_cfg['activation'],
    num_layers=final_cfg['num_layers'],
    aggregation_type=final_cfg['aggregation_type']
)
final_model = final_model.to(device)

# Resolve the optimizer class by name; all of them share lr / weight_decay.
optimizer_classes = {
    'adam': torch.optim.Adam,
    'sgd': torch.optim.SGD,
    'rmsprop': torch.optim.RMSprop,
}
if final_cfg['optimizer'] not in optimizer_classes:
    raise ValueError(f"Unsupported optimizer: {best_result['params']['optimizer']}")
optimizer = optimizer_classes[final_cfg['optimizer']](
    final_model.parameters(),
    lr=final_cfg['learning_rate'],
    weight_decay=final_cfg['weight_decay'],
)

# Resolve the learning-rate schedule; 'constant' means no scheduler at all.
scheduler_name = final_cfg['learning_rate_scheduler']
if scheduler_name not in ('constant', 'step', 'cosine'):
    raise ValueError(f"Unsupported learning rate scheduler: {best_result['params']['learning_rate_scheduler']}")
scheduler = None
if scheduler_name == 'step':
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1)
elif scheduler_name == 'cosine':
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=final_cfg['num_epochs'])

# Mean-squared-error objective for the regression target.
criterion = torch.nn.MSELoss()

# One Data object holding all rows: features are every column but the last,
# the target is the last column reshaped to (rows, 1).
full_data = Data(x=data_tensor[:, :-1], edge_index=edge_index, y=data_tensor[:, -1].unsqueeze(1)).to(device)

# Loader over the single graph (the training loop below uses full_data directly).
final_loader = DataLoader([full_data], batch_size=1, shuffle=True)

# All rows belong to graph 0, so the batch vector is constant across epochs.
single_graph_batch = torch.zeros(full_data.x.size(0), dtype=torch.long).to(device)

# Full-batch training for the selected number of epochs.
final_model.train()
for epoch in range(final_cfg['num_epochs']):
    optimizer.zero_grad()
    out = final_model(full_data.x, full_data.edge_index, single_graph_batch)
    loss = criterion(out, full_data.y)
    loss.backward()
    optimizer.step()
    if scheduler is not None:
        scheduler.step()
    # Report the first epoch and then every tenth one.
    is_report_epoch = epoch == 0 or (epoch + 1) % 10 == 0
    if is_report_epoch:
        print(f"Epoch {epoch+1}/{best_result['params']['num_epochs']}, Loss: {loss.item():.4f}")

# ============================
# 9. Final Evaluation
# ============================

# Cross-validation above already estimated how well the chosen hyperparameters
# generalise, so it is not repeated here.
#
# The metrics below are computed on the same data the final model was trained
# on; they describe goodness of fit, not generalisation performance.

# Switch to inference behaviour and score the final model on the full graph.
final_model.eval()
with torch.no_grad():
    whole_graph_batch = torch.zeros(full_data.x.size(0), dtype=torch.long).to(device)
    out = final_model(full_data.x, full_data.edge_index, whole_graph_batch)

# Move predictions and targets to NumPy for the scikit-learn metrics.
preds = out.cpu().numpy()
targets = full_data.y.cpu().numpy()

final_mse = mean_squared_error(targets, preds)
final_rmse = np.sqrt(final_mse)
final_mae = mean_absolute_error(targets, preds)
final_r2 = r2_score(targets, preds)

print("\nFinal Model Performance on Entire Dataset:")
print(f"MAE: {final_mae:.4f}, MSE: {final_mse:.4f}, RMSE: {final_rmse:.4f}, R²: {final_r2:.4f}")

# ============================
# 10. Save Best Hyperparameters and Results
# ============================

# Save the best hyperparameters and performance metrics
# Destination of the one-row summary workbook.
best_results_file = 'EcoGNNknowledge_Model_BestResults.xlsx'

# One-row summary: the winning hyperparameters plus the final-model metrics.
results_to_save = {
    'Best Params': [best_result['params']],
    'Final MAE': [final_mae],
    'Final MSE': [final_mse],
    'Final RMSE': [final_rmse],
    'Final R²': [final_r2],
}

# Tabulate and write the summary, then report where it went.
results_summary = pd.DataFrame(results_to_save)
results_summary.to_excel(best_results_file, index=False)
print(f"\nBest model results saved to '{best_results_file}'")


In [None]:
import torch
import pandas as pd
import numpy as np
import random
from torch_geometric.nn import GATConv, global_mean_pool, global_sum_pool, global_max_pool
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold, ParameterSampler
from torch_geometric.data import Data, DataLoader
from tqdm import tqdm

# ============================
# 1. Setup and Configuration
# ============================

# Set random seeds for reproducibility
def set_seed(seed):
    """Seed the PyTorch, NumPy and Python ``random`` generators with ``seed``.

    When CUDA is available the current CUDA device's generator is seeded too.
    """
    for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)


set_seed(42)

# Define the hyperparameter search space
# Candidate values for every hyperparameter of the random search.
search_space = dict(
    hidden_layer_sizes=[(64,), (128,), (64, 64), (128, 128), (64, 128, 64)],
    learning_rate=[0.001, 0.01, 0.1],
    num_epochs=[50, 100, 200],
    batch_size=[16, 32, 64, 128],
    dropout_rate=[0.0, 0.2, 0.5],
    weight_decay=[0.0, 0.0001, 0.001, 0.01],
    activation=['relu', 'tanh', 'sigmoid'],
    num_layers=[2, 3, 4],
    aggregation_type=['mean', 'sum', 'max'],
    optimizer=['adam', 'sgd', 'rmsprop'],
    learning_rate_scheduler=['constant', 'step', 'cosine'],
)

# Number of configurations to draw; adjust to the available compute budget.
n_iter = 100

# Materialise the sampled configurations once so every run of the loop sees
# the same list (the sampler is seeded with 42).
sampler = ParameterSampler(search_space, n_iter=n_iter, random_state=42)
param_list = list(sampler)

# Number of folds for cross-validation
num_folds = 5

# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# ============================
# 2. Data Loading and Processing
# ============================

# Read the single dataset file
# Load the single CSV file and drop every row that contains a missing value.
data_path = 'path_to_your_data.csv'  # Replace with your actual file path
data_df = pd.read_csv(data_path).dropna()

# Convert DataFrame to tensor
data_tensor = torch.tensor(data_df.values, dtype=torch.float32)

# Define feature names and mapping (ensure these match your CSV columns)
features = ['ORP', 'V', 'DO', 'pH', 'SF', 'Spro', 'Sac', 'Sh', 'SSO4', 'SH2S', 'XS', 'SCH4']
node_mapping = {feat: idx for idx, feat in enumerate(features)}

# ============================
# 2.1. Define and Modify Relationships
# ============================

# External relationships to retain unchanged.
external_relationships = [
    ('ORP', 'SF'), ('ORP', 'Spro'), ('ORP', 'Sac'), ('ORP', 'Sh'),
    ('V', 'Sac'), ('V', 'Sh'), ('V', 'SSO4'), ('V', 'SH2S'), ('V', 'XS'), ('V', 'SCH4'),
    ('DO', 'SF'), ('DO', 'Spro'), ('DO', 'Sac'), ('DO', 'Sh'), ('DO', 'SH2S'), ('DO', 'XS'), ('DO', 'SCH4'),
    ('pH', 'SF'), ('pH', 'Spro'), ('pH', 'Sac'), ('pH', 'Sh'), ('pH', 'SSO4'), ('pH', 'XS')
]

# Internal relationships that must not appear among the sampled links.
internal_relationships_to_remove = [
    ('SF', 'Spro'), ('SF', 'Sac'), ('Sac', 'SH2S'), ('SSO4', 'SH2S'), ('Sh', 'SH2S'),
    ('XS', 'SF'), ('SH2S', 'SCH4'), ('Sac', 'SCH4'), ('Sh', 'SCH4'), ('SF', 'Sh')
]

# Define internal features
internal_features = ['SF', 'Spro', 'Sac', 'Sh', 'SSO4', 'SH2S', 'XS', 'SCH4']

# Every directed pair of distinct internal features.
all_possible_internal_relationships = [
    (src, dst) for src in internal_features for dst in internal_features
    if src != dst
]

# Filter in list order rather than through a set difference. The iteration
# order of a set of string tuples depends on per-process hash randomisation,
# so sampling from `list(set_a - set_b)` could differ between runs even with
# the random seed fixed.
excluded_internal = set(internal_relationships_to_remove)
remaining_internal_relationships = [
    rel for rel in all_possible_internal_relationships
    if rel not in excluded_internal
]

# Check if there are enough possible relationships to sample
num_new_internal_links = 10
if len(remaining_internal_relationships) < num_new_internal_links:
    raise ValueError("Not enough possible internal relationships to sample from.")

# Randomly sample the replacement internal relationships (without replacement).
new_internal_relationships = random.sample(remaining_internal_relationships, num_new_internal_links)

# Combine external relationships with new internal relationships
relationships = external_relationships + new_internal_relationships

print(f"Total relationships after modification: {len(relationships)}")
print("Relationships:")
for rel in relationships:
    print(rel)

# Translate names to ids and lay the edges out as a (2, num_edges) long tensor.
edge_index = torch.tensor([[node_mapping[src], node_mapping[dst]] for src, dst in relationships], dtype=torch.long).t().contiguous()

# ============================
# 3. Model Definition
# ============================

class FlexibleGNN(torch.nn.Module):
    """Configurable stack of ``GATConv`` layers, a global pooling step and a linear head.

    Args:
        input_dim: Width of the feature vectors fed to the first layer.
        hidden_layers: Sequence of per-layer output widths. When it is shorter
            than ``num_layers`` its last entry is reused for the remaining
            layers.
        dropout_rate: Dropout probability, passed to every ``GATConv`` and
            also applied after every activation.
        activation: One of ``'relu'``, ``'tanh'``, ``'sigmoid'``.
        num_layers: Number of ``GATConv`` layers to stack.
        aggregation_type: One of ``'mean'``, ``'sum'``, ``'max'``; selects the
            global pooling function applied before the linear head.
        heads: Attention heads per ``GATConv`` layer. Defaults to 8, the value
            that was previously hard-coded.

    Raises:
        ValueError: If ``activation`` or ``aggregation_type`` is not supported.
    """

    def __init__(self, input_dim, hidden_layers, dropout_rate, activation, num_layers, aggregation_type, heads=8):
        super().__init__()
        self.num_layers = num_layers
        self.aggregation_type = aggregation_type
        self.activation = activation
        self.dropout_rate = dropout_rate
        self.heads = heads

        # Element-wise non-linearity, selected by name.
        activation_fns = {
            'relu': torch.relu,
            'tanh': torch.tanh,
            'sigmoid': torch.sigmoid,
        }
        if activation not in activation_fns:
            raise ValueError(f"Unsupported activation: {activation}")
        self.activation_fn = activation_fns[activation]

        # Global pooling function, selected by name.
        # NOTE(review): PyG documents the sum readout as `global_add_pool`;
        # confirm `global_sum_pool` is importable in the installed version.
        if aggregation_type == 'mean':
            self.agg_fn = global_mean_pool
        elif aggregation_type == 'sum':
            self.agg_fn = global_sum_pool
        elif aggregation_type == 'max':
            self.agg_fn = global_max_pool
        else:
            raise ValueError(f"Unsupported aggregation type: {aggregation_type}")

        # Attention layers. All but the last concatenate their heads, so the
        # next layer's input width is out_dim * heads; the last layer uses
        # concat=False and therefore keeps the plain out_dim width.
        # (The former `activations` / `dropouts` ModuleLists were never filled
        # or read, so they are no longer created.)
        self.gat_layers = torch.nn.ModuleList()
        prev_dim = input_dim
        for i in range(num_layers):
            out_dim = hidden_layers[i] if i < len(hidden_layers) else hidden_layers[-1]
            concat = i < num_layers - 1
            self.gat_layers.append(GATConv(prev_dim, out_dim, heads=heads, concat=concat, dropout=dropout_rate))
            prev_dim = out_dim * heads if concat else out_dim

        # Linear regression head producing a single value.
        self.fc = torch.nn.Linear(prev_dim, 1)

        # Dropout applied after each activation in forward().
        self.dropout = torch.nn.Dropout(p=dropout_rate)

    def forward(self, x, edge_index, batch):
        """Apply the GAT stack, pool with ``agg_fn`` over ``batch`` and return the head output."""
        for gat in self.gat_layers:
            x = self.dropout(self.activation_fn(gat(x, edge_index)))
        pooled = self.agg_fn(x, batch)
        return self.fc(pooled)

# ============================
# 4. Dataset Preparation
# ============================

class GraphDataset(torch.utils.data.Dataset):
    """Single-item dataset exposing one feature/target table as a PyG ``Data`` object.

    Args:
        data_tensor: 2-D tensor whose columns hold the features and the target.
        edge_index: Edge index tensor stored unchanged on every returned item.
        target_col: Column index of the target (default ``-1``, the last
            column). All remaining columns become the feature matrix ``x``.

    Raises:
        IndexError: If ``target_col`` is out of range for ``data_tensor``.
    """

    def __init__(self, data_tensor, edge_index, target_col=-1):
        super().__init__()
        # Index the target first so an out-of-range target_col fails with the
        # tensor's own IndexError before anything else is computed.
        self.y = data_tensor[:, target_col].unsqueeze(1)

        num_cols = data_tensor.size(1)
        target_idx = target_col % num_cols  # normalise negative indices
        if target_idx == num_cols - 1:
            # Default layout (target last): keep the plain slice view.
            self.x = data_tensor[:, :-1]
        else:
            # Drop exactly the target column. Previously the last column was
            # always dropped, which left the target inside the features.
            self.x = torch.cat(
                (data_tensor[:, :target_idx], data_tensor[:, target_idx + 1:]),
                dim=1,
            )
        self.edge_index = edge_index

    def __len__(self):
        return 1  # Single graph

    def __getitem__(self, idx):
        # idx is ignored: the dataset always returns the same whole-table graph.
        return Data(x=self.x, edge_index=self.edge_index, y=self.y)

# Module-level dataset wrapping the whole table with the default target column
# (the last one). It is not referenced in the portion of this cell visible here.
dataset = GraphDataset(data_tensor, edge_index)

# ============================
# 5. Evaluation Function
# ============================

def evaluate(model, loader, criterion, device):
    """Score ``model`` over every batch in ``loader``.

    The model is switched to eval mode and run without gradients; predictions
    and ``data.y`` targets from all batches are stacked row-wise before the
    metrics are computed. ``criterion`` is accepted but not used.

    Returns:
        Tuple ``(mae, mse, rmse, r2)``.
    """
    model.eval()
    pred_chunks, target_chunks = [], []
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            output = model(batch.x, batch.edge_index, batch.batch)
            pred_chunks.append(output.cpu().numpy())
            target_chunks.append(batch.y.cpu().numpy())

    y_pred = np.vstack(pred_chunks)
    y_true = np.vstack(target_chunks)
    mse = mean_squared_error(y_true, y_pred)
    return (
        mean_absolute_error(y_true, y_pred),
        mse,
        np.sqrt(mse),
        r2_score(y_true, y_pred),
    )

# ============================
# 6. Cross-Validation and Hyperparameter Search
# ============================

# Initialize KFold
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Initialize variables to store the best results
best_result = None
best_val_r2 = -np.inf

# Initialize a list to store all results
all_results = []

# Fold-independent inputs: built once instead of once per fold and combination.
X = data_tensor[:, :-1].numpy()
y = data_tensor[:, -1].numpy()
num_nodes = data_tensor.size(0)
data = Data(x=data_tensor[:, :-1], edge_index=edge_index, y=data_tensor[:, -1].unsqueeze(1)).to(device)
# NOTE(review): edge_index is used as given; confirm its indices refer to rows of data_tensor.

# The loss and metrics index the model output with per-node masks, so the
# model must return one prediction per node. An all-zero batch vector pools
# every node into a single [1, 1] output, which cannot be indexed by an
# [N] mask. Giving each node its own pooling group keeps the output at [N, 1].
# (With single-node groups the mean/sum/max aggregations coincide.)
node_batch = torch.arange(num_nodes, dtype=torch.long, device=device)

optimizer_classes = {
    'adam': torch.optim.Adam,
    'sgd': torch.optim.SGD,
    'rmsprop': torch.optim.RMSprop,
}

metric_names = (
    'train_mae', 'train_mse', 'train_rmse', 'train_r2',
    'val_mae', 'val_mse', 'val_rmse', 'val_r2',
)

print("Starting randomized search with five-fold cross-validation...\n")
for idx, params in enumerate(tqdm(param_list, desc="Hyperparameter combinations")):
    fold_metrics = {name: [] for name in metric_names}

    # Perform K-Fold cross-validation
    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        # Create node masks for this fold
        train_mask = torch.zeros(num_nodes, dtype=torch.bool)
        val_mask = torch.zeros(num_nodes, dtype=torch.bool)
        train_mask[train_idx] = True
        val_mask[val_idx] = True
        data.train_mask = train_mask.to(device)
        data.val_mask = val_mask.to(device)

        # Initialize the model
        model = FlexibleGNN(
            input_dim=X.shape[1],
            hidden_layers=params['hidden_layer_sizes'],
            dropout_rate=params['dropout_rate'],
            activation=params['activation'],
            num_layers=params['num_layers'],
            aggregation_type=params['aggregation_type']
        ).to(device)

        # Define optimizer
        if params['optimizer'] not in optimizer_classes:
            raise ValueError(f"Unsupported optimizer: {params['optimizer']}")
        optimizer = optimizer_classes[params['optimizer']](
            model.parameters(),
            lr=params['learning_rate'],
            weight_decay=params['weight_decay'],
        )

        # Define learning rate scheduler
        if params['learning_rate_scheduler'] == 'constant':
            scheduler = None
        elif params['learning_rate_scheduler'] == 'step':
            scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1)
        elif params['learning_rate_scheduler'] == 'cosine':
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=params['num_epochs'])
        else:
            raise ValueError(f"Unsupported learning rate scheduler: {params['learning_rate_scheduler']}")

        # Define loss function
        criterion = torch.nn.MSELoss()

        # Training loop: the loss only sees the training nodes of this fold
        model.train()
        for epoch in range(params['num_epochs']):
            optimizer.zero_grad()
            out = model(data.x, data.edge_index, node_batch)
            loss = criterion(out[data.train_mask], data.y[data.train_mask])
            loss.backward()
            optimizer.step()
            if scheduler is not None:
                scheduler.step()

        # Score the trained weights with dropout disabled. Reusing the last
        # training-mode output would include dropout noise and miss the final
        # optimizer step.
        model.eval()
        with torch.no_grad():
            out = model(data.x, data.edge_index, node_batch)

        for split, mask in (('train', data.train_mask), ('val', data.val_mask)):
            split_preds = out[mask].cpu().numpy()
            split_targets = data.y[mask].cpu().numpy()
            split_mse = mean_squared_error(split_targets, split_preds)
            fold_metrics[f'{split}_mae'].append(mean_absolute_error(split_targets, split_preds))
            fold_metrics[f'{split}_mse'].append(split_mse)
            fold_metrics[f'{split}_rmse'].append(np.sqrt(split_mse))
            fold_metrics[f'{split}_r2'].append(r2_score(split_targets, split_preds))

    # Aggregate metrics across folds
    avg_metrics = {metric: np.mean(values) for metric, values in fold_metrics.items()}
    avg_metrics['params'] = params
    all_results.append(avg_metrics)

    # Update best result based on validation R2
    if avg_metrics['val_r2'] > best_val_r2:
        best_val_r2 = avg_metrics['val_r2']
        best_result = avg_metrics

# ============================
# 7. Results Analysis
# ============================

# One row per hyperparameter combination, columns are the fold-averaged metrics
results_df = pd.DataFrame(all_results)

# Rank by average validation R² (best first) and keep the ten leading rows
ranked_results = results_df.sort_values('val_r2', ascending=False)
top_results = ranked_results.head(10)
print("\nTop hyperparameter combinations based on average validation R²:")
print(top_results.loc[:, ['params', 'val_r2']])

# Persist the complete search table for later inspection
results_df.to_excel('EcoGNNSEM_Model_AllResults.xlsx', index=False)
print("\nAll hyperparameter search results saved to 'EcoGNNSEM_Model_AllResults.xlsx'")

# Hyperparameters of the combination tracked as best during the search
best_params = best_result['params']
print(f"\nBest Hyperparameters: {best_params}")

# ============================
# 8. Final Model Training (Optional)
# ============================

# Optionally, retrain the model on the entire dataset using the best hyperparameters
# Note: This step is optional and depends on whether you need a final model for deployment

# Initialize the final model
final_model = FlexibleGNN(
    input_dim=data_tensor.size(1) - 1,  # Number of features
    hidden_layers=best_params['hidden_layer_sizes'],
    dropout_rate=best_params['dropout_rate'],
    activation=best_params['activation'],
    num_layers=best_params['num_layers'],
    aggregation_type=best_params['aggregation_type']
).to(device)

# Define optimizer
if best_params['optimizer'] == 'adam':
    optimizer = torch.optim.Adam(final_model.parameters(), lr=best_params['learning_rate'], weight_decay=best_params['weight_decay'])
elif best_params['optimizer'] == 'sgd':
    optimizer = torch.optim.SGD(final_model.parameters(), lr=best_params['learning_rate'], weight_decay=best_params['weight_decay'])
elif best_params['optimizer'] == 'rmsprop':
    optimizer = torch.optim.RMSprop(final_model.parameters(), lr=best_params['learning_rate'], weight_decay=best_params['weight_decay'])
else:
    raise ValueError(f"Unsupported optimizer: {best_params['optimizer']}")

# Define learning rate scheduler
if best_params['learning_rate_scheduler'] == 'constant':
    scheduler = None
elif best_params['learning_rate_scheduler'] == 'step':
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1)
elif best_params['learning_rate_scheduler'] == 'cosine':
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=best_params['num_epochs'])
else:
    raise ValueError(f"Unsupported learning rate scheduler: {best_params['learning_rate_scheduler']}")

# Define loss function
criterion = torch.nn.MSELoss()

# Create a single Data object for the entire dataset
full_data = Data(x=data_tensor[:, :-1], edge_index=edge_index, y=data_tensor[:, -1].unsqueeze(1)).to(device)

# Create DataLoader (not consumed below; the graph is fed to the model directly)
final_loader = DataLoader([full_data], batch_size=1, shuffle=True)

# The targets are per node ([N, 1]). An all-zero batch vector pools the whole
# graph into one [1, 1] prediction, which MSELoss would silently broadcast
# against every target and which cannot be scored against N targets.
# One pooling group per node keeps the output at [N, 1].
full_batch = torch.arange(full_data.x.size(0), dtype=torch.long, device=device)

# Training loop for the final model
final_model.train()
for epoch in range(best_params['num_epochs']):
    optimizer.zero_grad()
    out = final_model(full_data.x, full_data.edge_index, full_batch)
    loss = criterion(out, full_data.y)
    loss.backward()
    optimizer.step()
    if scheduler is not None:
        scheduler.step()
    if (epoch + 1) % 10 == 0 or epoch == 0:
        print(f"Epoch {epoch+1}/{best_params['num_epochs']}, Loss: {loss.item():.4f}")

# ============================
# 9. Final Evaluation
# ============================

# The metrics below are computed on the same rows the final model was trained
# on, so they describe fit quality, not generalization; the cross-validation
# averages from the search remain the out-of-sample estimate.

final_model.eval()
with torch.no_grad():
    out = final_model(full_data.x, full_data.edge_index, full_batch)
preds = out.cpu().numpy()
targets = full_data.y.cpu().numpy()
final_mae = mean_absolute_error(targets, preds)
final_mse = mean_squared_error(targets, preds)
final_rmse = np.sqrt(final_mse)
final_r2 = r2_score(targets, preds)

print("\nFinal Model Performance on Entire Dataset:")
print(f"MAE: {final_mae:.4f}, MSE: {final_mse:.4f}, RMSE: {final_rmse:.4f}, R²: {final_r2:.4f}")

# ============================
# 10. Save Best Hyperparameters and Results
# ============================

# Collect the winning hyperparameters and the final-model metrics as one-row columns
summary_columns = [
    ('Best Params', best_params),
    ('Final MAE', final_mae),
    ('Final MSE', final_mse),
    ('Final RMSE', final_rmse),
    ('Final R²', final_r2),
]
results_to_save = {column: [value] for column, value in summary_columns}

# Single-row summary table
results_summary = pd.DataFrame(results_to_save)

# Write the summary to Excel
results_summary.to_excel('EcoGNNSEM_Model_BestResults.xlsx', index=False)
print("\nBest model results saved to 'EcoGNNSEM_Model_BestResults.xlsx'")


In [None]:
import torch
import pandas as pd
import numpy as np
import random
from torch_geometric.nn import GATConv, global_mean_pool, global_sum_pool, global_max_pool
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold, ParameterSampler
from torch_geometric.data import Data, DataLoader
from tqdm import tqdm

# ============================
# 1. Setup and Configuration
# ============================

# Seed every random number generator this script draws from
def set_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch (CPU, plus CUDA when present)."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)

set_seed(42)

# Define the hyperparameter search space
# Each key lists the candidate values ParameterSampler draws from.
search_space = {
    # Per-layer output widths; when 'num_layers' exceeds the tuple length the
    # model reuses the last width for the remaining layers.
    'hidden_layer_sizes': [(64,), (128,), (64, 64), (128, 128), (64, 128, 64)],
    'learning_rate': [0.001, 0.01, 0.1],
    'num_epochs': [50, 100, 200],
    # NOTE(review): the training code visible in this cell never reads
    # params['batch_size'] (the whole graph is processed in one pass), so this
    # dimension only multiplies otherwise-identical configurations.
    'batch_size': [16, 32, 64, 128],
    'dropout_rate': [0.0, 0.2, 0.5],
    'weight_decay': [0.0, 0.0001, 0.001, 0.01],
    'activation': ['relu', 'tanh', 'sigmoid'],
    'num_layers': [2, 3, 4],
    'aggregation_type': ['mean', 'sum', 'max'],
    'optimizer': ['adam', 'sgd', 'rmsprop'],
    'learning_rate_scheduler': ['constant', 'step', 'cosine']
}

# Define the number of random samples from the search space
n_iter = 100  # Adjust based on computational resources

# Generate random hyperparameter combinations
# (fixed random_state so the same combinations are drawn on every run)
param_list = list(ParameterSampler(search_space, n_iter=n_iter, random_state=42))

# Number of folds for cross-validation
num_folds = 5

# Device configuration
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# ============================
# 2. Data Loading and Processing
# ============================

# Read the single dataset file
# Rows containing any missing value are dropped before conversion.
data_path = 'path_to_your_data.csv'  # Replace with your actual file path
data_df = pd.read_csv(data_path).dropna()

# Convert DataFrame to tensor
# Every CSV column is converted; later code treats the last column as the
# target and all preceding columns as features.
data_tensor = torch.tensor(data_df.values, dtype=torch.float32)

# Define feature names and mapping (ensure these match your CSV columns)
# NOTE(review): `features` is only used to number the graph nodes below; it is
# not used to select or reorder CSV columns - confirm the CSV column order.
features = ['ORP', 'V', 'DO', 'pH', 'SF', 'Spro', 'Sac', 'Sh', 'SSO4', 'SH2S', 'XS', 'SCH4']
node_mapping = {feat: idx for idx, feat in enumerate(features)}

# Define causal relationships (edges)
# Each pair is (source feature, target feature).
# NOTE(review): these edges are numbered by feature position, while the Data
# objects built later have one row of x per CSV row - confirm that edge
# indices are meant to address those rows.
relationships = [
    ('ORP', 'SF'), ('ORP', 'Spro'), ('ORP', 'Sac'), ('ORP', 'Sh'),
    ('V', 'Sac'), ('V', 'Sh'), ('V', 'SSO4'), ('V', 'SH2S'), ('V', 'XS'), ('V', 'SCH4'),
    ('DO', 'SF'), ('DO', 'Spro'), ('DO', 'Sac'), ('DO', 'Sh'), ('DO', 'SH2S'), ('DO', 'XS'), ('DO', 'SCH4'),
    ('pH', 'SF'), ('pH', 'Spro'), ('pH', 'Sac'), ('pH', 'Sh'), ('pH', 'SSO4'), ('pH', 'XS'),
    ('SF', 'Spro'), ('SF', 'Sac'), ('Sac', 'SH2S'), ('SSO4', 'SH2S'), ('Sh', 'SH2S'),
    ('XS', 'SF'), ('SH2S', 'SCH4'), ('Sac', 'SCH4'), ('Sh', 'SCH4'), ('SF', 'Sh')
]

# Function to generate random edges
def generate_random_edges(node_mapping, existing_edges, num_random):
    """Draw distinct random directed edges that are not already present.

    Edges are sampled by rejection with the ``random`` module: self-loops,
    edges in ``existing_edges`` and repeats are discarded until
    ``num_random`` edges have been collected.

    Args:
        node_mapping: Mapping whose values are the node indices to draw from.
        existing_edges: Iterable of ``(src, dst)`` tuples to exclude.
        num_random: Number of new edges to generate.

    Returns:
        List of ``(src, dst)`` tuples (order is that of the internal set).

    Raises:
        ValueError: if fewer than ``num_random`` eligible edges exist; without
            this check the rejection loop would never terminate.
    """
    all_nodes = list(node_mapping.values())
    existing_edge_set = set(existing_edges)

    # Count how many directed, non-self-loop edges are still free.
    distinct_nodes = set(all_nodes)
    already_taken = sum(
        1
        for src, dst in existing_edge_set
        if src != dst and src in distinct_nodes and dst in distinct_nodes
    )
    available = len(distinct_nodes) * (len(distinct_nodes) - 1) - already_taken
    if num_random > available:
        raise ValueError(
            f"Cannot generate {num_random} random edges: only {available} "
            "unused directed edges are available"
        )

    random_edges = set()
    while len(random_edges) < num_random:
        src = random.choice(all_nodes)
        dst = random.choice(all_nodes)
        if src == dst:
            continue  # Skip self-loops
        edge = (src, dst)
        if edge in existing_edge_set or edge in random_edges:
            continue  # Skip existing edges and duplicates
        random_edges.add(edge)

    return list(random_edges)

# Translate the named relationships into (source index, target index) pairs
existing_edges = [(node_mapping[src_name], node_mapping[dst_name]) for src_name, dst_name in relationships]

# Draw as many random edges as there are defined relationships
num_random_edges = len(existing_edges)
random_edges = generate_random_edges(node_mapping, existing_edges, num_random=num_random_edges)

# Defined edges first, random edges after them
all_edges = [*existing_edges, *random_edges]

# Edge list -> tensor, transposed so each column is one (source, target) pair
edge_pairs = torch.tensor(all_edges, dtype=torch.long)
edge_index = edge_pairs.t().contiguous()

# ============================
# 3. Model Definition
# ============================

class FlexibleGNN(torch.nn.Module):
    """Stack of GAT layers followed by pooling and a linear regression head.

    Args:
        input_dim: Width of the input node features.
        hidden_layers: Sequence of per-layer output widths; if ``num_layers``
            is larger than its length, the last width is reused.
        dropout_rate: Dropout probability, used both inside each ``GATConv``
            and after each activation.
        activation: One of ``'relu'``, ``'tanh'``, ``'sigmoid'``.
        num_layers: Number of GAT layers to build.
        aggregation_type: One of ``'mean'``, ``'sum'``, ``'max'``.

    Raises:
        ValueError: for an unsupported ``activation`` or ``aggregation_type``.

    NOTE(review): confirm that ``global_sum_pool`` is exported by the installed
    torch_geometric; the sum pooling function is commonly named
    ``global_add_pool``.
    """

    def __init__(self, input_dim, hidden_layers, dropout_rate, activation, num_layers, aggregation_type):
        super().__init__()
        self.num_layers = num_layers
        self.aggregation_type = aggregation_type
        self.activation = activation
        self.dropout_rate = dropout_rate

        # Resolve the activation by name
        activation_table = {
            'relu': torch.relu,
            'tanh': torch.tanh,
            'sigmoid': torch.sigmoid,
        }
        if activation not in activation_table:
            raise ValueError(f"Unsupported activation: {activation}")
        self.activation_fn = activation_table[activation]

        # Resolve the pooling function by name
        pooling_table = {
            'mean': global_mean_pool,
            'sum': global_sum_pool,
            'max': global_max_pool,
        }
        if aggregation_type not in pooling_table:
            raise ValueError(f"Unsupported aggregation type: {aggregation_type}")
        self.agg_fn = pooling_table[aggregation_type]

        # Build the GAT stack; every layer but the last concatenates its heads
        heads = 8  # You can make this a hyperparameter if desired
        self.gat_layers = torch.nn.ModuleList()
        in_features = input_dim
        final_idx = num_layers - 1
        for layer_idx in range(num_layers):
            width = hidden_layers[min(layer_idx, len(hidden_layers) - 1)]
            concat_heads = layer_idx < final_idx
            self.gat_layers.append(
                GATConv(in_features, width, heads=heads, concat=concat_heads, dropout=dropout_rate)
            )
            # Concatenated heads widen the output by a factor of `heads`
            in_features = width * heads if concat_heads else width

        # Regression head on top of the pooled embedding
        self.fc = torch.nn.Linear(in_features, 1)

        # Dropout applied after each activation in forward()
        self.dropout = torch.nn.Dropout(p=dropout_rate)

    def forward(self, x, edge_index, batch):
        """Apply GAT -> activation -> dropout per layer, then pool and regress."""
        hidden = x
        for layer in self.gat_layers:
            hidden = self.dropout(self.activation_fn(layer(hidden, edge_index)))
        pooled = self.agg_fn(hidden, batch)
        return self.fc(pooled)

# ============================
# 4. Dataset Preparation
# ============================

class GraphDataset(torch.utils.data.Dataset):
    """Dataset that exposes one graph built from a 2-D tensor.

    Every column of ``data_tensor`` except ``target_col`` becomes the node
    feature matrix ``x``; the ``target_col`` column becomes ``y`` with shape
    ``(rows, 1)``.

    Args:
        data_tensor: 2-D tensor holding features and the target column.
        edge_index: Edge tensor stored as-is and handed to ``Data``.
        target_col: Column index of the target (negative indices allowed).

    Raises:
        IndexError: if ``target_col`` is out of range for ``data_tensor``.
    """

    def __init__(self, data_tensor, edge_index, target_col=-1):
        super(GraphDataset, self).__init__()
        # Indexing first lets torch reject an out-of-range target column.
        self.y = data_tensor[:, target_col].unsqueeze(1)

        num_cols = data_tensor.size(1)
        target_idx = target_col % num_cols
        if target_idx == num_cols - 1:
            # Default layout: keep the cheap view over the leading columns.
            self.x = data_tensor[:, :-1]
        else:
            # Drop exactly the target column so it cannot leak into the features.
            self.x = torch.cat(
                (data_tensor[:, :target_idx], data_tensor[:, target_idx + 1:]),
                dim=1,
            )
        self.edge_index = edge_index

    def __len__(self):
        return 1  # Single graph

    def __getitem__(self, idx):
        # Without this guard, plain iteration (which stops on IndexError)
        # would yield the same graph forever.
        if idx not in (0, -1):
            raise IndexError(f"GraphDataset holds a single graph; got index {idx}")
        return Data(x=self.x, edge_index=self.edge_index, y=self.y)

# Module-level single-graph dataset. The cross-validation code visible in this
# cell builds its own Data objects directly rather than reading this instance.
dataset = GraphDataset(data_tensor, edge_index)

# ============================
# 5. Evaluation Function
# ============================

def evaluate(model, loader, criterion, device):
    """Score ``model`` over every batch in ``loader``.

    The model is switched to eval mode and run without gradients; predictions
    and ``data.y`` targets from all batches are stacked row-wise before the
    metrics are computed. ``criterion`` is accepted but not used.

    Returns:
        Tuple ``(mae, mse, rmse, r2)``.
    """
    model.eval()
    pred_chunks, target_chunks = [], []
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            output = model(batch.x, batch.edge_index, batch.batch)
            pred_chunks.append(output.cpu().numpy())
            target_chunks.append(batch.y.cpu().numpy())

    y_pred = np.vstack(pred_chunks)
    y_true = np.vstack(target_chunks)
    mse = mean_squared_error(y_true, y_pred)
    return (
        mean_absolute_error(y_true, y_pred),
        mse,
        np.sqrt(mse),
        r2_score(y_true, y_pred),
    )

# ============================
# 6. Cross-Validation and Hyperparameter Search
# ============================

# Initialize KFold
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Initialize variables to store the best results
best_result = None
best_val_r2 = -np.inf

# Initialize a list to store all results
all_results = []

# Fold-independent inputs: built once instead of once per fold and combination.
X = data_tensor[:, :-1].numpy()
y = data_tensor[:, -1].numpy()
num_nodes = data_tensor.size(0)
data = Data(x=data_tensor[:, :-1], edge_index=edge_index, y=data_tensor[:, -1].unsqueeze(1)).to(device)
# NOTE(review): edge_index is used as given; confirm its indices refer to rows of data_tensor.

# The loss and metrics index the model output with per-node masks, so the
# model must return one prediction per node. An all-zero batch vector pools
# every node into a single [1, 1] output, which cannot be indexed by an
# [N] mask. Giving each node its own pooling group keeps the output at [N, 1].
# (With single-node groups the mean/sum/max aggregations coincide.)
node_batch = torch.arange(num_nodes, dtype=torch.long, device=device)

optimizer_classes = {
    'adam': torch.optim.Adam,
    'sgd': torch.optim.SGD,
    'rmsprop': torch.optim.RMSprop,
}

metric_names = (
    'train_mae', 'train_mse', 'train_rmse', 'train_r2',
    'val_mae', 'val_mse', 'val_rmse', 'val_r2',
)

print("Starting randomized search with five-fold cross-validation...\n")
for idx, params in enumerate(tqdm(param_list, desc="Hyperparameter combinations")):
    fold_metrics = {name: [] for name in metric_names}

    # Perform K-Fold cross-validation
    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        # Create node masks for this fold
        train_mask = torch.zeros(num_nodes, dtype=torch.bool)
        val_mask = torch.zeros(num_nodes, dtype=torch.bool)
        train_mask[train_idx] = True
        val_mask[val_idx] = True
        data.train_mask = train_mask.to(device)
        data.val_mask = val_mask.to(device)

        # Initialize the model
        model = FlexibleGNN(
            input_dim=X.shape[1],
            hidden_layers=params['hidden_layer_sizes'],
            dropout_rate=params['dropout_rate'],
            activation=params['activation'],
            num_layers=params['num_layers'],
            aggregation_type=params['aggregation_type']
        ).to(device)

        # Define optimizer
        if params['optimizer'] not in optimizer_classes:
            raise ValueError(f"Unsupported optimizer: {params['optimizer']}")
        optimizer = optimizer_classes[params['optimizer']](
            model.parameters(),
            lr=params['learning_rate'],
            weight_decay=params['weight_decay'],
        )

        # Define learning rate scheduler
        if params['learning_rate_scheduler'] == 'constant':
            scheduler = None
        elif params['learning_rate_scheduler'] == 'step':
            scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1)
        elif params['learning_rate_scheduler'] == 'cosine':
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=params['num_epochs'])
        else:
            raise ValueError(f"Unsupported learning rate scheduler: {params['learning_rate_scheduler']}")

        # Define loss function
        criterion = torch.nn.MSELoss()

        # Training loop: the loss only sees the training nodes of this fold
        model.train()
        for epoch in range(params['num_epochs']):
            optimizer.zero_grad()
            out = model(data.x, data.edge_index, node_batch)
            loss = criterion(out[data.train_mask], data.y[data.train_mask])
            loss.backward()
            optimizer.step()
            if scheduler is not None:
                scheduler.step()

        # Score the trained weights with dropout disabled. Reusing the last
        # training-mode output would include dropout noise and miss the final
        # optimizer step.
        model.eval()
        with torch.no_grad():
            out = model(data.x, data.edge_index, node_batch)

        for split, mask in (('train', data.train_mask), ('val', data.val_mask)):
            split_preds = out[mask].cpu().numpy()
            split_targets = data.y[mask].cpu().numpy()
            split_mse = mean_squared_error(split_targets, split_preds)
            fold_metrics[f'{split}_mae'].append(mean_absolute_error(split_targets, split_preds))
            fold_metrics[f'{split}_mse'].append(split_mse)
            fold_metrics[f'{split}_rmse'].append(np.sqrt(split_mse))
            fold_metrics[f'{split}_r2'].append(r2_score(split_targets, split_preds))

    # Aggregate metrics across folds
    avg_metrics = {metric: np.mean(values) for metric, values in fold_metrics.items()}
    avg_metrics['params'] = params
    all_results.append(avg_metrics)

    # Update best result based on validation R2
    if avg_metrics['val_r2'] > best_val_r2:
        best_val_r2 = avg_metrics['val_r2']
        best_result = avg_metrics

# ============================
# 7. Results Analysis
# ============================

# One row per hyperparameter combination, columns are the fold-averaged metrics
results_df = pd.DataFrame(all_results)

# Rank by average validation R² (best first) and keep the ten leading rows
ranked_results = results_df.sort_values('val_r2', ascending=False)
top_results = ranked_results.head(10)
print("\nTop hyperparameter combinations based on average validation R²:")
print(top_results.loc[:, ['params', 'val_r2']])

# Persist the complete search table for later inspection
results_df.to_excel('EcoGNNRandom_Model_AllResults.xlsx', index=False)
print("\nAll hyperparameter search results saved to 'EcoGNNRandom_Model_AllResults.xlsx'")

# Persist the ten leading rows separately
top_results.to_excel('EcoGNNRandom_Model_Top10Results.xlsx', index=False)
print("Top 10 hyperparameter combinations have been saved to 'EcoGNNRandom_Model_Top10Results.xlsx'")

# Hyperparameters of the combination tracked as best during the search
best_params = best_result['params']
print(f"\nBest Hyperparameters: {best_params}")

# ============================
# 8. Final Model Training (Optional)
# ============================

# Optionally, retrain the model on the entire dataset using the best hyperparameters
# Note: This step is optional and depends on whether you need a final model for deployment

# Initialize the final model
final_model = FlexibleGNN(
    input_dim=data_tensor.size(1) - 1,  # Number of features
    hidden_layers=best_params['hidden_layer_sizes'],
    dropout_rate=best_params['dropout_rate'],
    activation=best_params['activation'],
    num_layers=best_params['num_layers'],
    aggregation_type=best_params['aggregation_type']
).to(device)

# Define optimizer
if best_params['optimizer'] == 'adam':
    optimizer = torch.optim.Adam(final_model.parameters(), lr=best_params['learning_rate'], weight_decay=best_params['weight_decay'])
elif best_params['optimizer'] == 'sgd':
    optimizer = torch.optim.SGD(final_model.parameters(), lr=best_params['learning_rate'], weight_decay=best_params['weight_decay'])
elif best_params['optimizer'] == 'rmsprop':
    optimizer = torch.optim.RMSprop(final_model.parameters(), lr=best_params['learning_rate'], weight_decay=best_params['weight_decay'])
else:
    raise ValueError(f"Unsupported optimizer: {best_params['optimizer']}")

# Define learning rate scheduler
if best_params['learning_rate_scheduler'] == 'constant':
    scheduler = None
elif best_params['learning_rate_scheduler'] == 'step':
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1)
elif best_params['learning_rate_scheduler'] == 'cosine':
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=best_params['num_epochs'])
else:
    raise ValueError(f"Unsupported learning rate scheduler: {best_params['learning_rate_scheduler']}")

# Define loss function
criterion = torch.nn.MSELoss()

# Create a single Data object for the entire dataset
full_data = Data(x=data_tensor[:, :-1], edge_index=edge_index, y=data_tensor[:, -1].unsqueeze(1)).to(device)

# Create DataLoader (not consumed below; the graph is fed to the model directly)
final_loader = DataLoader([full_data], batch_size=1, shuffle=True)

# The targets are per node ([N, 1]). An all-zero batch vector pools the whole
# graph into one [1, 1] prediction, which MSELoss would silently broadcast
# against every target and which cannot be scored against N targets.
# One pooling group per node keeps the output at [N, 1].
full_batch = torch.arange(full_data.x.size(0), dtype=torch.long, device=device)

# Training loop for the final model
final_model.train()
for epoch in range(best_params['num_epochs']):
    optimizer.zero_grad()
    out = final_model(full_data.x, full_data.edge_index, full_batch)
    loss = criterion(out, full_data.y)
    loss.backward()
    optimizer.step()
    if scheduler is not None:
        scheduler.step()
    if (epoch + 1) % 10 == 0 or epoch == 0:
        print(f"Epoch {epoch+1}/{best_params['num_epochs']}, Loss: {loss.item():.4f}")

# ============================
# 9. Final Evaluation
# ============================

# The metrics below are computed on the same rows the final model was trained
# on, so they describe fit quality, not generalization; the cross-validation
# averages from the search remain the out-of-sample estimate.

final_model.eval()
with torch.no_grad():
    out = final_model(full_data.x, full_data.edge_index, full_batch)
preds = out.cpu().numpy()
targets = full_data.y.cpu().numpy()
final_mae = mean_absolute_error(targets, preds)
final_mse = mean_squared_error(targets, preds)
final_rmse = np.sqrt(final_mse)
final_r2 = r2_score(targets, preds)

print("\nFinal Model Performance on Entire Dataset:")
print(f"MAE: {final_mae:.4f}, MSE: {final_mse:.4f}, RMSE: {final_rmse:.4f}, R²: {final_r2:.4f}")

# ============================
# 10. Save Best Hyperparameters and Results
# ============================

# Collect the winning hyperparameters and the final-model metrics as one-row columns
summary_columns = [
    ('Best Params', best_params),
    ('Final MAE', final_mae),
    ('Final MSE', final_mse),
    ('Final RMSE', final_rmse),
    ('Final R²', final_r2),
]
results_to_save = {column: [value] for column, value in summary_columns}

# Single-row summary table
results_summary = pd.DataFrame(results_to_save)

# Write the summary to Excel
results_summary.to_excel('EcoGNNRandom_Model_BestResults.xlsx', index=False)
print("\nBest model results saved to 'EcoGNNRandom_Model_BestResults.xlsx'")


In [None]:
import pandas as pd
import numpy as np
from scipy.integrate import solve_ivp
from deap import base, creator, tools, algorithms
import random
import multiprocessing

# Define the SewerX function representing the mechanistic model
def SewerX(t, y, kHAC, kSRBAC, kMPBAC):
    """Right-hand side of the SewerX mechanistic ODE system.

    The derivative of each of the 17 state variables is the stoichiometric
    matrix (processes x states, transposed) multiplied by the vector of 16
    process rates.

    Args:
        t: Time; not used by the equations (required by the ODE solver API).
        y: Sequence of the 17 state values in the order
            SF, Spro, Sac, Sh, SCH4, SSO4, SH2S, Se, XS, XI, Xaci, Xace,
            XMAac, XMAh, XSRBpr, XSRBac, XSRBh.
        kHAC: Used as the rate constant ``kMAac``.
        kSRBAC: Used as the rate constant ``kMAh``.
        kMPBAC: Used as the rate constant ``kSRBpro``.

    Returns:
        List of 17 derivatives, in the same order as ``y``.

    NOTE(review): the parameter names do not obviously match the rate
    constants they are assigned to (e.g. ``kSRBAC`` -> ``kMAh``); confirm the
    intended mapping.
    """
    # Unpack state variables
    SF, Spro, Sac, Sh, SCH4, SSO4, SH2S, Se, XS, XI, Xaci, Xace, XMAac, XMAh, XSRBpr, XSRBac, XSRBh = y

    # Define constants
    khydro = 3
    kaci = 6
    kace = 6
    KF = 10
    # Use optimization parameters
    kMAac = kHAC
    KMAac = 210
    kMAh = kSRBAC
    KMAh = 0.1
    kSRBpro = kMPBAC
    KSRBpro = 110
    kSRBac = 7.1
    KSRBac = 220
    kSRBh = 26.7
    KSRBh = 0.1
    KSO4 = 1.8
    kh2s = 1
    kdecaci = 0.02
    kdecace = 0.02
    kdecMAac = 0.015
    kdecMAh = 0.01
    kdecSRBpro = 0.010
    kdecSRBac = 0.015
    kdecSRBh = 0.01
    f1 = 0.78
    f2 = 0.22
    f3 = 0.67
    f4 = 0.33
    n = 1.65
    Yaci = 0.1
    Yace = 0.06
    YMAac = 0.0317
    YMAh = 0.0403
    YSRBac = 0.0329
    YSRBpro = 0.0342
    YSRBh = 0.0366

    # Stoichiometric matrix: one row per process, one column per state.
    # Transposed so that row i holds the coefficients of state i.
    A = np.array([
        [1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0],
        [-1, f1*(1-Yaci), f2*(1-Yaci), 0, 0, 0, 0, 0, 0, 0, Yaci, 0, 0, 0, 0, 0, 0],
        [-1, 0, f3*(1-Yace), f4*(1-Yace), 0, 0, 0, 0, 0, 0, 0, Yace, 0, 0, 0, 0, 0],
        [0, 0, -1, 0, 1-YMAac, 0, 0, 0, 0, 0, 0, 0, YMAac, 0, 0, 0, 0],
        [0, 0, 0, -1, 1-YMAh, 0, 0, 0, 0, 0, 0, 0, 0, YMAh, 0, 0, 0],
        [0, -1, 0, 0, 0, (YSRBpro-1)/2, (1-YSRBpro)/2, 0, 0, 0, 0, 0, 0, 0, YSRBpro, 0, 0],
        [0, 0, -1, 0, 0, (YSRBac-1)/2, (1-YSRBac)/2, 0, 0, 0, 0, 0, 0, 0, 0, YSRBac, 0],
        [0, 0, 0, -1, 0, (YSRBh-1)/2, (1-YSRBh)/2, 0, 0, 0, 0, 0, 0, 0, 0, 0, YSRBh],
        [0, 0, 0, 0, 0, 0, -1, -n, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0.9, 0.1, -1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0.9, 0.1, 0, -1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0.9, 0.1, 0, 0, -1, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0.9, 0.1, 0, 0, 0, -1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0.9, 0.1, 0, 0, 0, 0, -1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0.9, 0.1, 0, 0, 0, 0, 0, -1, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0.9, 0.1, 0, 0, 0, 0, 0, 0, -1]
    ]).transpose()

    # Process rate vector, one entry per row of the stoichiometric matrix
    B = [
        khydro * XS,
        kaci * SF / (KF + SF) * Xaci,
        kace * SF / (KF + SF) * Xace,
        kMAac * Sac / (KMAac + Sac) * XMAac,
        kMAh * Sh / (KMAh + Sh) * XMAh,
        # Sulfate saturation term is SSO4 / (KSO4 + SSO4), matching the two
        # SRB rates below (the denominator previously read KSO4 + KSO4).
        kSRBpro * Spro * SSO4 * XSRBpr / (KSRBpro + Spro) / (KSO4 + SSO4),
        kSRBac * Sac * SSO4 * XSRBac / (KSRBac + Sac) / (KSO4 + SSO4),
        kSRBh * Sh * SSO4 * XSRBh / (KSRBh + Sh) / (KSO4 + SSO4),
        kh2s * SH2S * Se,
        kdecaci * Xaci,
        kdecace * Xace,
        kdecMAac * XMAac,
        kdecMAh * XMAh,
        kdecSRBpro * XSRBpr,
        kdecSRBac * XSRBac,
        kdecSRBh * XSRBh
    ]

    # One matrix-vector product gives all 17 derivatives
    return list(A @ np.asarray(B, dtype=float))

# Load the datasets
influent_data = pd.read_csv("influent.csv")
position1_data = pd.read_csv("position1.csv")
position2_data = pd.read_csv("position2.csv")
position3_data = pd.read_csv("position3.csv")
position4_data = pd.read_csv("position4.csv")

# Ensure all datasets have the same number of samples.
# Explicit exceptions are used instead of `assert` so the check still runs
# when Python is started with -O.
num_samples = len(influent_data)
for position_label, position_frame in (
    ("Position1", position1_data),
    ("Position2", position2_data),
    ("Position3", position3_data),
    ("Position4", position4_data),
):
    if len(position_frame) != num_samples:
        raise ValueError(f"{position_label} data rows do not match influent data.")

# Define the reaction times corresponding to each position
reaction_times = [0.5, 1.0, 1.5, 2.0]

# Define the fitness function to evaluate the model's performance
def evaluate_model(individual):
    """
    Score one candidate parameter set against the observed position data.

    For every sample (row of ``influent_data``) the ODE system ``SewerX`` is
    integrated with ``solve_ivp`` starting from that row. The state at each
    entry of ``reaction_times`` is compared with the same row of the matching
    position dataset, and the per-comparison mean squared errors are summed
    over all positions and samples.

    Parameters:
    - individual: A sequence of three values [kHAC, kSRBAC, kMPBAC] to be optimized.

    Returns:
    - A one-element tuple containing the total MSE (the shape DEAP expects for
      a single-objective fitness). The penalty value (1e6,) is returned instead
      when the solver raises, reports failure, or the total is not finite.
    """
    penalty = (1e6,)
    kHAC, kSRBAC, kMPBAC = individual

    # Observed datasets, in the same order as reaction_times
    observed_by_position = (
        position1_data,
        position2_data,
        position3_data,
        position4_data,
    )

    # Integrate up to the last requested time so t_eval always lies inside
    # t_span, even if reaction_times is edited later.
    t_span = (0, max(reaction_times))
    total_mse = 0.0

    for index in range(num_samples):
        # Initial conditions for the current sample come from the influent data
        y0 = influent_data.iloc[index].values

        # Solve the ODEs using the SewerX function with the current parameters
        try:
            sol = solve_ivp(
                SewerX,
                t_span,
                y0,
                args=(kHAC, kSRBAC, kMPBAC),
                t_eval=reaction_times,
                method='LSODA'
            )
        except Exception:
            # Deliberately broad: any solver failure just makes this
            # candidate unattractive instead of aborting the whole search.
            return penalty

        if not sol.success:
            # Assign a large error if the solver did not succeed
            return penalty

        # sol.y has one column per evaluation time; transposing yields one row
        # per time point, which is paired with the dataset for that position.
        for model_output, observed_frame in zip(sol.y.T, observed_by_position):
            observed = observed_frame.iloc[index].values
            total_mse += np.mean((model_output - observed) ** 2)

    # A NaN/inf fitness cannot be ordered reliably by the selection operator,
    # so treat it like a failed evaluation.
    if not np.isfinite(total_mse):
        return penalty

    return (total_mse,)

# Setup the Genetic Algorithm using DEAP
# Define the fitness as minimizing the total MSE (negative weight = minimize)
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
# Define an individual as a list with fitness attribute
creator.create("Individual", list, fitness=creator.FitnessMin)

# Initialize the toolbox
toolbox = base.Toolbox()

# Define the parameter ranges for kHAC, kSRBAC, and kMPBAC
kHAC_min, kHAC_max = 0.1, 20.0
kSRBAC_min, kSRBAC_max = 0.1, 20.0
kMPBAC_min, kMPBAC_max = 0.1, 20.0

# Register the attribute generators for each parameter
toolbox.register("attr_kHAC", random.uniform, kHAC_min, kHAC_max)
toolbox.register("attr_kSRBAC", random.uniform, kSRBAC_min, kSRBAC_max)
toolbox.register("attr_kMPBAC", random.uniform, kMPBAC_min, kMPBAC_max)

# Register the individual and population generators
# (initCycle with n=1 calls each attribute generator once, in order)
toolbox.register("individual", tools.initCycle, creator.Individual,
                 (toolbox.attr_kHAC, toolbox.attr_kSRBAC, toolbox.attr_kMPBAC), n=1)
toolbox.register("population", tools.initPopulation, list, toolbox.individual)

# Register the evaluation function
toolbox.register("evaluate", evaluate_model)


def _clamp_to_bounds(lower, upper):
    """
    Build a decorator that keeps variation operators inside the search box.

    Parameters:
    - lower: Per-gene lower bounds, in gene order.
    - upper: Per-gene upper bounds, in gene order.

    Returns:
    - A decorator for operators that return an iterable of individuals
      (as cxBlend and mutGaussian do). After the wrapped operator runs, gene i
      of every returned individual is clipped in place to [lower[i], upper[i]].
    """
    def decorator(operator):
        def bounded(*args, **kwargs):
            offspring = operator(*args, **kwargs)
            for child in offspring:
                for i, (low, high) in enumerate(zip(lower, upper)):
                    child[i] = min(max(child[i], low), high)
            return offspring
        return bounded
    return decorator


# Register the genetic operators
toolbox.register("mate", tools.cxBlend, alpha=0.5)
toolbox.register("mutate", tools.mutGaussian, mu=0, sigma=1.0, indpb=0.2)
toolbox.register("select", tools.selTournament, tournsize=3)

# cxBlend and mutGaussian are unbounded: without clamping they can move a gene
# outside the ranges declared above (including to negative values), so the
# initial ranges would only constrain the first generation.
_lower_bounds = (kHAC_min, kSRBAC_min, kMPBAC_min)
_upper_bounds = (kHAC_max, kSRBAC_max, kMPBAC_max)
toolbox.decorate("mate", _clamp_to_bounds(_lower_bounds, _upper_bounds))
toolbox.decorate("mutate", _clamp_to_bounds(_lower_bounds, _upper_bounds))

def main(results_path="C:/Users/Van/Desktop/ga_results.txt"):
    """
    Run the Genetic Algorithm for parameter optimization and save the result.

    Each generation applies tournament selection, crossover and mutation,
    re-evaluates only the individuals whose fitness was invalidated, and
    replaces the whole population with the offspring. The run stops early when
    the best fitness has not improved by more than a threshold for `patience`
    consecutive generations.

    Because the population is fully replaced each generation (no elitism), the
    best individual seen over the whole run is tracked separately and is the
    one that gets reported.

    Parameters:
    - results_path: File the best individual and its fitness are written to.

    Returns:
    - None. The result is printed and written to `results_path`.
    """
    # GA parameters
    population_size = 50
    max_generations = 100  # Upper bound; early stopping usually ends sooner
    crossover_prob = 0.7    # Probability of mating
    mutation_prob = 0.2     # Probability of mutation
    patience = 10           # Number of generations to wait for improvement
    improvement_threshold = 1e-6  # Minimum improvement to reset patience

    # Initialize the population
    pop = toolbox.population(n=population_size)

    # Set up multiprocessing pool for parallel evaluation
    pool = multiprocessing.Pool()
    toolbox.register("map", pool.map)

    # Initialize tracking variables for early stopping
    best_fitness = None
    no_improve_count = 0

    try:
        # Evaluate the initial population so the first tournament selection
        # compares real fitness values instead of empty ones.
        for ind, fit in zip(pop, toolbox.map(toolbox.evaluate, pop)):
            ind.fitness.values = fit

        # Best individual seen so far; cloned so later variation cannot touch it
        best_individual = toolbox.clone(tools.selBest(pop, 1)[0])

        # Evolutionary loop
        for gen in range(max_generations):
            # Select the next generation individuals
            offspring = toolbox.select(pop, len(pop))
            # Clone the selected individuals (selection returns references)
            offspring = list(map(toolbox.clone, offspring))

            # Apply crossover on the offspring
            for child1, child2 in zip(offspring[::2], offspring[1::2]):
                if random.random() < crossover_prob:
                    toolbox.mate(child1, child2)
                    # Invalidate fitness values after crossover
                    del child1.fitness.values
                    del child2.fitness.values

            # Apply mutation on the offspring
            for mutant in offspring:
                if random.random() < mutation_prob:
                    toolbox.mutate(mutant)
                    # Invalidate fitness values after mutation
                    del mutant.fitness.values

            # Evaluate the individuals with invalid fitness
            invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
            fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
            for ind, fit in zip(invalid_ind, fitnesses):
                ind.fitness.values = fit

            # Replace the old population with the new offspring
            pop[:] = offspring

            # Gather all the fitnesses in the population
            fits = [ind.fitness.values[0] for ind in pop]
            min_fit = min(fits)
            avg_fit = np.mean(fits)
            max_fit = max(fits)

            # Print the statistics for the current generation
            print(f"Generation {gen+1}: Min Fitness = {min_fit}, Avg Fitness = {avg_fit}, Max Fitness = {max_fit}")

            # Remember the best individual of the whole run, however small the gain
            gen_best = tools.selBest(pop, 1)[0]
            if gen_best.fitness.values[0] < best_individual.fitness.values[0]:
                best_individual = toolbox.clone(gen_best)

            # Check for improvement (only gains above the threshold reset patience)
            if best_fitness is None or best_fitness - min_fit > improvement_threshold:
                best_fitness = min_fit
                no_improve_count = 0
            else:
                no_improve_count += 1
                print(f"No significant improvement. Patience count: {no_improve_count}/{patience}")

            # Check if early stopping condition is met
            if no_improve_count >= patience:
                print(f"No improvement in the last {patience} generations. Stopping early.")
                break
    finally:
        # Always release the worker processes, even if an evaluation raised
        pool.close()
        pool.join()

    # Report the best individual of the whole run, not just of the last generation
    best_ind = best_individual
    print(f"Best Individual: {best_ind}")
    print(f"Best Fitness (Total MSE): {best_ind.fitness.values[0]}")

    # Save the best results to a text file
    with open(results_path, "w") as f:
        f.write(f"Best Individual: {best_ind}\n")
        f.write(f"Best Fitness (Total MSE): {best_ind.fitness.values[0]}\n")

# Run the optimization only when this file is executed as a script.
# main() creates a multiprocessing.Pool; with the "spawn" start method the
# worker processes re-import this module, and this guard keeps them from
# calling main() again.
if __name__ == "__main__":
    main()
