In [None]:
# ============================
# 1. Import Necessary Libraries
# ============================

import pandas as pd
import numpy as np
import random
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import joblib  # For saving models
from tqdm import tqdm  # For progress bars

# ============================
# 2. Set Random Seed for Reproducibility
# ============================

def set_seed(seed):
    """
    Seed the global pseudo-random generators used by this script.

    Both NumPy's legacy global generator and Python's built-in ``random``
    module are seeded with the same value so that repeated runs draw the
    same random sequences.

    Parameters
    ----------
    seed : int
        Value handed to ``np.random.seed`` and ``random.seed``.
    """
    for seeder in (np.random.seed, random.seed):
        seeder(seed)

set_seed(42)  # Any fixed integer works; 42 is an arbitrary choice

# ============================
# 3. Load and Inspect Data
# ============================

# Define the path to your CSV data file
data_path = 'path_to_your_data.csv'  # <-- Replace with your actual file path

# Load the dataset into a Pandas DataFrame.
# Only the read itself is guarded, so an unrelated error cannot be mistaken
# for a missing file.
try:
    data_df = pd.read_csv(data_path)
except FileNotFoundError:
    print(f"File not found at the specified path: {data_path}")
    # Stop with a non-zero status so callers/shells see the failure.
    # `exit()` is an interactive-shell helper injected by the `site` module
    # and would have reported success (status 0) here.
    raise SystemExit(1) from None
else:
    print("Dataset loaded successfully.\n")

# Display basic information about the dataset.
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
print("Dataset Information:")
data_df.info()

# Display the first five rows of the dataset
print("\nFirst 5 Rows of the Dataset:")
print(data_df.head())

# ============================
# 4. Data Preprocessing
# ============================

# Assume the last column is the target variable (adjust if necessary);
# every preceding column is treated as a feature.
feature_cols = data_df.columns[:-1]
target_col = data_df.columns[-1]

# Extract features (2-D) and target (1-D) as NumPy arrays
X = data_df[feature_cols].values
y = data_df[target_col].values

# Locate missing values once; the masks are reused for the imputation below
missing_in_X = np.isnan(X)
missing_in_y = np.isnan(y)

if missing_in_X.any() or missing_in_y.any():
    print("\nMissing values detected. Performing imputation...")
    # Per-column means (axis=0) that ignore NaNs. Without the axis argument
    # np.nanmean returns ONE mean over the whole matrix, which would fill every
    # feature with the same value regardless of its scale.
    column_means = np.nanmean(X, axis=0)
    # column_means has one entry per feature and broadcasts across the rows
    X = np.where(missing_in_X, column_means, X)
    y = np.where(missing_in_y, np.nanmean(y), y)
    # NOTE: the means are computed on the full dataset before any
    # cross-validation split, so validation folds influence the imputed values.
    print("Missing values imputed with column means.")
else:
    print("\nNo missing values detected.")

# ============================
# 5. Define Pipeline and Hyperparameter Grid
# ============================

# Two-stage estimator: standardise the features, then fit the neural network.
# The step names ('scaler', 'mlpregressor') are the prefixes used by the
# hyperparameter grid keys, e.g. 'mlpregressor__alpha'.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('mlpregressor', MLPRegressor(max_iter=1000, random_state=42)),
])

# Define the hyperparameter grid for GridSearchCV.
#
# MLPRegressor silently ignores parameters that do not apply to the chosen
# solver, so listing them would only multiply the search with duplicate models:
#   * 'lbfgs' is a full-batch solver: learning_rate, learning_rate_init,
#     batch_size, early_stopping and validation_fraction have no effect.
#   * learning_rate (the schedule) and momentum are used only by 'sgd'.
#   * beta_1, beta_2 and epsilon are used only by 'adam'.
#   * validation_fraction matters only when early_stopping is True.
# Each sub-grid below therefore contains only the parameters that can change
# the fitted model. The set of distinct models searched is unchanged.

# Parameters that affect every solver
_common_grid = {
    'mlpregressor__hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100), (50, 100, 50)],
    'mlpregressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
    'mlpregressor__alpha': [0.0001, 0.001, 0.01, 0.1],
    'mlpregressor__max_iter': [200, 300, 500],
}

# Parameters shared by the two stochastic (mini-batch) solvers
_stochastic_grid = {
    'mlpregressor__learning_rate_init': [0.001, 0.01, 0.1],
    'mlpregressor__batch_size': [32, 64, 128, 256],
}

# Parameters specific to a single stochastic solver
_solver_specific_grids = {
    'sgd': {
        'mlpregressor__learning_rate': ['constant', 'invscaling', 'adaptive'],
        'mlpregressor__momentum': [0.0, 0.5, 0.9],
    },
    'adam': {
        'mlpregressor__beta_1': [0.9, 0.95, 0.99],
        'mlpregressor__beta_2': [0.999, 0.995, 0.99],
        'mlpregressor__epsilon': [1e-8, 1e-7, 1e-6],
    },
}

# validation_fraction is only varied when early stopping is switched on
_early_stopping_grids = [
    {'mlpregressor__early_stopping': [False]},
    {
        'mlpregressor__early_stopping': [True],
        'mlpregressor__validation_fraction': [0.1, 0.2, 0.3],
    },
]

param_grid = [
    {**_common_grid, 'mlpregressor__solver': ['lbfgs']},
] + [
    {
        **_common_grid,
        'mlpregressor__solver': [solver_name],
        **_stochastic_grid,
        **solver_grid,
        **stopping_grid,
    }
    for solver_name, solver_grid in _solver_specific_grids.items()
    for stopping_grid in _early_stopping_grids
]

# ============================
# 6. Five-Fold Cross-Validation and Grid Search
# ============================

# Initialize K-Fold cross-validation with 5 splits.
# Shuffling with a fixed random_state keeps fold membership reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize GridSearchCV with the pipeline and hyperparameter grid
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='r2',          # Using R² as the evaluation metric
    cv=kf,
    n_jobs=-1,             # Utilize all available CPU cores
    verbose=2,             # Verbosity level: 0, 1, or 2
    # Training-fold scores are NOT recorded by default. The results analysis
    # reads 'mean_train_score' / 'std_train_score' from cv_results_, so they
    # must be requested explicitly or that lookup raises KeyError.
    return_train_score=True,
)

print("Starting five-fold cross-validation and grid search...\n")

# Fit GridSearchCV to the data. With the default refit=True the best
# configuration is also retrained on all of (X, y) at the end.
grid_search.fit(X, y)

print("\nGrid search completed.")

# ============================
# 7. Results Analysis and Saving
# ============================

# Tabulate every evaluated configuration from the grid search
results_df = pd.DataFrame(grid_search.cv_results_)

# Single source of truth for both the column selection and the renaming:
# keys are the cv_results_ column names (in output order), values are the
# shorter, readable labels used in the report.
# NOTE: 'mean_train_score' / 'std_train_score' exist in cv_results_ only when
# the search was created with return_train_score=True.
column_labels = {
    'param_mlpregressor__hidden_layer_sizes': 'hidden_layer_sizes',
    'param_mlpregressor__activation': 'activation',
    'param_mlpregressor__solver': 'solver',
    'param_mlpregressor__alpha': 'alpha',
    'param_mlpregressor__learning_rate': 'learning_rate',
    'param_mlpregressor__learning_rate_init': 'learning_rate_init',
    'param_mlpregressor__max_iter': 'max_iter',
    'param_mlpregressor__batch_size': 'batch_size',
    'param_mlpregressor__momentum': 'momentum',
    'param_mlpregressor__beta_1': 'beta_1',
    'param_mlpregressor__beta_2': 'beta_2',
    'param_mlpregressor__epsilon': 'epsilon',
    'param_mlpregressor__early_stopping': 'early_stopping',
    'param_mlpregressor__validation_fraction': 'validation_fraction',
    'mean_test_score': 'mean_val_r2',
    'std_test_score': 'std_val_r2',
    'mean_train_score': 'mean_train_r2',
    'std_train_score': 'std_train_r2',
}
selected_columns = list(column_labels)

# Keep only the columns of interest, then apply the readable labels
results_selected = results_df[selected_columns].rename(columns=column_labels)

# Rank by mean validation R² (best first) and show the ten leading rows
ranked_results = results_selected.sort_values(by='mean_val_r2', ascending=False)
top_results = ranked_results.head(10)
print("\nTop 10 Hyperparameter Combinations Based on Average Validation R²:")
print(top_results)

# Persist the full (unsorted) results table for offline analysis
results_selected.to_excel('ANN_Model_AllResults.xlsx', index=False)
print("\nAll hyperparameter search results have been saved to 'ANN_Model_AllResults.xlsx'.")

# ============================
# 8. Extract Best Hyperparameter Combination
# ============================

# Winning parameter setting found by the search
best_params = grid_search.best_params_
print("\nBest Hyperparameter Combination:", best_params, sep="\n")

# Mean cross-validated R² achieved by that setting
best_score = grid_search.best_score_
print("Best Validation R² Score: " + format(best_score, ".4f"))

# ============================
# 9. Train Final Model with Best Hyperparameters
# ============================

# Take the final model from the search.
# With GridSearchCV's default refit=True, best_estimator_ has already been
# retrained on the entire (X, y) using the best hyperparameters, so calling
# fit() again would only repeat that training run.
final_pipeline = grid_search.best_estimator_

# Make predictions on the entire dataset
y_pred = final_pipeline.predict(X)

# Calculate performance metrics.
# NOTE: these are in-sample figures (the model was trained on the same rows),
# so they are optimistic; grid_search.best_score_ is the cross-validated R².
final_mae = mean_absolute_error(y, y_pred)
final_mse = mean_squared_error(y, y_pred)
final_rmse = np.sqrt(final_mse)
final_r2 = r2_score(y, y_pred)

print("\nFinal Model Performance on the Entire Dataset:")
print(f"MAE: {final_mae:.4f}")
print(f"MSE: {final_mse:.4f}")
print(f"RMSE: {final_rmse:.4f}")
print(f"R²: {final_r2:.4f}")

# ============================
# 10. Save the Best Model and Scaler
# ============================

# Persist the whole fitted pipeline with joblib; because the scaler is one of
# its steps, a single file holds both preprocessing and model.
model_filename = 'ANN_FinalModel_Pipeline.pkl'
joblib.dump(value=final_pipeline, filename=model_filename)
print("\nFinal model pipeline (including scaler) has been saved as '" + model_filename + "'.")

# ============================
# 11. Save Best Hyperparameters and Final Results
# ============================

# Gather the scalar metrics of the final model under their report labels
final_metrics = {
    'Final MAE': final_mae,
    'Final MSE': final_mse,
    'Final RMSE': final_rmse,
    'Final R²': final_r2,
}

# One-row table layout: every column maps to a single-element list, with the
# best hyperparameter dict stored as one cell in the first column
results_to_save = {'Best Params': [best_params]}
results_to_save.update({label: [value] for label, value in final_metrics.items()})

# Build the one-row summary DataFrame
results_summary = pd.DataFrame(results_to_save)

# Write the summary to an Excel workbook (no index column)
summary_filename = 'ANN_Model_BestResults.xlsx'
results_summary.to_excel(summary_filename, index=False)
print("\nBest model results have been saved to '" + summary_filename + "'.")

# ============================
# 12. Conclusion
# ============================

print("\nScript execution completed successfully.")
