# Notebook cell: In [None]
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold, ParameterSampler
from sklearn.preprocessing import StandardScaler
from torch_geometric.data import Data, DataLoader
# PyG exposes sum pooling as `global_add_pool`; it is imported under the name
# `global_sum_pool` so every existing reference to that name keeps working.
from torch_geometric.nn import (
    GATConv,
    GNNExplainer,
    global_add_pool as global_sum_pool,
    global_max_pool,
    global_mean_pool,
)
from tqdm import tqdm

# ============================
# 1. Setup and Configuration
# ============================

# Set random seeds for reproducibility
def set_seed(seed):
    """Seed the PyTorch, NumPy and Python RNGs with the same value.

    Args:
        seed: Seed value handed unchanged to every generator.
    """
    for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
        seed_fn(seed)
    # The CUDA generator is only seeded when a GPU is actually available.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)

# Seed all RNGs once, before any sampling or model construction happens.
set_seed(42)

# Candidate values for every tunable hyperparameter.
search_space = dict(
    hidden_layer_sizes=[(64,), (128,), (64, 64), (128, 128), (64, 128, 64)],
    learning_rate=[0.001, 0.01, 0.1],
    num_epochs=[50, 100, 200],
    batch_size=[16, 32, 64, 128],
    dropout_rate=[0.0, 0.2, 0.5],
    weight_decay=[0.0, 0.0001, 0.001, 0.01],
    activation=['relu', 'tanh', 'sigmoid'],
    num_layers=[2, 3, 4],
    aggregation_type=['mean', 'sum', 'max'],
    optimizer=['adam', 'sgd', 'rmsprop'],
    learning_rate_scheduler=['constant', 'step', 'cosine'],
)

# How many random draws to take from the search space
# (adjust based on computational resources).
n_iter = 100

# Materialise the sampled hyperparameter combinations.
param_list = [*ParameterSampler(search_space, n_iter=n_iter, random_state=42)]

# Number of folds for cross-validation.
num_folds = 5

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# ============================
# 2. Data Loading and Processing
# ============================

# Feature names, in the column order the rest of the script relies on.
# The last entry ('SCH4') ends up as the last tensor column, which the
# training code below uses as the regression target.
features = ['ORP', 'V', 'DO', 'pH', 'SF', 'Spro', 'Sac', 'Sh', 'SSO4', 'SH2S', 'XS', 'SCH4']
node_mapping = {feat: idx for idx, feat in enumerate(features)}

# Define causal relationships (edges)
relationships = [
    ('ORP', 'SF'), ('ORP', 'Spro'), ('ORP', 'Sac'), ('ORP', 'Sh'),
    ('V', 'Sac'), ('V', 'Sh'), ('V', 'SSO4'), ('V', 'SH2S'), ('V', 'XS'), ('V', 'SCH4'),
    ('DO', 'SF'), ('DO', 'Spro'), ('DO', 'Sac'), ('DO', 'Sh'), ('DO', 'SH2S'), ('DO', 'XS'), ('DO', 'SCH4'),
    ('pH', 'SF'), ('pH', 'Spro'), ('pH', 'Sac'), ('pH', 'Sh'), ('pH', 'SSO4'), ('pH', 'XS'),
    ('SF', 'Spro'), ('SF', 'Sac'), ('Sac', 'SH2S'), ('SSO4', 'SH2S'), ('Sh', 'SH2S'),
    ('XS', 'SF'), ('SH2S', 'SCH4'), ('Sac', 'SCH4'), ('Sh', 'SCH4'), ('SF', 'Sh')
]

# Generate edge_index tensor (shape [2, num_edges]) from the name pairs.
edge_index = torch.tensor([[node_mapping[src], node_mapping[dst]] for src, dst in relationships], dtype=torch.long).t().contiguous()

# Read the main dataset file
data_path = 'path_to_your_data.csv'  # Replace with your actual file path
data_df = pd.read_csv(data_path).dropna()

# Fail early, with a readable message, when the CSV does not carry the
# expected columns instead of silently using whatever columns it has.
missing_columns = [feat for feat in features if feat not in data_df.columns]
if missing_columns:
    raise ValueError(f"The following features are missing from '{data_path}': {missing_columns}")

# Keep only the known columns, in `features` order, so the target is
# guaranteed to be the last column and stray CSV columns are ignored.
data_df = data_df[features]

# Convert DataFrame to tensor
data_tensor = torch.tensor(data_df.values, dtype=torch.float32)

# NOTE(review): the edges index positions in `features`, while the graph built
# further down uses one node per data row — confirm this is the intended graph.
# Either way every edge endpoint must be a valid node id, so check it here.
if data_tensor.size(0) <= int(edge_index.max()):
    raise ValueError(
        f"Dataset has {data_tensor.size(0)} rows but edge_index references node id {int(edge_index.max())}."
    )

# ============================
# 3. Model Definition
# ============================

class FlexibleGNN(torch.nn.Module):
    """Configurable stack of GATConv layers followed by a linear regression head.

    Every GAT layer except the last concatenates its attention heads; the last
    one averages them (``concat=False``). The output of the GAT stack is
    optionally pooled per graph and then mapped to a single value per row by a
    linear layer.

    Args:
        input_dim: Number of input features per node.
        hidden_layers: Sequence of per-head output sizes. Layer ``i`` uses
            ``hidden_layers[i]``; when ``num_layers`` exceeds the sequence
            length the last size is reused, and surplus entries are ignored.
        dropout_rate: Dropout probability used inside GATConv and after each
            activation.
        activation: One of ``'relu'``, ``'tanh'``, ``'sigmoid'``.
        num_layers: Number of GAT layers to stack.
        aggregation_type: One of ``'mean'``, ``'sum'``, ``'max'``.
        heads: Attention heads per GAT layer (defaults to the previously
            hard-coded 8).

    Raises:
        ValueError: On an unsupported activation/aggregation name, or when
            ``hidden_layers`` is empty while ``num_layers`` is positive.
    """

    def __init__(self, input_dim, hidden_layers, dropout_rate, activation, num_layers, aggregation_type, heads=8):
        super().__init__()
        self.num_layers = num_layers
        self.aggregation_type = aggregation_type
        self.activation = activation
        self.dropout_rate = dropout_rate

        # Activation function
        activation_fns = {'relu': torch.relu, 'tanh': torch.tanh, 'sigmoid': torch.sigmoid}
        if activation not in activation_fns:
            raise ValueError(f"Unsupported activation: {activation}")
        self.activation_fn = activation_fns[activation]

        # Aggregation function
        pooling_fns = {'mean': global_mean_pool, 'sum': global_sum_pool, 'max': global_max_pool}
        if aggregation_type not in pooling_fns:
            raise ValueError(f"Unsupported aggregation type: {aggregation_type}")
        self.agg_fn = pooling_fns[aggregation_type]

        # An empty size list would otherwise fail below with a bare IndexError.
        if num_layers > 0 and len(hidden_layers) == 0:
            raise ValueError("hidden_layers must contain at least one layer size")

        # Define GAT layers
        self.gat_layers = torch.nn.ModuleList()
        prev_dim = input_dim
        for i in range(num_layers):
            out_dim = hidden_layers[i] if i < len(hidden_layers) else hidden_layers[-1]
            concat = i < num_layers - 1  # Don't concatenate in the last layer
            self.gat_layers.append(GATConv(prev_dim, out_dim, heads=heads, concat=concat, dropout=dropout_rate))
            # Concatenated heads widen the next layer's input by `heads`.
            prev_dim = out_dim * heads if concat else out_dim

        # Define a fully connected layer for regression
        self.fc = torch.nn.Linear(prev_dim, 1)

        # Dropout layer
        self.dropout = torch.nn.Dropout(p=dropout_rate)

    def forward(self, x, edge_index, batch=None):
        """Run the GAT stack and the regression head.

        Args:
            x: Node feature matrix.
            edge_index: Graph connectivity passed to every GAT layer.
            batch: Optional graph-assignment vector. When given, node
                embeddings are pooled per graph with the configured
                aggregation before the linear head. When omitted, pooling is
                skipped and one prediction per node is returned, which is what
                callers that invoke the model as ``model(x, edge_index)`` need.

        Returns:
            Tensor with one column: one row per graph when ``batch`` is given,
            otherwise one row per node.
        """
        for gat in self.gat_layers:
            x = self.dropout(self.activation_fn(gat(x, edge_index)))
        if batch is not None:
            x = self.agg_fn(x, batch)
        return self.fc(x)

# ============================
# 4. Dataset Preparation
# ============================

class GraphDataset(torch.utils.data.Dataset):
    """Dataset that exposes one graph built from a 2-D data tensor.

    Args:
        data_tensor: 2-D tensor; one column is the target, the rest are
            features.
        edge_index: Connectivity tensor stored and attached to the graph.
        target_col: Column index of the target (negative indices allowed).
            All other columns become the feature matrix.

    Raises:
        IndexError: If ``target_col`` is outside the tensor's column range.
    """

    def __init__(self, data_tensor, edge_index, target_col=-1):
        super().__init__()
        num_cols = data_tensor.size(1)
        if not -num_cols <= target_col < num_cols:
            raise IndexError(f"target_col {target_col} is out of range for {num_cols} columns")
        # Normalise negative indices so the target column can be excluded by position.
        target_idx = target_col % num_cols
        feature_idx = [col for col in range(num_cols) if col != target_idx]
        self.x = data_tensor[:, feature_idx]
        self.y = data_tensor[:, target_idx].unsqueeze(1)
        self.edge_index = edge_index

    def __len__(self):
        return 1  # Single graph

    def __getitem__(self, idx):
        """Return the single graph; any index other than 0 / -1 is out of range."""
        # Raising IndexError lets plain `for item in dataset` loops terminate.
        if idx not in (0, -1):
            raise IndexError(f"GraphDataset holds a single graph; index {idx} is out of range")
        return Data(x=self.x, edge_index=self.edge_index, y=self.y)

# Wrap the full table and its edges as a one-graph dataset (default target: last column).
dataset = GraphDataset(data_tensor=data_tensor, edge_index=edge_index)

# ============================
# 5. Evaluation Function
# ============================

def evaluate(model, loader, criterion, device):
    """Compute regression metrics for `model` over every batch in `loader`.

    The model is switched to eval mode and run without gradient tracking.
    Predictions and targets from all batches are stacked before scoring.

    Args:
        model: Callable invoked as ``model(x, edge_index, batch)``.
        loader: Iterable of batches exposing ``x``, ``edge_index``, ``batch``,
            ``y`` and ``to(device)``.
        criterion: Accepted for signature compatibility; not used here.
        device: Device each batch is moved to before the forward pass.

    Returns:
        Dict with the keys ``'mae'``, ``'mse'``, ``'rmse'`` and ``'r2'``.
    """
    model.eval()
    predicted_chunks, target_chunks = [], []
    with torch.no_grad():
        for batch_data in loader:
            batch_data = batch_data.to(device)
            prediction = model(batch_data.x, batch_data.edge_index, batch_data.batch)
            predicted_chunks.append(prediction.cpu().numpy())
            target_chunks.append(batch_data.y.cpu().numpy())

    y_pred = np.vstack(predicted_chunks)
    y_true = np.vstack(target_chunks)

    # RMSE is derived from the MSE, so compute the latter once.
    mse = mean_squared_error(y_true, y_pred)
    return {
        'mae': mean_absolute_error(y_true, y_pred),
        'mse': mse,
        'rmse': np.sqrt(mse),
        'r2': r2_score(y_true, y_pred),
    }

# ============================
# 6. Cross-Validation and Hyperparameter Search
# ============================

# Initialize KFold
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Initialize variables to store the best results
best_result = None
best_val_r2 = -np.inf

# Initialize a list to store all results
all_results = []

# Loop-invariant views of the dataset: every column but the last is a feature,
# the last column is the regression target.
X = data_tensor[:, :-1].numpy()
y = data_tensor[:, -1].numpy()
num_nodes = data_tensor.size(0)

# One graph is shared by every fold and every hyperparameter combination;
# only the train/validation masks change between folds.
data = Data(x=data_tensor[:, :-1], edge_index=edge_index, y=data_tensor[:, -1].unsqueeze(1)).to(device)

# Each node gets its own batch id, so the pooling step inside the model
# aggregates over exactly one node and the model returns one prediction per
# node. Node-level output is required because the loss and the metrics below
# select rows with per-node masks.
# NOTE: with this setup the sampled 'aggregation_type' has no effect on the
# output, and the sampled 'batch_size' is not used (full-graph training).
node_batch = torch.arange(num_nodes, dtype=torch.long, device=device)

optimizer_classes = {
    'adam': torch.optim.Adam,
    'sgd': torch.optim.SGD,
    'rmsprop': torch.optim.RMSprop,
}


def _regression_metrics(targets, preds):
    """Return MAE/MSE/RMSE/R2 for one split, or NaNs if predictions are not finite."""
    if not np.isfinite(preds).all():
        # A diverged run is recorded as NaN instead of aborting the whole search.
        return {'mae': np.nan, 'mse': np.nan, 'rmse': np.nan, 'r2': np.nan}
    mse = mean_squared_error(targets, preds)
    return {
        'mae': mean_absolute_error(targets, preds),
        'mse': mse,
        'rmse': np.sqrt(mse),
        'r2': r2_score(targets, preds),
    }


print("Starting randomized search with five-fold cross-validation...\n")
for params in tqdm(param_list, desc="Hyperparameter combinations"):
    # Keys: train_mae, train_mse, train_rmse, train_r2, val_mae, ... (one list per metric).
    fold_metrics = {
        f'{split}_{name}': []
        for split in ('train', 'val')
        for name in ('mae', 'mse', 'rmse', 'r2')
    }

    # Perform K-Fold cross-validation
    for train_idx_cv, val_idx_cv in kf.split(X):
        # Create boolean node masks for this fold
        train_mask = torch.zeros(num_nodes, dtype=torch.bool)
        val_mask = torch.zeros(num_nodes, dtype=torch.bool)
        train_mask[train_idx_cv] = True
        val_mask[val_idx_cv] = True
        data.train_mask = train_mask.to(device)
        data.val_mask = val_mask.to(device)

        # Initialize a fresh model for every fold
        model = FlexibleGNN(
            input_dim=X.shape[1],
            hidden_layers=params['hidden_layer_sizes'],
            dropout_rate=params['dropout_rate'],
            activation=params['activation'],
            num_layers=params['num_layers'],
            aggregation_type=params['aggregation_type']
        ).to(device)

        # Define optimizer
        optimizer_cls = optimizer_classes.get(params['optimizer'])
        if optimizer_cls is None:
            raise ValueError(f"Unsupported optimizer: {params['optimizer']}")
        optimizer = optimizer_cls(model.parameters(), lr=params['learning_rate'], weight_decay=params['weight_decay'])

        # Define learning rate scheduler
        if params['learning_rate_scheduler'] == 'constant':
            scheduler = None
        elif params['learning_rate_scheduler'] == 'step':
            scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1)
        elif params['learning_rate_scheduler'] == 'cosine':
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=params['num_epochs'])
        else:
            raise ValueError(f"Unsupported learning rate scheduler: {params['learning_rate_scheduler']}")

        # Define loss function
        criterion = torch.nn.MSELoss()

        # Training loop: full-graph forward pass, loss on the training nodes only
        model.train()
        for _ in range(params['num_epochs']):
            optimizer.zero_grad()
            out = model(data.x, data.edge_index, node_batch)
            loss = criterion(out[data.train_mask], data.y[data.train_mask])
            loss.backward()
            optimizer.step()
            if scheduler is not None:
                scheduler.step()

        # Score the *trained* weights with dropout disabled, rather than reusing
        # the output of the last training-mode forward pass.
        model.eval()
        with torch.no_grad():
            out = model(data.x, data.edge_index, node_batch)
        preds_np = out.cpu().numpy()
        targets_np = data.y.cpu().numpy()

        # Evaluate on the training and validation nodes of this fold
        for split, mask in (('train', train_mask), ('val', val_mask)):
            mask_np = mask.numpy()
            split_metrics = _regression_metrics(targets_np[mask_np], preds_np[mask_np])
            for name, value in split_metrics.items():
                fold_metrics[f'{split}_{name}'].append(value)

    # Aggregate metrics across folds
    avg_metrics = {metric: np.mean(values) for metric, values in fold_metrics.items()}
    avg_metrics['params'] = params
    all_results.append(avg_metrics)

    # Update best result based on validation R2 (a NaN average never wins)
    if avg_metrics['val_r2'] > best_val_r2:
        best_val_r2 = avg_metrics['val_r2']
        best_result = avg_metrics

# ============================
# 7. Results Analysis
# ============================

# One row per hyperparameter combination, with its fold-averaged metrics.
results_df = pd.DataFrame(all_results)

# Rank by validation R² (best first) and keep the ten best combinations.
ranked_results = results_df.sort_values(by='val_r2', ascending=False)
top_results = ranked_results.head(10)
print("\nTop hyperparameter combinations based on average validation R²:")
print(top_results[['params', 'val_r2']])

# Persist the complete search log for later analysis.
results_df.to_excel('EcoGNN_Model_AllResults.xlsx', index=False)
print("\nAll hyperparameter search results saved to 'EcoGNN_Model_AllResults.xlsx'")

# Pull the winning configuration out of the best result record.
best_params = best_result['params']
print(f"\nBest Hyperparameters: {best_params}")

# ============================
# 8. Final Model Training (Optional)
# ============================

# Initialize the final model with the best hyperparameters
final_model = FlexibleGNN(
    input_dim=data_tensor.size(1) - 1,  # Number of features
    hidden_layers=best_params['hidden_layer_sizes'],
    dropout_rate=best_params['dropout_rate'],
    activation=best_params['activation'],
    num_layers=best_params['num_layers'],
    aggregation_type=best_params['aggregation_type']
).to(device)

# Define optimizer
final_optimizer_classes = {
    'adam': torch.optim.Adam,
    'sgd': torch.optim.SGD,
    'rmsprop': torch.optim.RMSprop,
}
if best_params['optimizer'] not in final_optimizer_classes:
    raise ValueError(f"Unsupported optimizer: {best_params['optimizer']}")
optimizer = final_optimizer_classes[best_params['optimizer']](
    final_model.parameters(),
    lr=best_params['learning_rate'],
    weight_decay=best_params['weight_decay'],
)

# Define learning rate scheduler
if best_params['learning_rate_scheduler'] == 'constant':
    scheduler = None
elif best_params['learning_rate_scheduler'] == 'step':
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1)
elif best_params['learning_rate_scheduler'] == 'cosine':
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=best_params['num_epochs'])
else:
    raise ValueError(f"Unsupported learning rate scheduler: {best_params['learning_rate_scheduler']}")

# Define loss function
criterion = torch.nn.MSELoss()

# Create a single Data object for the entire dataset
full_data = Data(x=data_tensor[:, :-1], edge_index=edge_index, y=data_tensor[:, -1].unsqueeze(1)).to(device)

# Create DataLoader (kept for compatibility; the loop below trains on the full graph directly)
final_loader = DataLoader([full_data], batch_size=1, shuffle=True)

# Each node gets its own batch id so the model's pooling step aggregates over
# a single node and the output has one row per node. An all-zero batch would
# pool the whole graph into one prediction, which the loss would then
# broadcast against every target. Built once, outside the epoch loop.
final_node_batch = torch.arange(full_data.x.size(0), dtype=torch.long, device=device)

# Training loop for the final model
print("\nTraining the final model with the best hyperparameters...\n")
final_model.train()
for epoch in range(best_params['num_epochs']):
    optimizer.zero_grad()
    out = final_model(full_data.x, full_data.edge_index, final_node_batch)
    loss = criterion(out, full_data.y)
    loss.backward()
    optimizer.step()
    if scheduler is not None:
        scheduler.step()
    # Log the first epoch and then every tenth one.
    if (epoch + 1) % 10 == 0 or epoch == 0:
        print(f"Epoch {epoch+1}/{best_params['num_epochs']}, Loss: {loss.item():.4f}")

# ============================
# 9. Final Evaluation
# ============================

# Evaluate the final model on the entire dataset
final_model.eval()
with torch.no_grad():
    # One batch id per node keeps the output at one prediction per node, so
    # predictions and targets have the same number of rows for the metrics.
    eval_node_batch = torch.arange(full_data.x.size(0), dtype=torch.long, device=device)
    out = final_model(full_data.x, full_data.edge_index, eval_node_batch)
    preds = out.cpu().numpy()
    targets = full_data.y.cpu().numpy()

final_mae = mean_absolute_error(targets, preds)
final_mse = mean_squared_error(targets, preds)
final_rmse = np.sqrt(final_mse)
final_r2 = r2_score(targets, preds)

# NOTE: these scores are measured on the same data the model was trained on.
print("\nFinal Model Performance on Entire Dataset:")
print(f"MAE: {final_mae:.4f}, MSE: {final_mse:.4f}, RMSE: {final_rmse:.4f}, R²: {final_r2:.4f}")

# ============================
# 10. Save Best Hyperparameters and Results
# ============================

# Collect the winning configuration and its final scores as single-row columns.
results_to_save = dict([
    ('Best Params', [best_params]),
    ('Final MAE', [final_mae]),
    ('Final MSE', [final_mse]),
    ('Final RMSE', [final_rmse]),
    ('Final R²', [final_r2]),
])

# One-row summary table, written to its own workbook.
results_summary = pd.DataFrame(results_to_save)
results_summary.to_excel('EcoGNN_Model_BestResults.xlsx', index=False)
print("\nBest model results saved to 'EcoGNN_Model_BestResults.xlsx'")

# ============================
# 11. Explainability Analysis with Additional Dataset
# ============================

# Function to load and preprocess additional data
def load_additional_data(additional_data_path, features, target_col, scaler=None):
    """Load a CSV and return it as a float32 tensor of scaled features plus target.

    Rows containing missing values are dropped. The returned tensor holds the
    feature columns (in the order given by ``features``) followed by the
    target as the last column. If ``target_col`` also appears in ``features``
    it is not duplicated: it is left out of the feature columns and is never
    scaled.

    Args:
        additional_data_path: Path of the CSV file to read.
        features: Names of the feature columns.
        target_col: Name of the target column.
        scaler: Optional object with a ``transform`` method applied to the
            feature columns. When ``None``, a new ``StandardScaler`` is fitted
            on this dataset's features.

    Returns:
        Tuple ``(tensor, scaler)`` with the data tensor and the scaler that was
        used (the one passed in, or the newly fitted one).

    Raises:
        ValueError: If the target column or any feature column is missing.
    """
    # Read the additional dataset
    additional_df = pd.read_csv(additional_data_path).dropna()

    # Check if target column exists
    if target_col not in additional_df.columns:
        raise ValueError(f"Target column '{target_col}' not found in the additional CSV.")

    # Ensure that every requested feature is present
    missing = set(features) - set(additional_df.columns)
    if missing:
        raise ValueError(f"The following features are missing from the additional CSV: {missing}")

    # Selecting the target twice would create duplicate column labels and break
    # the scaling step, so it is kept out of the feature columns.
    feature_cols = [feature for feature in features if feature != target_col]
    feature_frame = additional_df[feature_cols]

    # `is None` rather than truthiness: a scaler object must never be mistaken for "absent".
    if scaler is None:
        scaler = StandardScaler()
        scaled = scaler.fit_transform(feature_frame)
    else:
        scaled = scaler.transform(feature_frame)

    # Assemble [scaled features | target] without writing into a DataFrame slice.
    feature_values = np.asarray(scaled, dtype=np.float32)
    target_values = additional_df[[target_col]].to_numpy(dtype=np.float32)
    additional_data_tensor = torch.tensor(np.hstack([feature_values, target_values]), dtype=torch.float32)

    return additional_data_tensor, scaler

# Define the path to the additional dataset
additional_data_path = 'path_to_additional_data.csv'  # Replace with your actual additional data file path

# 'SCH4' is the target column; it must not also be fed to the model as an
# input feature, so it is removed from the feature list used here.
explain_target_col = 'SCH4'
explain_feature_cols = [feat for feat in features if feat != explain_target_col]


class _IdentityScaler:
    """Pass-through scaler: leaves the feature values unchanged."""

    def transform(self, values):
        return values


class _NodeLevelModel(torch.nn.Module):
    """Adapter exposing `final_model` with a `(x, edge_index)` forward signature."""

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, x, edge_index):
        # One batch id per node keeps the wrapped model's pooling step from
        # merging nodes, so one prediction per node is returned.
        per_node_batch = torch.arange(x.size(0), dtype=torch.long, device=x.device)
        return self.model(x, edge_index, per_node_batch)


# Load and preprocess additional dataset.
# No fitted scaler exists in the visible part of this file and the model is
# trained on unscaled values, so the additional data is passed through
# unscaled as well to stay consistent with training.
additional_data_tensor, _ = load_additional_data(
    additional_data_path, explain_feature_cols, explain_target_col, scaler=_IdentityScaler()
)

# Create a Data object for the additional dataset
additional_data = Data(x=additional_data_tensor[:, :-1],
                       edge_index=edge_index,
                       y=additional_data_tensor[:, -1].unsqueeze(1)).to(device)

# Initialize GNNExplainer with the final_model (wrapped so it can be called
# without a batch vector).
# NOTE(review): GNNExplainer's default settings target classification outputs;
# for this regression model a different `return_type` may be needed — confirm
# against the installed PyTorch Geometric version.
explainer = GNNExplainer(_NodeLevelModel(final_model), epochs=200)

# Initialize arrays to store importance scores
all_edge_importances_additional = []
all_feature_importances_additional = []

# Number of nodes to explain in the additional dataset
num_nodes_additional = additional_data.num_nodes  # Explain all nodes


def _normalize_mask(mask):
    """Replace NaNs with 0 and scale the mask so that its maximum becomes 1."""
    mask = np.nan_to_num(mask)
    peak = np.max(mask)
    # An all-zero mask is returned unchanged to avoid dividing by zero.
    return mask / peak if peak > 0 else mask


print("\nStarting explainability analysis on the additional dataset using GNNExplainer...\n")
for node_idx in tqdm(range(num_nodes_additional), desc="Explaining nodes in additional dataset"):
    # Explain the node
    node_feat_mask, edge_mask = explainer.explain_node(node_idx, additional_data.x, additional_data.edge_index)

    # Convert masks to numpy arrays, clean and normalize them to [0, 1]
    all_edge_importances_additional.append(_normalize_mask(edge_mask.detach().cpu().numpy()))
    all_feature_importances_additional.append(_normalize_mask(node_feat_mask.detach().cpu().numpy()))

# Convert lists to numpy arrays for aggregation
edge_importances_array_additional = np.array(all_edge_importances_additional)
feature_importances_array_additional = np.array(all_feature_importances_additional)

# Aggregate importance scores by averaging across all explained nodes
mean_edge_importance_additional = edge_importances_array_additional.mean(axis=0)
mean_feature_importance_additional = feature_importances_array_additional.mean(axis=0)

# Create DataFrames for easy visualization and saving
edge_importance_df_additional = pd.DataFrame({
    'Source': [src for src, _ in relationships],
    'Target': [dst for _, dst in relationships],
    'Importance': mean_edge_importance_additional
})

# One importance value per model input feature (the target is not an input).
feature_importance_df_additional = pd.DataFrame({
    'Feature': explain_feature_cols,
    'Importance': mean_feature_importance_additional
})

# ============================
# 12. Save Explainability Analysis Results
# ============================

# Write each importance table to its own sheet of a single workbook.
explainability_sheets = {
    'Edge_Importances_Additional': edge_importance_df_additional,
    'Feature_Importances_Additional': feature_importance_df_additional,
}
with pd.ExcelWriter('EcoGNN_Explainability.xlsx') as writer:
    for sheet_name, sheet_frame in explainability_sheets.items():
        sheet_frame.to_excel(writer, sheet_name=sheet_name, index=False)

print("\nExplainability analysis results on the additional dataset saved to 'EcoGNN_Explainability.xlsx'")