# Importing Packages

In [1]:
import gc
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scienceplots
import missingno

import sklearn
from sklearn.model_selection import *
from sklearn.metrics import *
from sklearn.preprocessing import *
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from scipy import stats
from scipy.stats import spearmanr

import tensorflow as tf
import keras_tuner

# Plotting Configuration

In [2]:
# Reset rcParams first so re-running this cell always starts from the same state.
plt.rcdefaults()
# Notebook-wide Matplotlib overrides: figure size/DPI, font sizes, line and
# marker sizes, and the options used when figures are written to disk.
mpl_global_config = {
    'figure.figsize': (7, 7),
    'figure.dpi': 1000,
    'font.size': 16,
    'axes.labelsize': 14,
    'axes.titlesize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 8,
    'lines.linewidth': 2,
    'lines.markersize': 3,
    'grid.linewidth': 0.75,
    'savefig.dpi': 1000,
    'savefig.transparent': False,
    'savefig.bbox': 'tight',
    'pdf.compression': 9,
    'axes.axisbelow': True
}
plt.rcParams.update(mpl_global_config)
# NOTE(review): the style sheets are applied AFTER the rcParams update above, so
# any key they also define presumably replaces the value set in
# mpl_global_config -- confirm this precedence is intended.
plt.style.use(['science', 'nature', 'high-contrast', "no-latex"])


# Named hex colour codes.
colors = {
    "yellow": "#DDAA33",
    "red": "#BB5566",
    "blue": "#004488",
    "black": "#000000",
    "white": "#FFFFFF"
}

# Global Configuration Settings Controlling Randomness, Trials, etc.

In [3]:
# Silence Python warnings and NumPy floating-point underflow reports for the
# rest of the notebook, and make scikit-learn transformers return DataFrames.
warnings.filterwarnings("ignore")
np.seterr(under="ignore")
sklearn.set_config(transform_output="pandas")

# Reproducibility and search-budget constants.
SEED = 42
n_trials = 50
tf.keras.utils.set_random_seed(SEED)

# Reading Imputed and Outlier-Reconstructed Data

In [4]:
# Load the train and test feature tables produced by the imputation /
# outlier-reconstruction step. The first CSV column becomes the index and is
# parsed as dates.
df1, df2 = (
    pd.read_csv(csv_path, index_col=0, parse_dates=True)
    for csv_path in ("X_train_Imputed_ROutlier.csv", "X_test_Imputed_ROutlier.csv")
)

In [5]:
# Stack the two frames row-wise; every row keeps its original index label.
combined_df = pd.concat([df1, df2], axis=0, ignore_index=False)

In [6]:
# Put the rows in ascending index order, then persist the result
# (the index is written as the first CSV column).
sorted_combined_X = combined_df.sort_index(axis=0, ascending=True)
sorted_combined_X.to_csv("sorted_combined_X_Imputed.csv")

In [7]:
# Read the original data set indexed by its "Date" column (day-first dates)
# and keep only the rows where the target was actually measured.
Originaldf = pd.read_csv(
    "OriginalData.csv",
    index_col="Date",
    parse_dates=True,
    dayfirst=True,
).dropna(subset=["Total Biogas Flowrate (m3/d)"])

# Split the target off; pop() also removes the column from Originaldf.
y = Originaldf.pop("Total Biogas Flowrate (m3/d)")

In [8]:
# Attach the target as an extra column; rows are aligned on the index.
preprocessed_df = pd.concat([sorted_combined_X, y], axis="columns")

In [None]:
# Display the (rows, columns) shape of the merged features + target frame.
preprocessed_df.shape

In [None]:
# Count missing values per column after the merge.
preprocessed_df.isna().sum()

In [12]:
# Strip everything up to and including the first double underscore from each
# column name; names without "__" are left unchanged.
preprocessed_df.columns = preprocessed_df.columns.map(
    lambda name: name.split('__', 1)[-1]
)

In [None]:
# Display the merged frame with its shortened column names.
preprocessed_df

In [14]:
# Persist the merged features + target frame (index written as the first column).
preprocessed_df.to_csv("preprocessed_df.csv")

# Visualization after imputation and outlier reconstruction

In [None]:
# Visualize every column after imputation, one panel per column on a
# two-column grid.
num_variables = len(preprocessed_df.columns)  # Number of variables to plot
num_rows = (num_variables + 1) // 2  # Rows needed for two columns (rounded up)

# squeeze=False keeps `axes` two-dimensional even when there is only one row,
# so the [row, col] indexing below always works.
fig, axes = plt.subplots(num_rows, 2, figsize=(13, 3 * num_rows), squeeze=False)

for i, column in enumerate(preprocessed_df.columns):
    ax = axes[i // 2, i % 2]  # Row-major position of this column's panel
    ax.plot(preprocessed_df[column])
    ax.set_title(column)
    ax.set_xlabel('Index')
    ax.set_ylabel(column)
    ax.tick_params(axis='x', rotation=45)  # Rotate x-axis labels by 45 degrees

# With an odd number of columns the last grid cell has nothing to show; hide it.
for unused_ax in axes.ravel()[num_variables:]:
    unused_ax.set_visible(False)

# Finalise the layout BEFORE saving so the SVG on disk matches the figure that
# is displayed (tight_layout recomputes the spacing between panels).
plt.tight_layout()
plt.savefig("ImputedData.svg", format='svg', dpi=1000, bbox_inches='tight')
plt.show()

# Data Denoising

In [15]:
# Denoise every column with a trailing moving average.
# Rows that do not yet have a full window behind them come out as NaN.
window_size = 15
moving_avg = preprocessed_df.rolling(window=window_size).mean()

# Visualization after Denoising

In [None]:
# Overlay the raw and the moving-average series for every column, one panel
# per column on a two-column grid.
num_variables = len(preprocessed_df.columns)  # Number of variables to plot
num_rows = (num_variables + 1) // 2  # Rows needed for two columns (rounded up)

# squeeze=False keeps `axes` two-dimensional even when there is only one row,
# so the [row, col] indexing below always works.
fig, axes = plt.subplots(num_rows, 2, figsize=(13, 3 * num_rows), squeeze=False)

for i, column in enumerate(preprocessed_df.columns):
    ax = axes[i // 2, i % 2]  # Row-major position of this column's panel
    ax.plot(preprocessed_df[column], label='Original Data')
    ax.plot(moving_avg[column], label='Moving Average Denoised', linestyle='--')
    ax.set_title(column)
    ax.set_xlabel('Date')
    ax.set_ylabel(column)
    ax.tick_params(axis='x', rotation=45)  # Rotate x-axis labels by 45 degrees
    ax.legend()  # Without this the two `label=` values are never drawn

# With an odd number of columns the last grid cell has nothing to show; hide it.
for unused_ax in axes.ravel()[num_variables:]:
    unused_ax.set_visible(False)

# Finalise the layout BEFORE saving so the SVG on disk matches the figure that
# is displayed (tight_layout recomputes the spacing between panels).
plt.tight_layout()
plt.savefig("Denoised Data.svg", format='svg', dpi=300, bbox_inches='tight')
plt.show()


In [18]:
# Discard every row that still contains at least one NaN after smoothing.
moving_avg_cleaned = moving_avg.dropna(axis=0, how="any")

In [None]:
# Count missing values per column in the cleaned, denoised frame.
moving_avg_cleaned.isna().sum()

In [None]:
# Display the denoised frame after the NaN rows were dropped.
moving_avg_cleaned

In [None]:
# Summary statistics for every column of the denoised frame.
moving_avg_cleaned.describe()

In [21]:
# Persist the denoised frame (index written as the first column).
moving_avg_cleaned.to_csv("Denoised_df.csv")

# Create the Heatmaps

In [None]:
# Pearson correlation between every pair of columns of the denoised frame.
# (seaborn and matplotlib are imported once, in the notebook's import cell.)
feature_names = moving_avg_cleaned.columns

# Set the figure size
plt.figure(figsize=(12, 10))

# Annotated heatmap of the correlation matrix
sns.heatmap(
    moving_avg_cleaned.corr(method="pearson"),
    cmap="crest",
    annot=True,
    fmt=".2f",
    linewidths=0.20,
    cbar_kws={"shrink": .8},  # Shrink the color bar
    annot_kws={"size": 10},
    xticklabels=feature_names,
    yticklabels=feature_names
)

# Set the size of the tick labels
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)

# Pearson's r is a product-moment correlation, not a rank correlation, so the
# title must not call it "rank".
plt.title("Pearson's Correlation Heatmap", fontsize=16)
plt.xlabel("Features/Target", fontsize=16)
plt.ylabel("Features/Target", fontsize=16)

# Finalise the layout BEFORE saving so the SVG matches the displayed figure.
# The output file name is kept unchanged so existing references to it still work.
plt.tight_layout()
plt.savefig("Pearson's Rank Correlation Heatmap.svg", format='svg', dpi=1000, bbox_inches='tight')
plt.show()

In [None]:
def _draw_annotated_heatmap(matrix, labels, title, svg_path):
    """Draw one annotated 12x10 heatmap, save it as SVG, then show it.

    Parameters
    ----------
    matrix : 2-D array-like
        Values to colour and annotate (formatted with two decimals).
    labels : sequence
        Tick labels used on both axes.
    title : str
        Figure title.
    svg_path : str
        Path of the SVG file to write.
    """
    plt.figure(figsize=(12, 10))
    sns.heatmap(
        matrix,
        cmap="coolwarm",
        annot=True,
        fmt=".2f",
        linewidths=0.20,
        cbar_kws={"shrink": .8},
        annot_kws={"size": 10},
        xticklabels=labels,
        yticklabels=labels
    )
    plt.xticks(fontsize=13)
    plt.yticks(fontsize=13)
    plt.title(title, fontsize=16)
    plt.xlabel("Features/Target", fontsize=16)
    plt.ylabel("Features/Target", fontsize=16)
    # Finalise the layout BEFORE saving so the SVG matches the displayed figure.
    plt.tight_layout()
    plt.savefig(svg_path, format='svg', dpi=1000, bbox_inches='tight')
    plt.show()


# Spearman's rank correlation and the matching p-values for every column pair.
# NOTE(review): the input is a moving-average series, so neighbouring rows are
# presumably not independent -- interpret the p-values with care.
corr_matrix, p_values = stats.spearmanr(moving_avg_cleaned)

# Heatmap of the correlation coefficients
_draw_annotated_heatmap(
    corr_matrix,
    moving_avg_cleaned.columns,
    "Spearman's Rank Correlation Heatmap",
    "Spearman's Rank Correlation Heatmap.svg",
)

# Heatmap of the p-values
_draw_annotated_heatmap(
    p_values,
    moving_avg_cleaned.columns,
    "Spearman's Rank Correlation P-Values Heatmap",
    "Spearman's Rank Correlation P-Values Heatmap.svg",
)

In [None]:
def _significance_stars(p_value):
    """Return '**' for p < 0.01, '*' for p < 0.05, otherwise an empty string."""
    if p_value < 0.01:
        return '**'
    if p_value < 0.05:
        return '*'
    return ''


# Spearman's rank correlation and p-values for every column pair.
# (All libraries used here are imported once, in the notebook's import cell.)
corr_matrix, p_values = stats.spearmanr(moving_avg_cleaned)

plt.figure(figsize=(14, 12))

# Per-cell annotation text: the coefficient, plus significance stars on a second
# line for off-diagonal cells. The diagonal shows the coefficient only.
annot = np.empty(corr_matrix.shape, dtype=object)
for i in range(corr_matrix.shape[0]):
    for j in range(corr_matrix.shape[1]):
        if i == j:
            annot[i, j] = f"{corr_matrix[i, j]:.2f}"
        else:
            annot[i, j] = f"{corr_matrix[i, j]:.2f}\n{_significance_stars(p_values[i, j])}"

# Create the heatmap
sns.heatmap(
    corr_matrix,
    cmap="coolwarm",  # Color map for the heatmap
    annot=annot,  # Use the pre-formatted annotation strings
    fmt='',  # Strings are already formatted
    linewidths=0.20,
    cbar_kws={"shrink": .8},
    annot_kws={"size": 12},
    xticklabels=moving_avg_cleaned.columns,
    yticklabels=moving_avg_cleaned.columns
)

# Improve label clarity
plt.xticks(fontsize=12, rotation=45, ha='right')  # Rotate x-tick labels for better visibility
plt.yticks(fontsize=12, rotation=0)  # Keep y-tick labels horizontal

# Set title and labels
plt.title("Spearman's Rank Correlation Heatmap with P-Value Significance", fontsize=18)
plt.xlabel("Features/Target", fontsize=16)
plt.ylabel("Features/Target", fontsize=16)

# Adjust layout, then save and show
plt.tight_layout()
plt.savefig("Spearman_Correlation_Heatmap_with_PValues.svg", format='svg', dpi=1000, bbox_inches='tight')
plt.show()

In [None]:
def _lower_triangle_stars(p_value):
    """Return '**' for p < 0.01, '*' for p < 0.05, otherwise an empty string."""
    if p_value < 0.01:
        return '**'
    if p_value < 0.05:
        return '*'
    return ''


# Spearman's rank correlation and p-values for every column pair.
# (All libraries used here are imported once, in the notebook's import cell.)
corr_matrix, p_values = stats.spearmanr(moving_avg_cleaned)

# Hide only the cells strictly ABOVE the diagonal (k=1). With k=0 the diagonal
# itself was masked too, so the diagonal annotations built below were never
# drawn even though the title and file name promise "with Diagonal".
mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)

plt.figure(figsize=(14, 12))

# Per-cell annotation text: coefficient on the diagonal, coefficient plus
# significance stars below it, nothing above it.
annot = np.empty(corr_matrix.shape, dtype=object)
for i in range(corr_matrix.shape[0]):
    for j in range(corr_matrix.shape[1]):
        if i == j:
            annot[i, j] = f"{corr_matrix[i, j]:.2f}"
        elif i > j:
            annot[i, j] = f"{corr_matrix[i, j]:.2f}\n{_lower_triangle_stars(p_values[i, j])}"
        else:
            annot[i, j] = ''

# Create the heatmap
sns.heatmap(
    corr_matrix,
    mask=mask,  # Hide the upper triangle
    cmap="coolwarm",  # Color map for the heatmap
    annot=annot,  # Use the pre-formatted annotation strings
    fmt='',
    linewidths=0.20,
    cbar_kws={"shrink": .8},
    annot_kws={"size": 12},
    xticklabels=moving_avg_cleaned.columns,
    yticklabels=moving_avg_cleaned.columns
)

# Improve label clarity
plt.xticks(fontsize=12, rotation=45, ha='right')  # Rotate x-tick labels for better visibility
plt.yticks(fontsize=12, rotation=0)  # Keep y-tick labels horizontal

# Set title and labels
plt.title("Spearman's Rank Correlation Heatmap (Lower Triangle with Diagonal)", fontsize=18)
plt.xlabel("Features/Target", fontsize=16)
plt.ylabel("Features/Target", fontsize=16)

# Adjust layout, then save and show
plt.tight_layout()
plt.savefig("Spearman_Correlation_Heatmap_Lower_Triangle_with_Diagonal.svg", format='svg', dpi=1000, bbox_inches='tight')
plt.show()

In [28]:
# Identify feature pairs whose Spearman correlation is strong in either
# direction, together with the p-value of each pair.
CORRELATION_THRESHOLD = 0.60  # |rho| must exceed this for a pair to be listed

# Spearman's rank correlation and p-values for every column pair, computed once.
corr_matrix, p_values = stats.spearmanr(moving_avg_cleaned)

# Label both matrices with the column names for readable access.
feature_columns = moving_avg_cleaned.columns
corr_df = pd.DataFrame(corr_matrix, index=feature_columns, columns=feature_columns)
p_value_df = pd.DataFrame(p_values, index=feature_columns, columns=feature_columns)

# Walk the upper triangle only (j > i) so each unordered pair appears once.
# The p-value is read from the matrix computed above instead of re-running
# spearmanr for every selected pair.
highly_correlated_features = [
    (
        feature_columns[i],
        feature_columns[j],
        corr_df.iloc[i, j],
        p_value_df.iloc[i, j],
    )
    for i in range(len(feature_columns))
    for j in range(i + 1, len(feature_columns))
    if abs(corr_df.iloc[i, j]) > CORRELATION_THRESHOLD
]

# Convert to DataFrame for better visualization
highly_correlated_df = pd.DataFrame(
    highly_correlated_features,
    columns=['Feature 1', 'Feature 2', 'Correlation Coefficient', 'P-Value'],
)

In [None]:
# Display the table of strongly correlated feature pairs.
highly_correlated_df

In [30]:
# Persist the table of correlated pairs (row numbers written as the first column).
highly_correlated_df.to_csv("highly_correlated_df.csv")

# Results of Feature Selection

In [31]:
# Columns excluded from the final data set.
selected_features_to_be_removed = [
    "VS of Influent Primary Sludge (%)",
    "DS of Influent Waste Sludge (%)",
    "Influent Primary Sludge flowrate (m3/d)",
    "Influent Waste Sludge flowrate (m3/d)",
    "DS in Digesters (%)",
    "Fatty Acid (mg/L)",
    "DS of effluent Sludge (%)",
    "VS of effluent Sludge (%)",
]

# Drop the listed columns; every other column of the denoised frame is kept.
df_filtered = moving_avg_cleaned.drop(selected_features_to_be_removed, axis="columns")


In [None]:
# Display the frame that remains after the selected columns were dropped.
df_filtered

# Save the Preprocessed DataFrame as df.csv

In [33]:
# Persist the final preprocessed frame (index written as the first column).
df_filtered.to_csv("df.csv")

In [None]:
# Count missing values per column in the final frame.
df_filtered.isna().sum()