In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# KNN regression of permeate ion concentrations from feed-water conditions.
# Pipeline: load -> select X/Y -> drop constant targets -> split -> scale ->
# fit KNN -> evaluate on the held-out split -> spot-check one row.

# --- Configuration: everything a reader might want to tune ------------------
# NOTE(review): absolute, machine-specific path; point this at a
# project-relative location before sharing the notebook.
DATA_PATH = "C:/Users/Administrator/OneDrive/Desktop/THESIS/HYDRANAUTICS PROJECTION/20,25,28,30,36,40,45 TDS.xlsx"
TEST_SIZE = 0.2        # fraction of rows held out for evaluation
RANDOM_SEED = 42       # fixes the train/test shuffle so the split is reproducible
N_NEIGHBORS = 5        # k for the KNN regressor
SPOT_CHECK_ROW = 390   # 1-based row number used for the single-row comparison

# Load the excel file
data = pd.read_excel(DATA_PATH)

# Define Input (X) and Output (Y) variables
X = data[['Feed Flow (m3/hr)', 'Feed Pressure(bar)', 'Feed Temperature', 'Feed water pH', 'Specific Energy(kwh/m3)', 'Flux(lmh)',
          'Ca_FW', 'Mg_FW', 'Na_FW', 'K_FW', 'NH4_FW', 'Ba_FW', 'Sr_FW', 'H_FW', 'CO3_FW', 'HCO3_FW', 'SO4_FW', 'Cl_FW', 'F_FW',
          'NO3_FW', 'PO4_FW', 'OH_FW', 'SiO2_FW', 'B_FW', 'CO2_FW', 'NH3_FW', 'Feed Water TDS']]

Y = data[['Ca_P', 'Mg_P', 'Na_P', 'K_P', 'NH4_P', 'Ba_P', 'Sr_P', 'H_P', 'CO3_P', 'HCO3_P', 'SO4_P', 'Cl_P', 'F_P',
          'NO3_P', 'PO4_P', 'OH_P', 'SiO2_P', 'B_P', 'CO2_P', 'NH3_P', 'Permeate TDS']]

# Remove target columns with at most one unique value: there is nothing to
# learn for a constant target.
low_variance_columns = [col for col in Y.columns if Y[col].nunique() <= 1]
Y = Y.drop(columns=low_variance_columns)

# Train-Test Split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED
)

# Scale Input Data.
# KNN is distance based, so features must share a comparable scale. The scaler
# is fitted on the training split only and then applied to the test split.
x_scaler = StandardScaler()
X_train_scaled = x_scaler.fit_transform(X_train)
X_test_scaled = x_scaler.transform(X_test)

# Scale Output Data.
# This does NOT change the model's accuracy: with the default uniform weights a
# KNN prediction is the plain mean of the neighbours' targets, and a per-column
# affine scaling cancels out when the predictions are inverse-transformed below.
# It is kept so the results match earlier runs; do not rely on it to repair a
# negative R².
y_scaler = StandardScaler()
Y_train_scaled = y_scaler.fit_transform(Y_train)
# Not used further in this cell; kept in the notebook namespace for later use.
Y_test_scaled = y_scaler.transform(Y_test)

# Train KNN Model
knn_model = KNeighborsRegressor(n_neighbors=N_NEIGHBORS, metric='euclidean')
knn_model.fit(X_train_scaled, Y_train_scaled)

# Predict
Y_pred_scaled = knn_model.predict(X_test_scaled)

# Convert predictions back to original scale
Y_pred = y_scaler.inverse_transform(Y_pred_scaled)

# Compute Metrics.
# These aggregate over ALL target columns (scikit-learn's default for 2-D
# targets is a uniform average across outputs). MAE/MSE/RMSE therefore mix
# columns of very different magnitude and are dominated by the largest ones;
# use the per-column values below for interpretation.
mae = mean_absolute_error(Y_test, Y_pred)
mse = mean_squared_error(Y_test, Y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(Y_test, Y_pred)

# Compute R² & MAE for Each Output Parameter
r2_scores = {col: r2_score(Y_test[col], Y_pred[:, i]) for i, col in enumerate(Y_test.columns)}
mae_scores = {col: mean_absolute_error(Y_test[col], Y_pred[:, i]) for i, col in enumerate(Y_test.columns)}

# Print Results
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Overall R² Score: {r2:.4f}")

print("\nR² Scores for Each Output Parameter:")
for col, r2_value in r2_scores.items():
    print(f"{col}: {r2_value:.4f}")

print("\nMean Absolute Error for Each Output Parameter:")
for col, mae_value in mae_scores.items():
    print(f"{col}: {mae_value:.4f}")

# Predict for the spot-check row (1-based row number -> 0-based position).
# Double brackets keep the selection as a one-row DataFrame.
row_390 = data.iloc[[SPOT_CHECK_ROW - 1]]
X_row_390 = row_390[X.columns]
X_row_390_scaled = x_scaler.transform(X_row_390)

predicted_row_390_scaled = knn_model.predict(X_row_390_scaled)
predicted_row_390 = y_scaler.inverse_transform(predicted_row_390_scaled)

# The row is taken from the full data set, so it may have been used to fit the
# model. Report which split it belongs to so the comparison is not mistaken
# for a held-out check when it is not one.
row_390_in_training = row_390.index[0] in X_train.index

# Compare Actual vs Predicted
print(f"\nActual vs Predicted for Row {SPOT_CHECK_ROW}:")
if row_390_in_training:
    print(f"(Row {SPOT_CHECK_ROW} is in the training split: the model has already seen it, so this is not a held-out check.)")
else:
    print(f"(Row {SPOT_CHECK_ROW} is in the held-out test split.)")
for i, col in enumerate(Y_test.columns):
    print(f"{col}: Actual = {row_390[col].values[0]:.4f}, Predicted = {predicted_row_390[0][i]:.4f}")


Mean Absolute Error (MAE): 0.9264
Mean Squared Error (MSE): 6.5873
Root Mean Squared Error (RMSE): 2.5666
Overall R² Score: 0.9314

R² Scores for Each Output Parameter:
Ca_P: 0.8824
Mg_P: 0.9749
Na_P: 0.9879
K_P: 0.6236
H_P: 0.9585
HCO3_P: 0.9562
SO4_P: 0.9905
Cl_P: 0.9916
SiO2_P: 0.8895
CO2_P: 1.0000
Permeate TDS: 0.9901

Mean Absolute Error for Each Output Parameter:
Ca_P: 0.0013
Mg_P: 0.0037
Na_P: 2.1092
K_P: 0.1314
H_P: 0.0000
HCO3_P: 0.1301
SO4_P: 0.0876
Cl_P: 2.7832
SiO2_P: 0.0008
CO2_P: 0.0000
Permeate TDS: 4.9429

Actual vs Predicted for Row 390:
Ca_P: Actual = 0.0590, Predicted = 0.0582
Mg_P: Actual = 0.2790, Predicted = 0.2752
Na_P: Actual = 151.3170, Predicted = 149.0894
K_P: Actual = 5.7460, Predicted = 5.6616
H_P: Actual = 0.0020, Predicted = 0.0020
HCO3_P: Actual = 8.0510, Predicted = 7.9328
SO4_P: Actual = 5.8530, Predicted = 5.7662
Cl_P: Actual = 230.3430, Predicted = 226.9520
SiO2_P: Actual = 0.0470, Predicted = 0.0460
CO2_P: Actual = 24.1100, Predicted = 24.1100
Perme

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# KNN regression of permeate AND concentrate quantities from feed-water
# conditions.
# Pipeline: load -> select X/Y -> drop constant targets -> split -> scale ->
# fit KNN -> evaluate on the held-out split -> spot-check one row.

# --- Configuration: everything a reader might want to tune ------------------
# NOTE(review): absolute, machine-specific path; point this at a
# project-relative location before sharing the notebook.
DATA_PATH = "C:/Users/Administrator/OneDrive/Desktop/THESIS/HYDRANAUTICS PROJECTION/20,25,28,30,36,40,45 TDS.xlsx"
TEST_SIZE = 0.2        # fraction of rows held out for evaluation
RANDOM_SEED = 42       # fixes the train/test shuffle so the split is reproducible
N_NEIGHBORS = 5        # k for the KNN regressor
SPOT_CHECK_ROW = 390   # 1-based row number used for the single-row comparison

# Load the excel file
data = pd.read_excel(DATA_PATH)

# Define Input (X) and Output (Y) variables
X = data[['Feed Flow (m3/hr)', 'Feed Pressure(bar)', 'Feed Temperature', 'Feed water pH', 'Specific Energy(kwh/m3)', 'Flux(lmh)',
          'Ca_FW', 'Mg_FW', 'Na_FW', 'K_FW', 'NH4_FW', 'Ba_FW', 'Sr_FW', 'H_FW', 'CO3_FW', 'HCO3_FW', 'SO4_FW', 'Cl_FW', 'F_FW',
          'NO3_FW', 'PO4_FW', 'OH_FW', 'SiO2_FW', 'B_FW', 'CO2_FW', 'NH3_FW', 'Feed Water TDS']]

Y = data[['Ca_P', 'Mg_P', 'Na_P', 'K_P', 'NH4_P', 'Ba_P', 'Sr_P', 'H_P', 'CO3_P', 'HCO3_P',
          'SO4_P', 'Cl_P', 'F_P', 'NO3_P', 'PO4_P', 'OH_P', 'SiO2_P', 'B_P', 'CO2_P', 'NH3_P',
          'Permeate TDS', 'Ca_C', 'Mg_C', 'Na_C', 'K_C', 'NH4_C', 'Ba_C', 'Sr_C', 'H_C',
          'CO3_C', 'HCO3_C', 'SO4_C', 'Cl_C', 'F_C', 'NO3_C', 'PO4_C', 'OH_C', 'SiO2_C',
          'B_C', 'CO2_C', 'NH3_C', 'Concentrate TDS', 'CaSO4 / ksp * 100, %_C',
          'SrSO4 / ksp * 100, %_C', 'BaSO4 / ksp * 100, %_C', 'SiO2 saturation, %_C',
          'CaF2 / ksp * 100, %_C']]

# Remove target columns with at most one unique value: there is nothing to
# learn for a constant target.
low_variance_columns = [col for col in Y.columns if Y[col].nunique() <= 1]
Y = Y.drop(columns=low_variance_columns)

# Train-Test Split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED
)

# Scale Input Data.
# KNN is distance based, so features must share a comparable scale. The scaler
# is fitted on the training split only and then applied to the test split.
x_scaler = StandardScaler()
X_train_scaled = x_scaler.fit_transform(X_train)
X_test_scaled = x_scaler.transform(X_test)

# Scale Output Data.
# This does NOT change the model's accuracy: with the default uniform weights a
# KNN prediction is the plain mean of the neighbours' targets, and a per-column
# affine scaling cancels out when the predictions are inverse-transformed below.
# It is kept so the results match earlier runs; do not rely on it to repair a
# negative R².
# NOTE(review): the recorded run of this cell still reports R² = -1.1085 (MAE
# 355.0764) for SO4_C with this scaling in place. Inspect that column
# (outliers, units, data-entry errors) before trusting it.
y_scaler = StandardScaler()
Y_train_scaled = y_scaler.fit_transform(Y_train)
# Not used further in this cell; kept in the notebook namespace for later use.
Y_test_scaled = y_scaler.transform(Y_test)

# Train KNN Model
knn_model = KNeighborsRegressor(n_neighbors=N_NEIGHBORS, metric='euclidean')
knn_model.fit(X_train_scaled, Y_train_scaled)

# Predict
Y_pred_scaled = knn_model.predict(X_test_scaled)

# Convert predictions back to original scale
Y_pred = y_scaler.inverse_transform(Y_pred_scaled)

# Compute Metrics.
# These aggregate over ALL target columns (scikit-learn's default for 2-D
# targets is a uniform average across outputs). MAE/MSE/RMSE therefore mix
# columns of very different magnitude and are dominated by the largest ones,
# and a single badly predicted column pulls the overall R² down; use the
# per-column values below for interpretation.
mae = mean_absolute_error(Y_test, Y_pred)
mse = mean_squared_error(Y_test, Y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(Y_test, Y_pred)

# Compute R² & MAE for Each Output Parameter
r2_scores = {col: r2_score(Y_test[col], Y_pred[:, i]) for i, col in enumerate(Y_test.columns)}
mae_scores = {col: mean_absolute_error(Y_test[col], Y_pred[:, i]) for i, col in enumerate(Y_test.columns)}

# Print Results
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Overall R² Score: {r2:.4f}")

print("\nR² Scores for Each Output Parameter:")
for col, r2_value in r2_scores.items():
    print(f"{col}: {r2_value:.4f}")

print("\nMean Absolute Error for Each Output Parameter:")
for col, mae_value in mae_scores.items():
    print(f"{col}: {mae_value:.4f}")

# Predict for the spot-check row (1-based row number -> 0-based position).
# Double brackets keep the selection as a one-row DataFrame.
row_390 = data.iloc[[SPOT_CHECK_ROW - 1]]
X_row_390 = row_390[X.columns]
X_row_390_scaled = x_scaler.transform(X_row_390)

predicted_row_390_scaled = knn_model.predict(X_row_390_scaled)
predicted_row_390 = y_scaler.inverse_transform(predicted_row_390_scaled)

# The row is taken from the full data set, so it may have been used to fit the
# model. Report which split it belongs to so the comparison is not mistaken
# for a held-out check when it is not one.
row_390_in_training = row_390.index[0] in X_train.index

# Compare Actual vs Predicted
print(f"\nActual vs Predicted for Row {SPOT_CHECK_ROW}:")
if row_390_in_training:
    print(f"(Row {SPOT_CHECK_ROW} is in the training split: the model has already seen it, so this is not a held-out check.)")
else:
    print(f"(Row {SPOT_CHECK_ROW} is in the held-out test split.)")
for i, col in enumerate(Y_test.columns):
    print(f"{col}: Actual = {row_390[col].values[0]:.4f}, Predicted = {predicted_row_390[0][i]:.4f}")


Mean Absolute Error (MAE): 28.3836
Mean Squared Error (MSE): 234832.2730
Root Mean Squared Error (RMSE): 484.5950
Overall R² Score: 0.8618

R² Scores for Each Output Parameter:
Ca_P: 0.8824
Mg_P: 0.9749
Na_P: 0.9879
K_P: 0.6236
H_P: 0.9585
HCO3_P: 0.9562
SO4_P: 0.9905
Cl_P: 0.9916
SiO2_P: 0.8895
CO2_P: 1.0000
Permeate TDS: 0.9901
Ca_C: 1.0000
Mg_C: 1.0000
Na_C: 1.0000
K_C: 0.8407
CO3_C: 0.9991
HCO3_C: 0.9959
SO4_C: -1.1085
Cl_C: 1.0000
CO2_C: 0.9982
Concentrate TDS: 0.9895
CaSO4 / ksp * 100, %_C: 1.0000

Mean Absolute Error for Each Output Parameter:
Ca_P: 0.0013
Mg_P: 0.0037
Na_P: 2.1092
K_P: 0.1314
H_P: 0.0000
HCO3_P: 0.1301
SO4_P: 0.0876
Cl_P: 2.7832
SiO2_P: 0.0008
CO2_P: 0.0000
Permeate TDS: 4.9429
Ca_C: 0.0356
Mg_C: 0.5321
Na_C: 4.9616
K_C: 0.1022
CO3_C: 0.0354
HCO3_C: 0.8713
SO4_C: 355.0764
Cl_C: 3.2974
CO2_C: 0.0220
Concentrate TDS: 249.3148
CaSO4 / ksp * 100, %_C: 0.0000

Actual vs Predicted for Row 390:
Ca_P: Actual = 0.0590, Predicted = 0.0582
Mg_P: Actual = 0.2790, Predicted