# Libraries Import

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
from sklearn.tree import DecisionTreeRegressor,plot_tree
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# Data Set Loading and feature extraction

Set `path` below to the location of your Excel dataset file.

In [None]:
# Location of the Excel workbook that holds the dataset.
path = r"Data GWP ML 2.xlsx"

# Read the workbook into the working DataFrame and display it.
df = pd.read_excel(io=path)
df

Creating percentage columns from weight

In [None]:
# Each raw constituent column paired with the percentage column derived from
# it. The trailing spaces are part of the column names used by this notebook.
percentage_column_for = {
    'Fly ash ': 'fly_ash_percentage',
    'GGBS ': 'ggbs_percentage',
    'SS ': 'sodium_silicate_percentage',
    'SH': 'sodium_hydroxide_percentage',
    'Sand ': 'sand_percentage',
    'Coarse aggregate': 'coarse_aggregate_percentage',
    'Glass waste powder': 'glass_waste_percentage',
}
numeric_cols = list(percentage_column_for)

# Force every constituent column to a numeric dtype; unparseable cells become NaN.
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Discard rows in which any constituent is missing after the coercion.
df.dropna(subset=numeric_cols, inplace=True)

# Row-wise total of all constituents, accumulated column by column.
total_sum = df[numeric_cols[0]]
for col in numeric_cols[1:]:
    total_sum = total_sum + df[col]

# Share of each constituent in the row total, in percent, rounded to 2 decimals.
for raw_col, pct_col in percentage_column_for.items():
    df[pct_col] = np.round(df[raw_col] / total_sum * 100, 2)

# Check column: the rounded shares added back together (expected close to 100).
percentage_names = list(percentage_column_for.values())
total_percentage = df[percentage_names[0]]
for pct_col in percentage_names[1:]:
    total_percentage = total_percentage + df[pct_col]
df['total_percentage'] = total_percentage
df

In [None]:
# Columns summarised below: the testing age plus the seven percentage columns.
columns_for_descriptive_stats = [
    'Number of days (testing)',
    'fly_ash_percentage',
    'ggbs_percentage',
    'sodium_silicate_percentage',
    'sodium_hydroxide_percentage',
    'sand_percentage',
    'coarse_aggregate_percentage',
    'glass_waste_percentage',
]

# Summary statistics (count, mean, std, min, quartiles, max) per selected column.
df.loc[:, columns_for_descriptive_stats].describe()

# Pre-processing steps

Step 1 - Encoding the categorical curing type as a binary (0/1) variable

In [None]:
# Encode the curing regime as a binary feature: 'oven' -> 0, 'outdoor' -> 1.
# Labels are compared case-insensitively with surrounding whitespace removed.
# Missing or unrecognised labels become NaN instead of raising (calling
# .lower() on a NaN cell is an AttributeError), so they can be inspected or
# dropped before modelling.
curing_codes = {'oven': 0, 'outdoor': 1}
df['Curing_type'] = (
    df['Curing type']
    .astype(str)      # NaN / non-string cells become text that simply fails to map
    .str.strip()
    .str.lower()
    .map(curing_codes)
)

# Surface unmapped rows early: NaN in a feature column makes the estimators
# fitted later in the notebook fail.
n_unmapped_curing = int(df['Curing_type'].isna().sum())
if n_unmapped_curing:
    print(f"Warning: {n_unmapped_curing} row(s) have a missing or unrecognised 'Curing type'")


Step 2 - Feature Extraction

X = Independent Variables

Y = Dependent Variable

In [None]:
# Predictors: testing age, the seven constituent percentages and the encoded
# curing type.
feature_columns = [
    'Number of days (testing)',
    'fly_ash_percentage',
    'ggbs_percentage',
    'sodium_silicate_percentage',
    'sodium_hydroxide_percentage',
    'sand_percentage',
    'coarse_aggregate_percentage',
    'glass_waste_percentage',
    'Curing_type',
]
X = df[feature_columns]  # independent variables

# Regression target.
y = df['Compressive Strength']

Train-Test split and Standardization (zero mean, unit variance via StandardScaler)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the shuffle repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
X_train, X_test, y_train, y_test = split_parts
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)

# Standardise the features. The scaler learns its statistics from the training
# rows only and is then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Pearson Correlation Matrix

In [None]:
# Pearson correlation matrix of the model input features, computed from the
# actual data in `X`. The previous version of this cell drew a randomly
# generated matrix and labelled it with the real feature names, so the figure
# carried no information about the dataset.

# Seed NumPy's global RNG. This cell no longer draws random numbers itself; the
# call is kept so that later cells drawing from np.random stay repeatable.
np.random.seed(42)

feature_names = X.columns.tolist()
num_features = len(feature_names)

# Cast to float first so an object-typed column (e.g. ints mixed with None)
# is included rather than rejected by `corr`.
corr_df = X[feature_names].astype(float).corr(method='pearson')
corr_matrix = corr_df.to_numpy()

# Heatmap with a colour scale fixed to the full [-1, 1] correlation range.
plt.figure(figsize=(10, 8))
sns.heatmap(corr_df, annot=True, cmap='coolwarm', fmt=".2f", square=True,
            linewidths=.5, vmin=-1, vmax=1, center=0)
plt.title('Pearson Correlation Matrix of Input Features')
plt.tight_layout()
plt.show()

Pair Plots

In [None]:
# --- Pair plot -------------------------------------------------------------
# Numeric predictors plus the target, so every pairwise relationship with
# 'Compressive Strength' is shown.
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
features_for_pairplot = numerical_features + ['Compressive Strength']
df_pairplot = df[features_for_pairplot]

# Kernel density estimates on the diagonal instead of histograms.
sns.pairplot(df_pairplot, diag_kind='kde')
plt.show()

# --- Bar plot of mean constituent shares -------------------------------------
percentage_cols = [
    'fly_ash_percentage',
    'ggbs_percentage',
    'sodium_silicate_percentage',
    'sodium_hydroxide_percentage',
    'sand_percentage',
    'coarse_aggregate_percentage',
    'glass_waste_percentage',
]

# Average of each percentage column over all rows.
mean_percentages = df[percentage_cols].mean()

fig, ax = plt.subplots(figsize=(12, 6))
mean_percentages.plot.bar(color='teal', ax=ax)
ax.set_title('Mean Percentage of Each Constituent')
ax.set_ylabel('Mean Percentage (%)')
ax.set_xlabel('Constituent')
plt.xticks(rotation=45, ha='right')  # slanted labels stay readable
fig.tight_layout()                   # keep the labels inside the figure
plt.show()

# Model Training

Multiple Linear Regression

In [None]:
# Ordinary least-squares fit on the standardised training features.
lr_model = LinearRegression().fit(X_train_scaled, y_train)

# Predictions for both splits.
y_train_pred = lr_model.predict(X_train_scaled)
y_pred = lr_model.predict(X_test_scaled)

# (train label, test label, metric function)
metric_rows = (
    ("Mean Squared Error train :", "Mean Squared Error:", mean_squared_error),
    ("Mean Absolute Error train :", "Mean Absolute Error:", mean_absolute_error),
    ("R² Score train:", "R² Score:", r2_score),
)

for metric_row in metric_rows:
    print(metric_row[0], metric_row[2](y_train, y_train_pred))
print("Coefficients:", lr_model.coef_)
print("Intercept:", lr_model.intercept_)
for metric_row in metric_rows:
    print(metric_row[1], metric_row[2](y_test, y_pred))

Performance Curve for Multiple Linear Regression

In [None]:
# Parity plot for the multiple linear regression model on the test set.
# Predictions are taken from `lr_model` directly rather than from the shared
# `y_pred` name, which the Ridge cell overwrites; re-running this cell
# therefore always shows the linear model.
y_pred_lr = lr_model.predict(X_test_scaled)

plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred_lr, color='blue', alpha=0.6)
# Red diagonal = perfect prediction (predicted equals actual).
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linewidth=2)
# The target is compressive strength, so the axes are labelled accordingly.
plt.xlabel('Actual Compressive Strength')
plt.ylabel('Predicted Compressive Strength')
plt.title('Actual vs. Predicted Compressive Strength (Multiple Linear Regression)')
plt.grid(True)
plt.show()

Ridge Regression

In [None]:
from sklearn.linear_model import Ridge

# L2-regularised linear model; a larger alpha shrinks the coefficients more.
ridge_model = Ridge(alpha=1.0).fit(X_train_scaled, y_train)

# Predictions for both splits (these reuse the names set by the linear model).
y_train_pred = ridge_model.predict(X_train_scaled)
y_pred = ridge_model.predict(X_test_scaled)

# (train label, test label, metric function)
metric_rows = (
    ("Mean Squared Error train :", "Mean Squared Error:", mean_squared_error),
    ("Mean Absolute Error train :", "Mean Absolute Error:", mean_absolute_error),
    ("R² Score train:", "R² Score:", r2_score),
)

for metric_row in metric_rows:
    print(metric_row[0], metric_row[2](y_train, y_train_pred))
print("Ridge Coefficients:", ridge_model.coef_)
print("Ridge Intercepts:", ridge_model.intercept_)
for metric_row in metric_rows:
    print(metric_row[1], metric_row[2](y_test, y_pred))

In [None]:
# Parity plot for the Ridge model on the test set. Predictions come from
# `ridge_model` directly instead of the shared `y_pred` name, so the figure
# cannot silently show another model's output.
y_pred_ridge = ridge_model.predict(X_test_scaled)

plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred_ridge, color='blue', alpha=0.6)
# Red diagonal = perfect prediction (predicted equals actual).
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linewidth=2)
# The target is compressive strength, so the axes are labelled accordingly.
plt.xlabel('Actual Compressive Strength')
plt.ylabel('Predicted Compressive Strength')
plt.title('Actual vs. Predicted Compressive Strength (Ridge Regression)')
plt.grid(True)
plt.show()

Decision tree

In [None]:
# Baseline decision tree with default hyper-parameters and a fixed seed.
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train_scaled, y_train)

# Predictions for both splits.
y_pred_dt = dt_model.predict(X_test_scaled)
y_train_pred_dt = dt_model.predict(X_train_scaled)

# (train label, test label, metric function)
metric_rows = (
    ("Mean Squared Error train :", "Mean Squared Error:", mean_squared_error),
    ("Mean Absolute Error train :", "Mean Absolute Error:", mean_absolute_error),
    ("R² Score train:", "R² Score:", r2_score),
)

print("Decision Tree:")
for metric_row in metric_rows:
    print(metric_row[0], metric_row[2](y_train, y_train_pred_dt))
for metric_row in metric_rows:
    print(metric_row[1], metric_row[2](y_test, y_pred_dt))


In [None]:
# Measured vs. predicted compressive strength for each test sample, in test-set order.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(y_test.to_numpy(), marker='o', label='Actual')
ax.plot(y_pred_dt, marker='x', label='Predicted (Decision Tree)')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Compressive Strength')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

Decision Tree importance features

In [None]:
# Importance assigned by the fitted decision tree to each input feature.
feature_names = X.columns
dt_importances = dt_model.feature_importances_

# Horizontal bars, one per feature.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=dt_importances, y=feature_names, ax=ax)
ax.set_title('Feature Importance from Decision tree')
ax.set_xlabel('Importance')
ax.set_ylabel('Features')
plt.show()

DT with grid search

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter combinations to evaluate (4 * 3 * 3 * 3 * 3 = 324 candidates).
param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'ccp_alpha': [0.0, 0.01, 0.1]
}

# The search gets its own estimator instance instead of rebinding `dt_model`.
# GridSearchCV fits clones of the estimator it is given, so rebinding the name
# would leave `dt_model` pointing at an unfitted tree and break any baseline
# cell (predictions, feature importances) that is re-run afterwards.
grid_search = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,                              # 5-fold cross-validation
    scoring='neg_mean_squared_error',  # higher (less negative) is better
    verbose=1,
    n_jobs=-1                          # use all available CPU cores
)

# Run the search on the standardised training data.
grid_search.fit(X_train_scaled, y_train)

# Best combination and the model refitted with it on the full training set.
print("Best parameters found: ", grid_search.best_params_)
best_dt_model = grid_search.best_estimator_

In [None]:
# Evaluate the tuned decision tree held in `best_dt_model` (set by the
# decision-tree grid-search cell). The model is deliberately NOT re-read from
# `grid_search` here: that name is reused by the searches for other model
# families, so re-running this cell later would otherwise report another
# model's results under the "Tuned Decision Tree" heading.

# Predictions for both splits.
y_train_pred_best = best_dt_model.predict(X_train_scaled)
y_test_pred_best = best_dt_model.predict(X_test_scaled)

print("--- Tuned Decision Tree Results ---")

# Training-set performance.
print("\nTraining Set Metrics:")
print(f"Mean Squared Error (Train): {mean_squared_error(y_train, y_train_pred_best)}")
print(f"Mean Absolute Error (Train): {mean_absolute_error(y_train, y_train_pred_best)}")
print(f"R² Score (Train): {r2_score(y_train, y_train_pred_best)}")

# Test-set performance.
print("\nTest Set Metrics:")
print(f"Mean Squared Error (Test): {mean_squared_error(y_test, y_test_pred_best)}")
print(f"Mean Absolute Error (Test): {mean_absolute_error(y_test, y_test_pred_best)}")
print(f"R² Score (Test): {r2_score(y_test, y_test_pred_best)}")

Random Forest

In [None]:
# Baseline random forest: 100 trees, fixed seed.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Predictions for both splits.
y_pred_rf = rf_model.predict(X_test_scaled)
y_train_pred_rf = rf_model.predict(X_train_scaled)

# (train label, test label, metric function)
metric_rows = (
    ("Mean Squared Error train :", "Mean Squared Error:", mean_squared_error),
    ("Mean Absolute Error train :", "Mean Absolute Error:", mean_absolute_error),
    ("R² Score train:", "R² Score:", r2_score),
)

print("\nRandom Forest:")
for metric_row in metric_rows:
    print(metric_row[0], metric_row[2](y_train, y_train_pred_rf))
for metric_row in metric_rows:
    print(metric_row[1], metric_row[2](y_test, y_pred_rf))



In [None]:
# Measured vs. predicted compressive strength for each test sample, in test-set order.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(y_test.to_numpy(), marker='o', label='Actual')
ax.plot(y_pred_rf, marker='x', label='Predicted (Random Forest)')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Compressive Strength')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

Random Forest importance features

In [None]:
# Importance assigned by the fitted random forest to each input feature.
feature_names = X.columns
importances = rf_model.feature_importances_

# Horizontal bars, one per feature.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=importances, y=feature_names, ax=ax)
ax.set_title('Feature Importance from Random Forest')
ax.set_xlabel('Importance')
ax.set_ylabel('Features')
plt.show()

RF grid search

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter combinations to evaluate (2 * 3 * 2 * 2 * 2 = 48 candidates).
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']
}

# The search gets its own estimator instance instead of rebinding `rf_model`.
# GridSearchCV fits clones of the estimator it is given, so rebinding the name
# would leave `rf_model` pointing at an unfitted forest and break any baseline
# cell (predictions, feature importances) that is re-run afterwards.
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,       # 5-fold cross-validation
    n_jobs=-1,  # use all available cores
    verbose=2,
    scoring='neg_mean_squared_error'
)

# Run the search on the standardised training data.
grid_search.fit(X_train_scaled, y_train)

# Model refitted with the best combination, and the combination itself.
best_rf_model = grid_search.best_estimator_
print("Best parameters found: ", grid_search.best_params_)

In [None]:
# Evaluate the tuned random forest held in `best_rf_model` (set by the
# random-forest grid-search cell). The model is deliberately NOT re-read from
# `grid_search` here: that name is reused by the searches for other model
# families, so re-running this cell later would otherwise report another
# model's results under the "Tuned Random Forest" heading.

# Predictions for both splits.
y_train_pred_best_rf = best_rf_model.predict(X_train_scaled)
y_test_pred_best_rf = best_rf_model.predict(X_test_scaled)

print("--- Tuned Random Forest Results ---")

# Training-set performance.
print("\nTraining Set Metrics:")
print(f"Mean Squared Error (Train): {mean_squared_error(y_train, y_train_pred_best_rf)}")
print(f"Mean Absolute Error (Train): {mean_absolute_error(y_train, y_train_pred_best_rf)}")
print(f"R² Score (Train): {r2_score(y_train, y_train_pred_best_rf)}")

# Test-set performance.
print("\nTest Set Metrics:")
print(f"Mean Squared Error (Test): {mean_squared_error(y_test, y_test_pred_best_rf)}")
print(f"Mean Absolute Error (Test): {mean_absolute_error(y_test, y_test_pred_best_rf)}")
print(f"R² Score (Test): {r2_score(y_test, y_test_pred_best_rf)}")

ADA Boost

In [None]:
# Baseline AdaBoost regressor: 100 boosting rounds, fixed seed.
ada_model = AdaBoostRegressor(n_estimators=100, random_state=42)
ada_model.fit(X_train_scaled, y_train)

# Predictions for both splits.
y_train_pred_ada = ada_model.predict(X_train_scaled)
y_test_pred_ada = ada_model.predict(X_test_scaled)

# Label / function pairs shared by the training and test reports.
metric_rows = (
    ("MSE:", mean_squared_error),
    ("MAE:", mean_absolute_error),
    ("R²:", r2_score),
)

print("=== AdaBoost Training Metrics ===")
for metric_label, metric_fn in metric_rows:
    print(metric_label, metric_fn(y_train, y_train_pred_ada))

print("\n=== AdaBoost Test Metrics ===")
for metric_label, metric_fn in metric_rows:
    print(metric_label, metric_fn(y_test, y_test_pred_ada))

Performance of Ada Boost

In [None]:
# Measured vs. predicted compressive strength for each test sample (AdaBoost).
plt.figure(figsize=(10, 5))
plt.plot(y_test.values, label='Actual', marker='o')
# Legend label fixed: the original text ended with a stray ')'.
plt.plot(y_test_pred_ada, label='Predicted (Ada Boost)', marker='x')
plt.xlabel('Sample Index')
plt.ylabel('Compressive Strength')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

Feature Importance of Ada Boost

In [None]:
# Importance assigned by the fitted AdaBoost model to each input feature.
feature_names = X.columns
importances = ada_model.feature_importances_

# Horizontal bars, one per feature.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=importances, y=feature_names, ax=ax)
ax.set_title('Feature Importance from ada Boost')
ax.set_xlabel('Importance')
ax.set_ylabel('Features')
plt.show()

Grid Search for AdaBoost

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter combinations to evaluate (3 * 3 * 3 * 3 = 81 candidates).
# `estimator__max_depth` tunes the depth of the decision tree used as the
# base learner; the other entries are AdaBoost's own parameters.
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'loss': ['linear', 'square', 'exponential'],
    'estimator__max_depth': [3, 5, 7]
}

# The base learner must be passed explicitly so its parameters can be tuned.
base_estimator = DecisionTreeRegressor()

# The search gets its own AdaBoost instance instead of rebinding `ada_model`.
# GridSearchCV fits clones of the estimator it is given, so rebinding the name
# would leave `ada_model` pointing at an unfitted model and break any baseline
# cell (predictions, feature importances) that is re-run afterwards.
grid_search = GridSearchCV(
    estimator=AdaBoostRegressor(estimator=base_estimator, random_state=42),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
    scoring='neg_mean_squared_error'
)

# Run the search on the standardised training data.
grid_search.fit(X_train_scaled, y_train)

# Best combination and the model refitted with it.
print("Best parameters found: ", grid_search.best_params_)
best_ada_model = grid_search.best_estimator_

# Predictions of the tuned model for both splits.
y_train_pred_best_ada = best_ada_model.predict(X_train_scaled)
y_test_pred_best_ada = best_ada_model.predict(X_test_scaled)

# Training-set performance.
print("\n=== Tuned AdaBoost Training Metrics ===")
print(f"MSE: {mean_squared_error(y_train, y_train_pred_best_ada)}")
print(f"MAE: {mean_absolute_error(y_train, y_train_pred_best_ada)}")
print(f"R²: {r2_score(y_train, y_train_pred_best_ada)}")

# Test-set performance.
print("\n=== Tuned AdaBoost Test Metrics ===")
print(f"MSE: {mean_squared_error(y_test, y_test_pred_best_ada)}")
print(f"MAE: {mean_absolute_error(y_test, y_test_pred_best_ada)}")
print(f"R²: {r2_score(y_test, y_test_pred_best_ada)}")

GRADIENT BOOST

In [None]:

# --- Fit -------------------------------------------------------------------
# Baseline gradient boosting: 100 depth-3 trees, learning rate 0.1, fixed seed.
gbr_model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
gbr_model.fit(X_train_scaled, y_train)

# Predictions for both splits.
y_train_pred_gbr = gbr_model.predict(X_train_scaled)
y_test_pred_gbr = gbr_model.predict(X_test_scaled)

# --- Metrics -----------------------------------------------------------------
metric_rows = (
    ("MSE:", mean_squared_error),
    ("MAE:", mean_absolute_error),
    ("R²:", r2_score),
)

print("=== Gradient Boosting Training Metrics ===")
for metric_label, metric_fn in metric_rows:
    print(metric_label, metric_fn(y_train, y_train_pred_gbr))

print("\n=== Gradient Boosting Test Metrics ===")
for metric_label, metric_fn in metric_rows:
    print(metric_label, metric_fn(y_test, y_test_pred_gbr))

# --- Measured vs. predicted on the test set -----------------------------------
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(y_test.to_numpy(), marker='o', label='Actual')
ax.plot(y_test_pred_gbr, marker='x', label='Predicted (Gradient Boosting)')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Compressive Strength')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

# --- Feature importances -------------------------------------------------------
feature_names = X.columns
importances = gbr_model.feature_importances_

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=importances, y=feature_names, ax=ax)
ax.set_title('Feature Importance from Gradient Boosting')
ax.set_xlabel('Importance')
ax.set_ylabel('Features')
plt.show()

GRID SEARCH FOR GRADIENT BOOST

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter combinations to evaluate (3 * 3 * 3 * 2 = 54 candidates).
param_grid = {
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0]  # with (0.8) and without (1.0) stochastic gradient boosting
}

# The search gets its own estimator instance instead of rebinding `gbr_model`.
# GridSearchCV fits clones of the estimator it is given, so rebinding the name
# would leave `gbr_model` pointing at an unfitted model and break any baseline
# code (predictions, feature importances) that is re-run afterwards.
grid_search = GridSearchCV(
    estimator=GradientBoostingRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
    scoring='neg_mean_squared_error'
)

# Run the search on the standardised training data.
grid_search.fit(X_train_scaled, y_train)

# Best combination and the model refitted with it.
print("Best parameters found: ", grid_search.best_params_)
best_gbr_model = grid_search.best_estimator_

# Predictions of the tuned model for both splits.
y_train_pred_best_gbr = best_gbr_model.predict(X_train_scaled)
y_test_pred_best_gbr = best_gbr_model.predict(X_test_scaled)

# Training-set performance.
print("\n=== Tuned Gradient Boosting Training Metrics ===")
print(f"MSE: {mean_squared_error(y_train, y_train_pred_best_gbr)}")
print(f"MAE: {mean_absolute_error(y_train, y_train_pred_best_gbr)}")
print(f"R²: {r2_score(y_train, y_train_pred_best_gbr)}")

# Test-set performance.
print("\n=== Tuned Gradient Boosting Test Metrics ===")
print(f"MSE: {mean_squared_error(y_test, y_test_pred_best_gbr)}")
print(f"MAE: {mean_absolute_error(y_test, y_test_pred_best_gbr)}")
print(f"R²: {r2_score(y_test, y_test_pred_best_gbr)}")

Support Vector Regressor

In [None]:
# Baseline support vector regressor: RBF kernel, C=100, data-scaled gamma.
svr_model = SVR(C=100, kernel='rbf', gamma='scale')
svr_model.fit(X_train_scaled, y_train)

# Predictions for both splits.
y_train_pred_svr = svr_model.predict(X_train_scaled)
y_test_pred_svr = svr_model.predict(X_test_scaled)

# Label / function pairs shared by the training and test reports.
metric_rows = (
    ("MSE:", mean_squared_error),
    ("MAE:", mean_absolute_error),
    ("R²:", r2_score),
)

print("=== SVR Training Metrics ===")
for metric_label, metric_fn in metric_rows:
    print(metric_label, metric_fn(y_train, y_train_pred_svr))

print("\n=== SVR Test Metrics ===")
for metric_label, metric_fn in metric_rows:
    print(metric_label, metric_fn(y_test, y_test_pred_svr))


Performance of SVR

In [None]:
# Measured vs. predicted compressive strength for each test sample, in test-set order.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(y_test.to_numpy(), marker='o', label='Actual')
ax.plot(y_test_pred_svr, marker='x', label='Predicted (SVR)')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Compressive Strength')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

grid search for SVR

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space (4 * 5 * 2 = 40 candidates).
param_grid = {
    'C': [1, 10, 100, 1000],                        # regularization parameter
    'gamma': ['scale', 'auto', 0.1, 0.01, 0.001],   # kernel coefficient
    'kernel': ['rbf', 'linear'],                    # kernel type
}

# Cross-validated search over a default-constructed SVR.
svr = SVR()
grid_search = GridSearchCV(
    estimator=svr,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_scaled, y_train)

print("Best hyperparameters found:", grid_search.best_params_)

# Model refitted with the best combination, and its predictions on both splits.
best_svr_model = grid_search.best_estimator_
y_train_pred_tuned = best_svr_model.predict(X_train_scaled)
y_test_pred_tuned = best_svr_model.predict(X_test_scaled)

# Label / function pairs shared by the training and test reports.
metric_rows = (
    ("MSE:", mean_squared_error),
    ("MAE:", mean_absolute_error),
    ("R²:", r2_score),
)

print("\n=== Tuned SVR Training Metrics ===")
for metric_label, metric_fn in metric_rows:
    print(metric_label, metric_fn(y_train, y_train_pred_tuned))

print("\n=== Tuned SVR Test Metrics ===")
for metric_label, metric_fn in metric_rows:
    print(metric_label, metric_fn(y_test, y_test_pred_tuned))

In [None]:
import shap

# --- Background data ---------------------------------------------------------
# KernelExplainer needs a background dataset; a random subset of the scaled
# training rows is used. The subset size is capped at the number of available
# rows, because sampling 100 rows without replacement raises a ValueError when
# the training set is smaller than that. A dedicated seeded generator makes the
# subset independent of whatever earlier cells did to NumPy's global RNG.
n_train_rows = X_train_scaled.shape[0]
shap_rng = np.random.default_rng(42)
background_idx = shap_rng.choice(n_train_rows, size=min(100, n_train_rows), replace=False)
X_train_scaled_subset = X_train_scaled[background_idx]

# --- SHAP values for the tuned SVR on the test set -----------------------------
explainer = shap.KernelExplainer(best_svr_model.predict, X_train_scaled_subset)
shap_values = explainer.shap_values(X_test_scaled)

shap_feature_names = list(X.columns)

# --- Global views ----------------------------------------------------------------
# Summary plot: impact of each feature on the model output, per test sample.
shap.summary_plot(shap_values, X_test_scaled, feature_names=shap_feature_names)
# Bar variant of the summary plot.
shap.summary_plot(shap_values, X_test, plot_type="bar")

# --- Single-prediction explanation ---------------------------------------------
# Force plot for one test sample. It is rendered through matplotlib because the
# interactive (JavaScript) version is only shown when it is the last expression
# of a cell, which it is not here.
sample_index = 0
shap.plots.force(
    explainer.expected_value,
    shap_values[sample_index],
    X_test_scaled[sample_index],
    feature_names=shap_feature_names,
    matplotlib=True,
)

# --- Dependence plot -------------------------------------------------------------
# SHAP value of one feature plotted against that feature's (scaled) value.
shap.dependence_plot("Number of days (testing)", shap_values, X_test_scaled, feature_names=shap_feature_names)

KNN

In [None]:
# Train KNN model
knn_model = KNeighborsRegressor(n_neighbors=1)
knn_model.fit(X_train_scaled, y_train)

# Predictions
y_train_pred_knn = knn_model.predict(X_train_scaled)
y_test_pred_knn = knn_model.predict(X_test_scaled)

# Evaluation
print("=== KNN Training Metrics ===")
print("MSE:", mean_squared_error(y_train, y_train_pred_knn))
print("MAE:", mean_absolute_error(y_train, y_train_pred_knn))
print("R²:", r2_score(y_train, y_train_pred_knn))

print("\n=== KNN Test Metrics ===")
print("MSE:", mean_squared_error(y_test, y_test_pred_knn))
print("MAE:", mean_absolute_error(y_test, y_test_pred_knn))
print("R²:", r2_score(y_test, y_test_pred_knn))

Performance of KNN

In [None]:
# Measured vs. predicted compressive strength for each test sample, in test-set order.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(y_test.to_numpy(), marker='o', label='Actual')
ax.plot(y_test_pred_knn, marker='x', label='Predicted (KNN)')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Compressive Strength')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

GRID SEARCH FOR KNN

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space (10 * 2 * 4 * 2 = 160 candidates).
param_grid = {
    'n_neighbors': [1, 3, 5, 7, 9, 11, 15, 20, 25, 30],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2],  # 1 = Manhattan distance, 2 = Euclidean distance
}

# Cross-validated search over a default-constructed KNN regressor.
knn = KNeighborsRegressor()

print("Performing Grid Search CV...")
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  # primary scoring metric
    cv=5,                              # 5-fold cross-validation
    verbose=1,                         # show progress
    n_jobs=-1,                         # use all available processors
)
grid_search.fit(X_train_scaled, y_train)

# Winning combination and its cross-validated score.
print("\n=== Best Parameters ===")
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score (negative MSE):", grid_search.best_score_)
print("Best cross-validation RMSE:", np.sqrt(-grid_search.best_score_))

# Model refitted with the best combination, and its predictions on both splits.
best_knn = grid_search.best_estimator_
y_train_pred_best = best_knn.predict(X_train_scaled)
y_test_pred_best = best_knn.predict(X_test_scaled)

# Label / function pairs shared by the training and test reports.
metric_rows = (
    ("MSE:", mean_squared_error),
    ("MAE:", mean_absolute_error),
    ("R²:", r2_score),
)

print("\n=== Best KNN Training Metrics ===")
for metric_label, metric_fn in metric_rows:
    print(metric_label, metric_fn(y_train, y_train_pred_best))

print("\n=== Best KNN Test Metrics ===")
for metric_label, metric_fn in metric_rows:
    print(metric_label, metric_fn(y_test, y_test_pred_best))

# Reference point: the untuned single-neighbour model, refitted here.
print("\n=== Comparison with Original Model ===")
original_knn = KNeighborsRegressor(n_neighbors=1)
original_knn.fit(X_train_scaled, y_train)
y_test_pred_original = original_knn.predict(X_test_scaled)

mse_original = mean_squared_error(y_test, y_test_pred_original)
mse_best = mean_squared_error(y_test, y_test_pred_best)
print("Original KNN (n_neighbors=1) Test MSE:", mse_original)
print("Best KNN Test MSE:", mse_best)
print("Improvement in MSE:", mse_original - mse_best)


Gaussian Process Regressor

In [None]:
# --- Fit -------------------------------------------------------------------
# Linear (dot-product) kernel plus a white-noise term.
kernel = DotProduct() + WhiteKernel()
gpr_model = GaussianProcessRegressor(kernel=kernel, random_state=42)
gpr_model.fit(X_train_scaled, y_train)

# Predictive mean and standard deviation for both splits.
y_train_pred_gpr, sigma_train = gpr_model.predict(X_train_scaled, return_std=True)
y_test_pred_gpr, sigma_test = gpr_model.predict(X_test_scaled, return_std=True)

# --- Metrics -----------------------------------------------------------------
metric_rows = (
    ("MSE:", mean_squared_error),
    ("MAE:", mean_absolute_error),
    ("R²:", r2_score),
)

print("=== Gaussian Process Regressor Training Metrics ===")
for metric_label, metric_fn in metric_rows:
    print(metric_label, metric_fn(y_train, y_train_pred_gpr))

print("\n=== Gaussian Process Regressor Test Metrics ===")
for metric_label, metric_fn in metric_rows:
    print(metric_label, metric_fn(y_test, y_test_pred_gpr))

# --- Measured vs. predicted on the test set -----------------------------------
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(y_test.to_numpy(), marker='o', label='Actual')
ax.plot(y_test_pred_gpr, marker='x', label='Predicted (GPR)')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Compressive Strength')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

# --- Predictions with a +/- 1 standard-deviation band ---------------------------
# All test samples are shown; narrow `subset_indices` if the plot gets crowded.
subset_indices = np.arange(len(y_test))
lower_band = y_test_pred_gpr - sigma_test
upper_band = y_test_pred_gpr + sigma_test

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(subset_indices, y_test, color='red', label='Actual')
ax.plot(subset_indices, y_test_pred_gpr, color='blue', label='Predicted (GPR)')
ax.fill_between(subset_indices, lower_band, upper_band, alpha=0.2, color='blue', label='Confidence Interval (1 std. dev.)')
ax.set_title('GPR Predictions with Uncertainty (Test Set)')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Compressive Strength')
ax.legend()
ax.grid(True)
plt.show()

Grid Search on Gaussian Process Regressor

In [None]:
from sklearn.gaussian_process.kernels import RBF, Matern, RationalQuadratic, WhiteKernel, ConstantKernel as C
from sklearn.model_selection import GridSearchCV

# Requires X_train_scaled, y_train, X_test_scaled and y_test from earlier cells.

# Candidate kernels: C(...) scales the magnitude, WhiteKernel models noise.
candidate_kernels = [
    C(1.0) * RBF(length_scale=1.0),
    C(1.0) * Matern(length_scale=1.0, nu=1.5),
    C(1.0) * RationalQuadratic(length_scale=1.0, alpha=1.0),
    C(1.0) * RBF(length_scale=1.0) + WhiteKernel(noise_level=1.0),
]

# Search space (4 kernels * 4 alphas = 16 candidates). `alpha` is added to the
# diagonal of the kernel matrix for regularization.
param_grid = {
    "kernel": candidate_kernels,
    "alpha": [1e-10, 1e-5, 1e-2, 0.1],
}

# Cross-validated search; each fit restarts the kernel optimizer 10 times.
gpr = GaussianProcessRegressor(n_restarts_optimizer=10, random_state=42)
grid_search = GridSearchCV(
    estimator=gpr,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_scaled, y_train)

print("Best hyperparameters found:", grid_search.best_params_)

# Model refitted with the best combination, and its predictions on both splits.
best_gpr_model = grid_search.best_estimator_
y_train_pred_tuned = best_gpr_model.predict(X_train_scaled)
y_test_pred_tuned = best_gpr_model.predict(X_test_scaled)

# Label / function pairs shared by the training and test reports.
metric_rows = (
    ("MSE:", mean_squared_error),
    ("MAE:", mean_absolute_error),
    ("R²:", r2_score),
)

print("\n=== Tuned GPR Training Metrics ===")
for metric_label, metric_fn in metric_rows:
    print(metric_label, metric_fn(y_train, y_train_pred_tuned))

print("\n=== Tuned GPR Test Metrics ===")
for metric_label, metric_fn in metric_rows:
    print(metric_label, metric_fn(y_test, y_test_pred_tuned))

Multi-Layer Perceptron (MLP) Regressor

In [None]:
# MLP Regressor (Neural Network)
# Only hidden_layer_sizes, max_iter and random_state differ from the scikit-learn
# defaults; the remaining arguments are spelled out for reference. Several of them
# are ignored with solver='adam' and are marked as such below.
mlp_model = MLPRegressor(hidden_layer_sizes=(100, 50),
                         activation='relu', # Rectified Linear Unit activation
                         solver='adam', # Adam optimizer
                         alpha=0.0001, # L2 penalty (regularization)
                         batch_size='auto', # Size of minibatches for stochastic optimizers
                         learning_rate='constant', # Learning rate schedule (only used when solver='sgd')
                         learning_rate_init=0.001, # Initial learning rate (used by 'sgd' and 'adam')
                         power_t=0.5, # Exponent for inverse scaling learning rate (only used when solver='sgd' with learning_rate='invscaling')
                         max_iter=500, # Maximum number of epochs
                         shuffle=True, # Shuffle samples in each iteration
                         random_state=42,
                         tol=1e-4, # Tolerance for optimization
                         verbose=False, # Whether to print progress messages
                         warm_start=False, # Reuse solution of previous call to fit
                         momentum=0.9, # Momentum for gradient descent update (only used when solver='sgd'; ignored by Adam)
                         nesterovs_momentum=True, # Whether to use Nesterov's momentum (only used when solver='sgd')
                         early_stopping=False, # Whether to use early stopping
                         validation_fraction=0.1, # Proportion of training data to set aside as validation set (only used if early_stopping=True)
                         beta_1=0.9, # Exponential decay rate for the first moment estimates (Adam)
                         beta_2=0.999, # Exponential decay rate for the second moment estimates (Adam)
                         epsilon=1e-8, # Value for numerical stability in Adam
                         n_iter_no_change=10, # Maximum number of epochs to not meet `tol` improvement
                         max_fun=15000) # Maximum number of loss function calls (only used when solver='lbfgs')

mlp_model.fit(X_train_scaled, y_train)

# Predictions
y_train_pred_mlp = mlp_model.predict(X_train_scaled)
y_test_pred_mlp = mlp_model.predict(X_test_scaled)

# Evaluation
print("=== MLP Regressor Training Metrics ===")
print("MSE:", mean_squared_error(y_train, y_train_pred_mlp))
print("MAE:", mean_absolute_error(y_train, y_train_pred_mlp))
print("R²:", r2_score(y_train, y_train_pred_mlp))

print("\n=== MLP Regressor Test Metrics ===")
print("MSE:", mean_squared_error(y_test, y_test_pred_mlp))
print("MAE:", mean_absolute_error(y_test, y_test_pred_mlp))
print("R²:", r2_score(y_test, y_test_pred_mlp))

# Performance of MLP: actual vs. predicted values, plotted in test-set order
plt.figure(figsize=(10, 5))
plt.plot(y_test.values, label='Actual', marker='o')
plt.plot(y_test_pred_mlp, label='Predicted (MLP)', marker='x')
# Title added so the figure stands alone when the notebook is skimmed
plt.title('MLP: Actual vs Predicted (Test Set)')
plt.xlabel('Sample Index')
plt.ylabel('Compressive Strength')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

Grid search for MLP

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Inputs expected from earlier cells: X_train_scaled, y_train, X_test_scaled, y_test.

# Search space: network architecture, L2 penalty (alpha) and initial learning rate.
# Activation and solver are pinned to a single choice each.
param_grid = dict(
    hidden_layer_sizes=[(50, 50), (100,), (100, 50)],
    activation=['relu'],
    solver=['adam'],
    alpha=[0.001, 0.01, 0.1],
    learning_rate_init=[0.001, 0.01],
)

# Base estimator; max_iter is raised to 1000 to give the optimizer room to converge.
mlp = MLPRegressor(max_iter=1000, random_state=42)

# 5-fold cross-validated exhaustive search, scored by negative MSE, on all cores.
grid_search = GridSearchCV(
    mlp,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_scaled, y_train)

print("Best hyperparameters found:", grid_search.best_params_)

# The winning configuration, as exposed by the fitted search object.
best_mlp_model = grid_search.best_estimator_

y_train_pred_tuned = best_mlp_model.predict(X_train_scaled)
y_test_pred_tuned = best_mlp_model.predict(X_test_scaled)

# Report MSE / MAE / R² for the training split first, then the test split.
for _split_header, _y_true, _y_hat in (
    ("\n=== Tuned MLP Regressor Training Metrics ===", y_train, y_train_pred_tuned),
    ("\n=== Tuned MLP Regressor Test Metrics ===", y_test, y_test_pred_tuned),
):
    print(_split_header)
    print("MSE:", mean_squared_error(_y_true, _y_hat))
    print("MAE:", mean_absolute_error(_y_true, _y_hat))
    print("R²:", r2_score(_y_true, _y_hat))

# Saving the Model

In [None]:
import pickle

In [None]:
# Persist every fitted model (baseline and grid-searched variants) plus the feature
# scaler. One mapping of output file name -> object replaces the repeated
# open()/pickle.dump() stanzas; files are written in the order listed.
# Because the mapping is built first, a model that was never trained raises a
# NameError before any file is written.
# NOTE: pickle files can execute arbitrary code when loaded - only load these
# artifacts from a trusted location.
models_to_save = {
    'linear_regression_model.pkl': lr_model,
    'ridge_model.pkl': ridge_model,
    'decision_tree_model.pkl': dt_model,
    'decision_tree_grid_search_model.pkl': best_dt_model,
    'random_forest_model.pkl': rf_model,
    'random_forest_grid_search_model.pkl': best_rf_model,
    'ada_boost_model.pkl': ada_model,
    'ada_boost_grid_search_model.pkl': best_ada_model,
    'gbr_model.pkl': gbr_model,
    'gbr_grid_search_model.pkl': best_gbr_model,
    'svr_model.pkl': svr_model,
    'svr_grid_search_model.pkl': best_svr_model,
    'gpr_model.pkl': gpr_model,
    'gpr_grid_search_model.pkl': best_gpr_model,
    'mlp_model.pkl': mlp_model,
    'mlp_grid_search_model.pkl': best_mlp_model,
    'knn_model.pkl': knn_model,
    'knn_grid_search_model.pkl': best_knn_model,
    # Save your scaler
    'scaler.pkl': scaler,
}

for _pkl_name, _fitted_obj in models_to_save.items():
    with open(_pkl_name, 'wb') as file:
        pickle.dump(_fitted_obj, file)

