<a href="https://colab.research.google.com/github/Sirfowahid/Iron_Slug/blob/main/Iron_Slug_Prediction_With_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; later cells read their spreadsheets
# from paths underneath /content/drive/MyDrive.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
! pip install gplearn

Collecting gplearn
  Downloading gplearn-0.4.2-py3-none-any.whl.metadata (4.3 kB)
Downloading gplearn-0.4.2-py3-none-any.whl (25 kB)
Installing collected packages: gplearn
Successfully installed gplearn-0.4.2


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import xgboost as xgb
from gplearn.genetic import SymbolicRegressor
import warnings
warnings.filterwarnings('ignore')

# CS workbook on the mounted Google Drive.
file_path = '/content/drive/MyDrive/Projects/26. Iron Slug/CS.xlsx'
data = pd.read_excel(file_path)

# Column labels are used verbatim for indexing, so the trailing spaces in some
# of them are significant (presumably they mirror the spreadsheet header).
features = [
    'Cement (kg/m3) ',
    'CA (kg/m3)',
    'FA (kg/m3)',
    'WIS (kg/m3)',
    'Water (kg/m3)',
    'W/C',
    'Concrete Age (Days)',
]
target = 'Measured CS (MPa) '

X, y = data[features], data[target]

# Split the data: 30 % of the rows are held out for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Define the scatter index function
def scatter_index(y_true, y_pred):
    """Return the scatter index: RMSE as a percentage of the mean observed value.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    float
        ``100 * RMSE(y_true, y_pred) / mean(y_true)``.
    """
    root_mse = np.sqrt(mean_squared_error(y_true, y_pred))
    return (root_mse / np.mean(y_true)) * 100

def fit_and_report(label, model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model``, predict on the evaluation data and print its metrics.

    Parameters
    ----------
    label : str
        Prefix used in every printed metric line.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X_fit, y_fit
        Data passed to ``model.fit``.
    X_eval, y_eval
        Data passed to ``model.predict`` and used for scoring.

    Returns
    -------
    The predictions for ``X_eval`` exactly as returned by ``model.predict``.
    """
    model.fit(X_fit, y_fit)
    y_pred = model.predict(X_eval)
    print(f"{label} - R²:", r2_score(y_eval, y_pred))
    print(f"{label} - RMSE:", np.sqrt(mean_squared_error(y_eval, y_pred)))
    print(f"{label} - MAE:", mean_absolute_error(y_eval, y_pred))
    print(f"{label} - Scatter Index:", scatter_index(y_eval, y_pred))
    return y_pred


# Every model is fitted on the training split and scored on the held-out split.

# 1. Support Vector Machine (SVM)
# NOTE(review): SVR is fitted on the raw, unscaled features; consider
# standardising the inputs if its score stays far below the other models.
svm = SVR()
y_pred_svm = fit_and_report("SVM", svm, X_train, y_train, X_test, y_test)

# 2. Gaussian Process Regressor (GPR)
# Only the predictive mean is scored, so the standard deviation is not requested.
kernel = C() * RBF()
gpr = GaussianProcessRegressor(kernel=kernel)
y_pred_gpr = fit_and_report("GPR", gpr, X_train, y_train, X_test, y_test)

# 3. Random Forest Regressor (seeded so the reported metrics are reproducible)
rf = RandomForestRegressor(random_state=42)
y_pred_rf = fit_and_report("RF", rf, X_train, y_train, X_test, y_test)

# 4. Gradient Boosting Regressor (seeded)
gbr = GradientBoostingRegressor(random_state=42)
y_pred_gbr = fit_and_report("GBR", gbr, X_train, y_train, X_test, y_test)

# 5. Decision Tree Regressor (seeded)
dt = DecisionTreeRegressor(random_state=42)
y_pred_dt = fit_and_report("DT", dt, X_train, y_train, X_test, y_test)

# 6. XGBoost Regressor
xg_reg = xgb.XGBRegressor()
y_pred_xg = fit_and_report("XGBoost", xg_reg, X_train, y_train, X_test, y_test)

# 7. Symbolic Regression
sr = SymbolicRegressor(population_size=1000, generations=20, tournament_size=20,
                       const_range=(0, 1), init_depth=(2, 6), init_method='half and half',
                       metric='mean absolute error', parsimony_coefficient=0.01, random_state=42)
y_pred_sr = fit_and_report("Symbolic Regression", sr, X_train, y_train, X_test, y_test)


SVM - R²: -0.09681050964527804
SVM - RMSE: 2.731258986381626
SVM - MAE: 2.3038544723272643
SVM - Scatter Index: 14.466864860098969
GPR - R²: 0.8621267892781315
GPR - RMSE: 0.9683604114630929
GPR - MAE: 0.7666176470588235
GPR - Scatter Index: 5.129187410771951
RF - R²: 0.9005074161125965
RF - RMSE: 0.8226070677242315
RF - MAE: 0.6694408029878581
RF - Scatter Index: 4.357164714538692
GBR - R²: 0.8551478412155855
GBR - RMSE: 0.9925663956873203
GBR - MAE: 0.7828314351184543
GBR - Scatter Index: 5.257401067669245
DT - R²: 0.852653061494441
DT - RMSE: 1.001077360823299
DT - MAE: 0.7907843137254904
DT - Scatter Index: 5.302481736717894
XGBoost - R²: 0.8527387758347799
XGBoost - RMSE: 1.0007861461943013
XGBoost - MAE: 0.7904696026970359
XGBoost - Scatter Index: 5.300939238293541
Symbolic Regression - R²: 0.35147753139461
Symbolic Regression - RMSE: 2.1001948791366734
Symbolic Regression - MAE: 1.8163093717738337
Symbolic Regression - Scatter Index: 11.124260148098907


In [None]:
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import xgboost as xgb
from gplearn.genetic import SymbolicRegressor
import warnings
warnings.filterwarnings('ignore')

# Load data
# CS workbook on the mounted Google Drive.
file_path = '/content/drive/MyDrive/Projects/26. Iron Slug/CS.xlsx'
data = pd.read_excel(file_path)

# Column labels are used verbatim for indexing, so the trailing spaces in some
# of them are significant (presumably they mirror the spreadsheet header).
features = [
    'Cement (kg/m3) ',
    'CA (kg/m3)',
    'FA (kg/m3)',
    'WIS (kg/m3)',
    'Water (kg/m3)',
    'W/C',
    'Concrete Age (Days)',
]
target = 'Measured CS (MPa) '

# No train/test split in this cell: the models below use every row.
X, y = data[features], data[target]

# Define the scatter index function
def scatter_index(y_true, y_pred):
    """Return the scatter index: RMSE as a percentage of the mean observed value.

    The result is scaled by 100 so that it is expressed in percent, matching
    the other ``scatter_index`` definitions in this notebook; without the
    factor the same metric label would be reported on two different scales.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    float
        ``100 * RMSE(y_true, y_pred) / mean(y_true)``.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mean_true = np.mean(y_true)
    scatter_idx = (rmse / mean_true) * 100
    return scatter_idx

def fit_and_report(label, model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model``, predict on the evaluation data and print its metrics.

    Parameters
    ----------
    label : str
        Prefix used in every printed metric line.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X_fit, y_fit
        Data passed to ``model.fit``.
    X_eval, y_eval
        Data passed to ``model.predict`` and used for scoring.

    Returns
    -------
    The predictions for ``X_eval`` exactly as returned by ``model.predict``.
    """
    model.fit(X_fit, y_fit)
    y_pred = model.predict(X_eval)
    print(f"{label} - R²:", r2_score(y_eval, y_pred))
    print(f"{label} - RMSE:", np.sqrt(mean_squared_error(y_eval, y_pred)))
    print(f"{label} - MAE:", mean_absolute_error(y_eval, y_pred))
    print(f"{label} - Scatter Index:", scatter_index(y_eval, y_pred))
    return y_pred


# Every model is fitted on the full data set and scored on those same rows,
# so the metrics printed by this cell are in-sample (not held-out) figures.

# 1. Support Vector Machine (SVM)
# NOTE(review): SVR is fitted on the raw, unscaled features; consider
# standardising the inputs if its score stays far below the other models.
svm = SVR()
y_pred_svm = fit_and_report("SVM", svm, X, y, X, y)

# 2. Gaussian Process Regressor (GPR)
# Only the predictive mean is scored, so the standard deviation is not requested.
kernel = C() * RBF()
gpr = GaussianProcessRegressor(kernel=kernel)
y_pred_gpr = fit_and_report("GPR", gpr, X, y, X, y)

# 3. Random Forest Regressor (seeded so the reported metrics are reproducible)
rf = RandomForestRegressor(random_state=42)
y_pred_rf = fit_and_report("RF", rf, X, y, X, y)

# 4. Gradient Boosting Regressor (seeded)
gbr = GradientBoostingRegressor(random_state=42)
y_pred_gbr = fit_and_report("GBR", gbr, X, y, X, y)

# 5. Decision Tree Regressor (seeded)
dt = DecisionTreeRegressor(random_state=42)
y_pred_dt = fit_and_report("DT", dt, X, y, X, y)

# 6. XGBoost Regressor
xg_reg = xgb.XGBRegressor()
y_pred_xg = fit_and_report("XGBoost", xg_reg, X, y, X, y)

# 7. Symbolic Regression
sr = SymbolicRegressor(population_size=1000, generations=20, tournament_size=20,
                       const_range=(0, 1), init_depth=(2, 6), init_method='half and half',
                       metric='mean absolute error', parsimony_coefficient=0.01, random_state=42)
y_pred_sr = fit_and_report("Symbolic Regression", sr, X, y, X, y)


SVM - R²: 0.029566442108723945
SVM - RMSE: 3.205059679318137
SVM - MAE: 2.6173020570063903
SVM - Scatter Index: 0.17355974784777123
GPR - R²: 0.9522383009778231
GPR - RMSE: 0.7110384747520954
GPR - MAE: 0.5526600864955359
GPR - Scatter Index: 0.03850401263488858
RF - R²: 0.9516444277915076
RF - RMSE: 0.7154453757383663
RF - MAE: 0.5585553836837766
RF - Scatter Index: 0.03874265425173673
GBR - R²: 0.9520707824352939
GBR - RMSE: 0.7122843252057343
GBR - MAE: 0.5573093899127252
GBR - Scatter Index: 0.03857147767835873
DT - R²: 0.9522383261209192
DT - RMSE: 0.7110382875967873
DT - MAE: 0.5525892857142858
DT - Scatter Index: 0.03850400250009194
XGBoost - R²: 0.9522383170843105
XGBoost - RMSE: 0.7110383548617503
XGBoost - MAE: 0.552609078543527
XGBoost - Scatter Index: 0.0385040061426107
Symbolic Regression - R²: 0.47395767461918814
Symbolic Regression - RMSE: 2.3597367816674124
Symbolic Regression - MAE: 2.0690132820594846
Symbolic Regression - Scatter Index: 0.12778399212224292


In [None]:
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import xgboost as xgb
from gplearn.genetic import SymbolicRegressor
import warnings
warnings.filterwarnings('ignore')

# Load data
# Read the CS workbook from the mounted Drive folder, the same location the
# earlier CS cells use. A bare 'CS.xlsx' only resolves when the file has been
# uploaded to the working directory by hand, which breaks a fresh run.
file_path = '/content/drive/MyDrive/Projects/26. Iron Slug/CS.xlsx'
data = pd.read_excel(file_path)

# Column labels are used verbatim for indexing, so the trailing spaces in some
# of them are significant (presumably they mirror the spreadsheet header).
features = ['Cement (kg/m3) ', 'CA (kg/m3)', 'FA (kg/m3)', 'WIS (kg/m3)',
            'Water (kg/m3)', 'W/C', 'Concrete Age (Days)']
target = 'Measured CS (MPa) '

X = data[features]
y = data[target]

# Define the scatter index function
def scatter_index(y_true, y_pred):
    """Return the scatter index: RMSE as a percentage of the mean observed value.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    float
        ``100 * RMSE(y_true, y_pred) / mean(y_true)``.
    """
    root_mse = np.sqrt(mean_squared_error(y_true, y_pred))
    return (root_mse / np.mean(y_true)) * 100

# Train models and make predictions
# The tree-based scikit-learn estimators draw random numbers while fitting, so
# they are seeded (as the SymbolicRegressor already is) to make the exported
# predictions and the printed metrics reproducible across re-runs.
models = {
    'SVM': SVR(),
    'GPR': GaussianProcessRegressor(kernel=C() * RBF()),
    'RF': RandomForestRegressor(random_state=42),
    'GBR': GradientBoostingRegressor(random_state=42),
    'DT': DecisionTreeRegressor(random_state=42),
    'XGBoost': xgb.XGBRegressor(),
    'Symbolic Regression': SymbolicRegressor(population_size=1000, generations=20, tournament_size=20,
                                             const_range=(0, 1), init_depth=(2, 6), init_method='half and half',
                                             metric='mean absolute error', parsimony_coefficient=0.01, random_state=42)
}

# One entry per model, in the same order as `models`; each value is replaced
# by that model's predictions once it has been fitted.
predictions = dict.fromkeys(models)

for model_name, model in models.items():
    # Fit on the full data set and predict those same rows (in-sample).
    model.fit(X, y)
    predictions[model_name] = y_pred = model.predict(X)

# Create a DataFrame with actual and predicted values:
# the measured target first, then one "<model> Prediction" column per model.
prediction_columns = {
    f'{model_name} Prediction': pred
    for model_name, pred in predictions.items()
}
results_df = pd.DataFrame({'Actual': y, **prediction_columns})

# Save to Excel (single sheet, row index omitted)
output_file = 'model_predictions.xlsx'
results_df.to_excel(output_file, index=False)

print(f"Results have been saved to {output_file}")

# Print metrics for each model
# Metric label -> callable(y_true, y_pred); insertion order is the print order.
metric_functions = {
    "R²": r2_score,
    "RMSE": lambda true, pred: np.sqrt(mean_squared_error(true, pred)),
    "MAE": mean_absolute_error,
    "Scatter Index": scatter_index,
}

for model_name, y_pred in predictions.items():
    for metric_name, metric in metric_functions.items():
        print(f"{model_name} - {metric_name}:", metric(y, y_pred))
    # Blank line between models.
    print()


Results have been saved to model_predictions.xlsx
SVM - R²: 0.029566442108723945
SVM - RMSE: 3.205059679318137
SVM - MAE: 2.6173020570063903
SVM - Scatter Index: 17.35597478477712

GPR - R²: 0.951208635582489
GPR - RMSE: 0.7186620287778832
GPR - MAE: 0.5693750000000002
GPR - Scatter Index: 3.891684180089684

RF - R²: 0.9515470387485896
RF - RMSE: 0.7161654736509435
RF - MAE: 0.5581712372448979
RF - Scatter Index: 3.8781648849228656

GBR - R²: 0.9520707824352939
GBR - RMSE: 0.7122843252057343
GBR - MAE: 0.5573093899127252
GBR - Scatter Index: 3.857147767835873

DT - R²: 0.9522383261209192
DT - RMSE: 0.7110382875967873
DT - MAE: 0.5525892857142859
DT - Scatter Index: 3.8504002500091943

XGBoost - R²: 0.9522383170843105
XGBoost - RMSE: 0.7110383548617503
XGBoost - MAE: 0.552609078543527
XGBoost - Scatter Index: 3.85040061426107

Symbolic Regression - R²: 0.47395767461918814
Symbolic Regression - RMSE: 2.3597367816674124
Symbolic Regression - MAE: 2.0690132820594846
Symbolic Regression - S

In [None]:
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import xgboost as xgb
from gplearn.genetic import SymbolicRegressor
import warnings
warnings.filterwarnings('ignore')

# Load data
# STS workbook on the mounted Google Drive.
file_path = '/content/drive/MyDrive/Projects/26. Iron Slug/STS.xlsx'
data = pd.read_excel(file_path)

# Display the column labels; the names listed in `features` / `target` in the
# next cell must match these strings exactly.
data.columns


Index(['Cement (kg/m3) ', 'CA (kg/m3)', 'FA (kg/m3)', 'WIS (kg/m3)',
       'Water (kg/m3)', 'W/C', 'Concrete Age (Days)', 'Measured STS (MPa) '],
      dtype='object')

In [None]:

# Input columns and target column, spelled exactly as in `data.columns`
# (the trailing spaces in some labels are part of the names).
features = [
    'Cement (kg/m3) ',
    'CA (kg/m3)',
    'FA (kg/m3)',
    'WIS (kg/m3)',
    'Water (kg/m3)',
    'W/C',
    'Concrete Age (Days)',
]
target = 'Measured STS (MPa) '

X, y = data[features], data[target]

# Define the scatter index function
def scatter_index(y_true, y_pred):
    """Return the scatter index: RMSE as a percentage of the mean observed value.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    float
        ``100 * RMSE(y_true, y_pred) / mean(y_true)``.
    """
    root_mse = np.sqrt(mean_squared_error(y_true, y_pred))
    return (root_mse / np.mean(y_true)) * 100

# Train models and make predictions
# The tree-based scikit-learn estimators draw random numbers while fitting, so
# they are seeded (as the SymbolicRegressor already is) to make the exported
# predictions and the printed metrics reproducible across re-runs.
models = {
    'SVM': SVR(),
    'GPR': GaussianProcessRegressor(kernel=C() * RBF()),
    'RF': RandomForestRegressor(random_state=42),
    'GBR': GradientBoostingRegressor(random_state=42),
    'DT': DecisionTreeRegressor(random_state=42),
    'XGBoost': xgb.XGBRegressor(),
    'Symbolic Regression': SymbolicRegressor(population_size=1000, generations=20, tournament_size=20,
                                             const_range=(0, 1), init_depth=(2, 6), init_method='half and half',
                                             metric='mean absolute error', parsimony_coefficient=0.01, random_state=42)
}

# One entry per model, in the same order as `models`; each value is replaced
# by that model's predictions once it has been fitted.
predictions = dict.fromkeys(models)

for model_name, model in models.items():
    # Fit on the full data set and predict those same rows (in-sample).
    model.fit(X, y)
    predictions[model_name] = y_pred = model.predict(X)

# Create a DataFrame with actual and predicted values:
# the measured target first, then one "<model> Prediction" column per model.
prediction_columns = {
    f'{model_name} Prediction': pred
    for model_name, pred in predictions.items()
}
results_df = pd.DataFrame({'Actual': y, **prediction_columns})

# Save to Excel (single sheet, row index omitted)
output_file = 'model_predictions_sts.xlsx'
results_df.to_excel(output_file, index=False)

print(f"Results have been saved to {output_file}")

# Print metrics for each model
# Metric label -> callable(y_true, y_pred); insertion order is the print order.
metric_functions = {
    "R²": r2_score,
    "RMSE": lambda true, pred: np.sqrt(mean_squared_error(true, pred)),
    "MAE": mean_absolute_error,
    "Scatter Index": scatter_index,
}

for model_name, y_pred in predictions.items():
    for metric_name, metric in metric_functions.items():
        print(f"{model_name} - {metric_name}:", metric(y, y_pred))
    # Blank line between models.
    print()


Results have been saved to model_predictions_sts.xlsx
SVM - R²: 0.13524555876159128
SVM - RMSE: 0.23727117361899797
SVM - MAE: 0.1861798453695604
SVM - Scatter Index: 14.32735143698931

GPR - R²: 0.8178556944217639
GPR - RMSE: 0.1088946082884359
GPR - MAE: 0.08205356917210989
GPR - Scatter Index: 6.57547774870866

RF - R²: 0.8161890691685364
RF - RMSE: 0.10939166813469975
RF - MAE: 0.08253074520717374
RF - Scatter Index: 6.605492145291338

GBR - R²: 0.8177110433151029
GBR - RMSE: 0.10893783939427593
GBR - MAE: 0.08226351529956884
GBR - Scatter Index: 6.578088210135273

DT - R²: 0.8178556944218588
DT - RMSE: 0.10889460828840754
DT - MAE: 0.08205357142857142
DT - Scatter Index: 6.5754777487069465

XGBoost - R²: 0.8178537296124084
XGBoost - RMSE: 0.10889519561552101
XGBoost - MAE: 0.0820671217782157
XGBoost - Scatter Index: 6.575513213790357

Symbolic Regression - R²: -0.0011057194695069938
Symbolic Regression - RMSE: 0.2552927897268741
Symbolic Regression - MAE: 0.19795639768328016
Symbo