In [13]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from catboost import CatBoostRegressor
import difflib

In [15]:
def relative_absolute_error(y_true, y_pred):
    return np.sum(np.abs(y_true - y_pred)) / np.sum(np.abs(y_true - np.mean(y_true)))

def root_relative_squared_error(y_true, y_pred):
    return np.sqrt(np.sum((y_true - y_pred) ** 2) / np.sum((y_true - np.mean(y_true)) ** 2))

def willmotts_index_of_agreement(y_true, y_pred):
    return 1 - np.sum((y_true - y_pred) ** 2) / np.sum((np.abs(y_true - np.mean(y_true)) + np.abs(y_pred - np.mean(y_true))) ** 2)

def nash_sutcliffe_efficiency(y_true, y_pred):
    return 1 - np.sum((y_true - y_pred) ** 2) / np.sum((y_true - np.mean(y_true)) ** 2)

In [17]:
df = pd.read_csv(r'D:\2025 Work\Harish Panghal\Worked Data\HP Data worked data 1\Data.csv', encoding='ISO-8859-1')

In [19]:
print(df.columns.tolist())

['NFA (kg/m³)', 'NCA (kg/m³)', 'ATRCA (kg/m³)', 'HTRCA (kg/m³)', 'CTRCA (kg/m³)', 'ACTRCA (kg/m³)', 'HATRCA (kg/m³)', 'HCTRCA (kg/m³)', 'TTRCA (kg/m³)', 'Curing Day', 'Cement (kg/m³)', 'W/C Ratio', 'Admixture (kg/m³)', 'Compressive strength  (MPa) \n']


In [21]:
df.columns = df.columns.str.strip()  
print(df.columns.tolist())

['NFA (kg/m³)', 'NCA (kg/m³)', 'ATRCA (kg/m³)', 'HTRCA (kg/m³)', 'CTRCA (kg/m³)', 'ACTRCA (kg/m³)', 'HATRCA (kg/m³)', 'HCTRCA (kg/m³)', 'TTRCA (kg/m³)', 'Curing Day', 'Cement (kg/m³)', 'W/C Ratio', 'Admixture (kg/m³)', 'Compressive strength  (MPa)']


In [23]:
print(difflib.get_close_matches('Compressive strength (MPa)', df.columns, n=3))

['Compressive strength  (MPa)']


In [25]:
X = df.drop(columns=['Compressive strength  (MPa)'])

In [27]:
y = df['Compressive strength  (MPa)']

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [31]:
catboost_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('catboost', CatBoostRegressor(learning_rate=0.1, depth=6, iterations=100, random_state=42, verbose=0))  # Default parameters
])

In [33]:
param_grid = {
    'catboost__iterations': [50, 100, 200],
    'catboost__learning_rate': [0.01, 0.05, 0.1],
    'catboost__depth': [3, 6, 10],
    'catboost__l2_leaf_reg': [1, 3, 5],
    'catboost__border_count': [32, 50, 100]
}

In [35]:
grid_search = GridSearchCV(catboost_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')

In [37]:
grid_search.fit(X_train, y_train)

In [38]:
print(f"Best parameters found: {grid_search.best_params_}")

Best parameters found: {'catboost__border_count': 100, 'catboost__depth': 3, 'catboost__iterations': 200, 'catboost__l2_leaf_reg': 1, 'catboost__learning_rate': 0.1}


In [39]:
train_predictions = grid_search.predict(X_train)
test_predictions = grid_search.predict(X_test)

In [40]:
train_errors = train_predictions - y_train
test_errors = test_predictions - y_test

In [41]:
train_r2 = r2_score(y_train, train_predictions)
train_rmse = np.sqrt(mean_squared_error(y_train, train_predictions))
train_mae = mean_absolute_error(y_train, train_predictions)
train_rae = relative_absolute_error(y_train, train_predictions)
train_rrse = root_relative_squared_error(y_train, train_predictions)
train_wi = willmotts_index_of_agreement(y_train, train_predictions)
train_nse = nash_sutcliffe_efficiency(y_train, train_predictions)

In [42]:
test_r2 = r2_score(y_test, test_predictions)
test_rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
test_mae = mean_absolute_error(y_test, test_predictions)
test_rae = relative_absolute_error(y_test, test_predictions)
test_rrse = root_relative_squared_error(y_test, test_predictions)
test_wi = willmotts_index_of_agreement(y_test, test_predictions)
test_nse = nash_sutcliffe_efficiency(y_test, test_predictions)

In [43]:
print("\nTraining Stage Performance Metrics:")
print(f"R²: {train_r2:.4f}")
print(f"RMSE: {train_rmse:.4f}")
print(f"MAE: {train_mae:.4f}")
print(f"RAE: {train_rae:.4f}")
print(f"RRSE: {train_rrse:.4f}")
print(f"WI: {train_wi:.4f}")
print(f"NSE: {train_nse:.4f}")
print(f"Training Error (Mean): {np.mean(train_errors):.4f}")
print(f"Training Error (Standard Deviation): {np.std(train_errors):.4f}")


Training Stage Performance Metrics:
R²: 0.9891
RMSE: 0.7516
MAE: 0.5233
RAE: 0.0881
RRSE: 0.1044
WI: 0.9972
NSE: 0.9891
Training Error (Mean): -0.0005
Training Error (Standard Deviation): 0.7516


In [44]:
print("\nTesting Stage Performance Metrics:")
print(f"R²: {test_r2:.4f}")
print(f"RMSE: {test_rmse:.4f}")
print(f"MAE: {test_mae:.4f}")
print(f"RAE: {test_rae:.4f}")
print(f"RRSE: {test_rrse:.4f}")
print(f"WI: {test_wi:.4f}")
print(f"NSE: {test_nse:.4f}")
print(f"Testing Error (Mean): {np.mean(test_errors):.4f}")
print(f"Testing Error (Standard Deviation): {np.std(test_errors):.4f}")


Testing Stage Performance Metrics:
R²: 0.9776
RMSE: 1.1007
MAE: 0.7910
RAE: 0.1285
RRSE: 0.1496
WI: 0.9941
NSE: 0.9776
Testing Error (Mean): 0.1023
Testing Error (Standard Deviation): 1.0960


In [45]:
train_results = pd.DataFrame({
    'TrueValues': y_train,
    'PredictedValues': train_predictions,
    'Error': train_errors
})
test_results = pd.DataFrame({
    'TrueValues': y_test,
    'PredictedValues': test_predictions,
    'Error': test_errors
})