In [17]:
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
import shap

In [35]:
from google.colab import files
uploaded = files.upload()  # Opens a dialog for you to select files

# After uploading, load the file into a DataFrame
import pandas as pd
df = pd.read_excel("ProjecT dataset.xlsx")  # Replace with your actual file name
df = df.dropna(axis=1, how='all')  # Drops columns with all values as NaN (empty columns)

# Check the remaining columns to verify unwanted columns are removed
print("Columns after dropping empty ones:", df.columns)

Saving ProjecT dataset.xlsx to ProjecT dataset (2).xlsx
Columns after dropping empty ones: Index(['sampleno.', 'PKS', 'Rice additives', 'RESIN', 'CaCo3', 'Steel',
       'graphite', 'sSiC', 'Al2O3', 'Wheat', 'Nile rose', 'Carbon', 'IRON',
       'Coconut shell', 'Wear Rate(g/km)'],
      dtype='object')


In [36]:
# Define X by dropping the "Sample Number" column
X = df.drop(columns=["Wear Rate(g/km)"])
y = df["Wear Rate(g/km)"]

In [37]:
# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
# Standardize the features to improve model performance
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [39]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Step 1: Initialize the Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42)

# Step 2: Define the hyperparameter grid for tuning
param_grid_rf = {
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Step 3: Set up RandomizedSearchCV for tuning
random_search_rf = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_grid_rf,
    n_iter=50,  # Number of parameter settings to sample
    scoring='neg_mean_absolute_error',  # MAE for evaluation
    cv=5,  # 5-fold cross-validation
    random_state=42,
    n_jobs=-1  # Use all cores
)

# Fit RandomizedSearchCV to find the best parameters
random_search_rf.fit(X_train, y_train)

# Step 4: Print the best parameters and score
print("Best Parameters for Random Forest:", random_search_rf.best_params_)
print("Best MAE (negated) for Random Forest:", -random_search_rf.best_score_)

# Step 5: Evaluate the best Random Forest model on the test set
best_rf_model = random_search_rf.best_estimator_
y_pred_rf = best_rf_model.predict(X_test)

# Calculate evaluation metrics
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Random Forest Model MAE: {mae_rf}")
print(f"Random Forest Model MSE: {mse_rf}")
print(f"Random Forest Model R-squared (R²): {r2_rf}")


Best Parameters for Random Forest: {'n_estimators': 50, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 10, 'bootstrap': False}
Best MAE (negated) for Random Forest: 0.6908097142857146
Random Forest Model MAE: 1.3386593333333343
Random Forest Model MSE: 3.632859679871113
Random Forest Model R-squared (R²): 0.4282438500940765


In [40]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Step 1: Generate synthetic samples by duplicating and adding noise
def augment_with_noise(X, y, noise_level=0.01, n_samples=100):
    # Randomly select samples to duplicate
    X_dup = X.sample(n=n_samples, replace=True, random_state=42).reset_index(drop=True)
    y_dup = y.sample(n=n_samples, replace=True, random_state=42).reset_index(drop=True)

    # Add Gaussian noise to each feature
    X_dup = X_dup + np.random.normal(0, noise_level * X.std(), X_dup.shape)
    y_dup = y_dup + np.random.normal(0, noise_level * y.std(), y_dup.shape)

    # Concatenate original and synthetic samples
    X_augmented = pd.concat([X, X_dup], axis=0).reset_index(drop=True)
    y_augmented = pd.concat([y, y_dup], axis=0).reset_index(drop=True)

    return X_augmented, y_augmented

# Apply noise-based augmentation
X_augmented, y_augmented = augment_with_noise(X, y, noise_level=0.01, n_samples=100)

# Step 2: Split augmented data into train and test sets
X_train_aug, X_test_aug, y_train_aug, y_test_aug = train_test_split(X_augmented, y_augmented, test_size=0.2, random_state=42)

# Step 3: Standardize the augmented features
scaler = StandardScaler()
X_train_aug_scaled = scaler.fit_transform(X_train_aug)
X_test_aug_scaled = scaler.transform(X_test_aug)

# Step 4: Train the Random Forest model on the augmented dataset
model_augmented = RandomForestRegressor(random_state=42)
model_augmented.fit(X_train_aug_scaled, y_train_aug)

# Step 5: Evaluate the model on the test set
y_pred_aug = model_augmented.predict(X_test_aug_scaled)
mae_aug = mean_absolute_error(y_test_aug, y_pred_aug)
mse_aug = mean_squared_error(y_test_aug, y_pred_aug)
r2_aug = r2_score(y_test_aug, y_pred_aug)

print(f"Augmented Model MAE: {mae_aug}")
print(f"Augmented Model MSE: {mse_aug}")
print(f"Augmented Model R-squared (R²): {r2_aug}")

Augmented Model MAE: 0.2821601321930761
Augmented Model MSE: 0.19225571209948192
Augmented Model R-squared (R²): 0.9386178558399393


In [41]:
import random

# Define new input with 'sampleno' and relevant feature values
new_input = pd.DataFrame([{
    "sampleno.": random.randint(1, 5),  # Random number between 1 and 5
    "PKS": 30,  # Example values; replace with actual input
    "Rice additives": 5,
    "RESIN": 20,
    "CaCo3": 15,
    "Steel": 10,
    "graphite": 8,
    "sSiC": 12,
    "Al2O3": 3,
    "Wheat": 6,
    "Nile rose": 2,
    "Carbon": 9,
    "IRON": 7,
    "Coconut shell": 4
}])

# Standardize and make prediction
new_input_scaled = scaler.transform(new_input)
predicted_wear_rate = model_augmented.predict(new_input_scaled)

# Display the prediction
print("Predicted Wear Rate:", predicted_wear_rate[0])


Predicted Wear Rate: 3.6175610455100293
