Training: XGBoost

In [1]:
import pandas as pd
import time
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

# Load dataset
df = pd.read_csv("transformed_land_mines.csv")

# Prepare data
X = df.drop(columns=["M","V"])
y = df["M"]-1

# Split into train-test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = XGBClassifier(
    eval_metric="logloss",  
    booster="dart",  # Dropout-based boosting  
    n_estimators=1000,  
    max_depth=10,  
    learning_rate=0.7,  
    subsample=0.8,  
    colsample_bytree=0.8,  
    reg_lambda=18,
    reg_alpha=0.6,  # L2 regularization term (adjust this value as needed)
)



# Train
start_time = time.time()
model.fit(X_train, y_train)
end_time = time.time()

# Evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"✅ XGBoost Accuracy: {accuracy:.4f}")
print(f"🕒 Training Time: {end_time - start_time:.2f} seconds")
print(classification_report(y_test, y_pred))

# Save model
joblib.dump(model, "xgboost_model.pkl")


✅ XGBoost Accuracy: 0.5294
🕒 Training Time: 52.80 seconds
              precision    recall  f1-score   support

           0       0.58      0.64      0.61        11
           1       0.92      0.92      0.92        12
           2       0.54      0.44      0.48        16
           3       0.30      0.55      0.39        11
           4       0.45      0.28      0.34        18

    accuracy                           0.53        68
   macro avg       0.56      0.56      0.55        68
weighted avg       0.55      0.53      0.53        68



['xgboost_model.pkl']

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import numpy as np
import joblib

# Load dataset
df = pd.read_csv("transformed_land_mines.csv")

# Prepare data
X = df.drop(columns=["M", "V"])
y = df["M"]

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into train-test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Transform features into polynomial features
poly = PolynomialFeatures(degree=3)  # Experimenting with a higher degree
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Train polynomial regression model with regularization (Ridge regression)
ridge = Ridge(alpha=1.0)  # Regularization strength can be tuned
ridge.fit(X_train_poly, y_train)

# Cross-validation for better generalization
cv_scores = cross_val_score(ridge, X_train_poly, y_train, cv=5, scoring='neg_mean_squared_error')
cv_mse = -np.mean(cv_scores)

# Predict and evaluate
y_pred = ridge.predict(X_test_poly)
mse = mean_squared_error(y_test, y_pred)

print(f"Cross-Validation Mean Squared Error: {cv_mse:.4f}")
print(f"Test Mean Squared Error: {mse:.4f}")

# Save model and scaler
joblib.dump(ridge, "ridge_polynomial_regression_model.pkl")
joblib.dump(scaler, "scaler.pkl")


Cross-Validation Mean Squared Error: 2.1186
Test Mean Squared Error: 1.7505


['scaler.pkl']

In [None]:
import optuna
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import make_scorer, mean_squared_error
import numpy as np

# Load dataset
df = pd.read_csv("transformed_land_mines.csv")

# Prepare data
X = df.drop(columns=["M", "V"])
y = df["M"]

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into train-test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    alpha = trial.suggest_loguniform('alpha', 1e-3, 10)  # Regularization strength
    degree = trial.suggest_int('degree', 2, 5)  # Polynomial degree

    # Transform features into polynomial features
    poly = PolynomialFeatures(degree=degree)
    X_train_poly = poly.fit_transform(X_train)
    
    # Ridge regression
    model = Ridge(alpha=alpha)
    
    # Cross-validation
    cv_scores = cross_val_score(
        model, X_train_poly, y_train, cv=5, scoring=make_scorer(mean_squared_error, greater_is_better=False)
    )
    return -np.mean(cv_scores)  # Minimize the negative MSE

# Run Optuna optimization
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Best hyperparameters
print("Best hyperparameters:")
print(study.best_params)

# Train final model with best parameters
best_params = study.best_params
poly = PolynomialFeatures(degree=best_params['degree'])
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

final_model = Ridge(alpha=best_params['alpha'])
final_model.fit(X_train_poly, y_train)

# Evaluate on the test set
y_pred = final_model.predict(X_test_poly)
test_mse = mean_squared_error(y_test, y_pred)

print(f"Test Mean Squared Error: {test_mse:.4f}")


[I 2025-03-25 09:50:36,271] A new study created in memory with name: no-name-c65edbb6-daec-4041-9b56-aea56167abb2
  alpha = trial.suggest_loguniform('alpha', 1e-3, 10)  # Regularization strength
[I 2025-03-25 09:50:36,284] Trial 0 finished with value: 2.1022404827753 and parameters: {'alpha': 3.386731936774999, 'degree': 3}. Best is trial 0 with value: 2.1022404827753.
  alpha = trial.suggest_loguniform('alpha', 1e-3, 10)  # Regularization strength
[I 2025-03-25 09:50:36,307] Trial 1 finished with value: 2.1748679768540606 and parameters: {'alpha': 4.194861223459453, 'degree': 5}. Best is trial 0 with value: 2.1022404827753.
  alpha = trial.suggest_loguniform('alpha', 1e-3, 10)  # Regularization strength
[I 2025-03-25 09:50:36,327] Trial 2 finished with value: 2.124896453814168 and parameters: {'alpha': 0.23368617932740898, 'degree': 3}. Best is trial 0 with value: 2.1022404827753.
  alpha = trial.suggest_loguniform('alpha', 1e-3, 10)  # Regularization strength
[I 2025-03-25 09:50:36,3

Best hyperparameters:
{'alpha': 9.901804286039509, 'degree': 2}
Test Mean Squared Error: 1.7777


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib

# Load dataset
df = pd.read_csv("transformed_land_mines.csv")

# Prepare data
X = df.drop(columns=["M", "V"])
y = df["M"]

# Split into train-test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train Random Forest Regressor
rf_model = RandomForestRegressor(
    n_estimators=100,  # Number of trees
    max_depth=10,      # Maximum depth of each tree
    min_samples_split=2,  # Minimum samples to split an internal node
    min_samples_leaf=1,   # Minimum samples at leaf nodes
    random_state=42
)
rf_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = rf_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")

# Save model
joblib.dump(rf_model, "random_forest_model.pkl")


Mean Squared Error: 1.3462


['random_forest_model.pkl']

In [None]:
import pandas as pd
import optuna
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Load dataset
df = pd.read_csv("land_mines_dataset.csv")

# Prepare data
X = df.drop(columns=["M"])
y = df["M"] - 1

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split into train-test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define Optuna objective function for LightGBM
def objective(trial):
    # Suggest hyperparameters
    num_leaves = trial.suggest_int("num_leaves", 10, 100)
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    max_depth = trial.suggest_int("max_depth", -1, 50)  # -1 means no limit
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-4, 1e-1)
    min_child_samples = trial.suggest_int("min_child_samples", 5, 50)
    subsample = trial.suggest_float("subsample", 0.5, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1.0)

    # Create and evaluate LightGBM model
    model = LGBMClassifier(
        num_leaves=num_leaves,
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        min_child_samples=min_child_samples,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        random_state=42
    )
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy")
    return scores.mean()  # Return mean cross-validation accuracy

# Create study and optimize
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

# Print best hyperparameters
print("Best hyperparameters:", study.best_params)

# Train model with best hyperparameters
best_params = study.best_params
model = LGBMClassifier(**best_params, random_state=42)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"✅ Optimized LightGBM Accuracy: {accuracy:.4f}")

# Save model
import joblib
joblib.dump(model, "optimized_lightgbm.pkl")


In [9]:
import pandas as pd
import time
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Load dataset
df = pd.read_csv("transformed_land_mines.csv")

# Prepare data
X = df.drop(columns=["M", "V"])
y = df["M"] - 1

# Split into train-test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize AdaBoost model with a base estimator
base_estimator = DecisionTreeClassifier(max_depth=7)  # Base weak learner
model = AdaBoostClassifier(
    estimator=base_estimator,
    n_estimators=500,
    learning_rate=0.8
)

# Train
start_time = time.time()
model.fit(X_train, y_train)
end_time = time.time()

# Evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"✅ AdaBoost Accuracy: {accuracy:.4f}")
print(f"🕒 Training Time: {end_time - start_time:.2f} seconds")
print(classification_report(y_test, y_pred))

# Save model
joblib.dump(model, "adaboost_model.pkl")


✅ AdaBoost Accuracy: 0.5735
🕒 Training Time: 0.95 seconds
              precision    recall  f1-score   support

           0       0.67      0.91      0.77        11
           1       0.92      0.92      0.92        12
           2       0.47      0.44      0.45        16
           3       0.35      0.55      0.43        11
           4       0.56      0.28      0.37        18

    accuracy                           0.57        68
   macro avg       0.59      0.62      0.59        68
weighted avg       0.58      0.57      0.56        68



['adaboost_model.pkl']

In [11]:
import pandas as pd
import time
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import PolynomialFeatures

# Load dataset
df = pd.read_csv("transformed_land_mines.csv")

# Prepare data
X = df.drop(columns=["M", "V"])
y = df["M"] - 1

# Split into train-test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply Polynomial Feature Engineering (degree=2, only interactions)
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Apply SMOTE for class balancing
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_poly, y_train)

# Initialize XGBoost model with DART boosting and better hyperparameters
model = XGBClassifier(
    eval_metric="logloss",
    booster="gbtree",  # Dropout-based boosting
    n_estimators=300,  # More trees
    max_depth=7,  # Deeper trees
    learning_rate=0.03,  # Lower LR for better learning
    subsample=0.8,  # Use 80% of data per tree
    colsample_bytree=0.8,  # Use 80% of features per tree
)

# Train
start_time = time.time()
model.fit(X_train_resampled, y_train_resampled)
end_time = time.time()

# Evaluate
y_pred = model.predict(X_test_poly)
accuracy = accuracy_score(y_test, y_pred)

print(f"✅ XGBoost Accuracy: {accuracy:.4f}")
print(f"🕒 Training Time: {end_time - start_time:.2f} seconds")
print(classification_report(y_test, y_pred))

# Save model
joblib.dump(model, "xgboost_model.pkl")


✅ XGBoost Accuracy: 0.5441
🕒 Training Time: 0.40 seconds
              precision    recall  f1-score   support

           0       0.69      0.82      0.75        11
           1       0.85      0.92      0.88        12
           2       0.47      0.50      0.48        16
           3       0.33      0.45      0.38        11
           4       0.40      0.22      0.29        18

    accuracy                           0.54        68
   macro avg       0.55      0.58      0.56        68
weighted avg       0.53      0.54      0.53        68



['xgboost_model.pkl']

CatBoost

In [1]:
import pandas as pd
import time
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from catboost import CatBoostClassifier

# Load dataset
df = pd.read_csv("transformed_land_mines.csv")

# Prepare data
X = df.drop(columns=["M", "V"])
y = df["M"] - 1

# Split into train-test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize model
model = CatBoostClassifier(iterations=250, eval_metric="Accuracy", verbose=50)

# Train
start_time = time.time()
model.fit(X_train, y_train)
end_time = time.time()

# Evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"✅ CatBoost Accuracy: {accuracy:.4f}")
print(f"🕒 Training Time: {end_time - start_time:.2f} seconds")
print(classification_report(y_test, y_pred))

# Save model
joblib.dump(model, "catboost_model.pkl")


Learning rate set to 0.238816
0:	learn: 0.5592593	total: 53.4ms	remaining: 13.3s
50:	learn: 0.8592593	total: 111ms	remaining: 432ms
100:	learn: 0.9703704	total: 157ms	remaining: 231ms
150:	learn: 0.9925926	total: 207ms	remaining: 135ms
200:	learn: 1.0000000	total: 254ms	remaining: 62ms
249:	learn: 1.0000000	total: 298ms	remaining: 0us
✅ CatBoost Accuracy: 0.5882
🕒 Training Time: 0.38 seconds
              precision    recall  f1-score   support

           0       0.71      0.91      0.80        11
           1       0.85      0.92      0.88        12
           2       0.53      0.50      0.52        16
           3       0.40      0.55      0.46        11
           4       0.45      0.28      0.34        18

    accuracy                           0.59        68
   macro avg       0.59      0.63      0.60        68
weighted avg       0.58      0.59      0.57        68



['catboost_model.pkl']

Logistic Regression

In [2]:
import pandas as pd
import time
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
df = pd.read_csv("transformed_land_mines.csv")

# Prepare data
X = df.drop(columns=["M", "V_log"])
y = df["M"] - 1  # Adjust target variable if necessary

# Split into train-test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize Logistic Regression model
model = LogisticRegression(max_iter=1000, solver="lbfgs", multi_class="auto")

# Train
start_time = time.time()
model.fit(X_train, y_train)
end_time = time.time()

# Evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"✅ Logistic Regression Accuracy: {accuracy:.4f}")
print(f"🕒 Training Time: {end_time - start_time:.2f} seconds")
print(classification_report(y_test, y_pred))

# Save model
joblib.dump(model, "logistic_regression_model.pkl")
print("🎉 Model saved as 'logistic_regression_model.pkl'")


✅ Logistic Regression Accuracy: 0.4412
🕒 Training Time: 0.01 seconds
              precision    recall  f1-score   support

           0       0.52      0.87      0.65        15
           1       0.88      1.00      0.93        14
           2       0.25      0.15      0.19        13
           3       0.00      0.00      0.00        13
           4       0.08      0.08      0.08        13

    accuracy                           0.44        68
   macro avg       0.34      0.42      0.37        68
weighted avg       0.36      0.44      0.39        68

🎉 Model saved as 'logistic_regression_model.pkl'




Training: Random Forest

In [None]:
import pandas as pd
import time
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
df = pd.read_csv("transformed_land_mines.csv")

# Prepare data
X = df.drop(columns=["M","V"])
y = df["M"]-1

# Split into train-test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize model
model = RandomForestClassifier(n_estimators=250)

# Train
start_time = time.time()
model.fit(X_train, y_train)
end_time = time.time()

# Evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"✅ RandomForest Accuracy: {accuracy:.4f}")
print(f"🕒 Training Time: {end_time - start_time:.2f} seconds")
print(classification_report(y_test, y_pred))

# Save model
joblib.dump(model, "randomforest_model.pkl")


✅ RandomForest Accuracy: 0.5294
🕒 Training Time: 0.29 seconds
              precision    recall  f1-score   support

           0       0.67      0.91      0.77        11
           1       0.85      0.92      0.88        12
           2       0.42      0.31      0.36        16
           3       0.38      0.55      0.44        11
           4       0.33      0.22      0.27        18

    accuracy                           0.53        68
   macro avg       0.53      0.58      0.54        68
weighted avg       0.50      0.53      0.51        68



['randomforest_model.pkl']

Training: Extra Trees

In [7]:
import pandas as pd
import time
import joblib
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
df = pd.read_csv("transformed_land_mines.csv")

# Prepare data
X = df.drop(columns=["M","V"])
y = df["M"]-1

# Split into train-test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize model
model = lgb.LGBMClassifier(n_estimators=300)

# Train
start_time = time.time()
model.fit(X_train, y_train)
end_time = time.time()

# Evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"✅ LightGBM Accuracy: {accuracy:.4f}")
print(f"🕒 Training Time: {end_time - start_time:.2f} seconds")
print(classification_report(y_test, y_pred))

# Save model
joblib.dump(model, "lightgbm_model.pkl")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000021 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 90
[LightGBM] [Info] Number of data points in the train set: 270, number of used features: 3
[LightGBM] [Info] Start training from score -1.504077
[LightGBM] [Info] Start training from score -1.537979
[LightGBM] [Info] Start training from score -1.686399
[LightGBM] [Info] Start training from score -1.591089
[LightGBM] [Info] Start training from score -1.748274
✅ LightGBM Accuracy: 0.6176
🕒 Training Time: 0.18 seconds
              precision    recall  f1-score   support

           0       0.75      0.82      0.78        11
           1       0.85      0.92      0.88        12
           2       0.59      0.62      0.61        16
           3       0.38      0.45      0.42        11
           4       0.54      0.39      0.45        18

    accuracy                           0.62        68
   macro

['lightgbm_model.pkl']

Training: Extra Trees

In [8]:
import pandas as pd
import time
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
df = pd.read_csv("transformed_land_mines.csv")

# Prepare data
X = df.drop(columns=["M","V"])
y = df["M"]-1

# Split into train-test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize model
model = ExtraTreesClassifier(n_estimators=250, random_state=42)

# Train
start_time = time.time()
model.fit(X_train, y_train)
end_time = time.time()

# Evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"✅ ExtraTrees Accuracy: {accuracy:.4f}")
print(f"🕒 Training Time: {end_time - start_time:.2f} seconds")
print(classification_report(y_test, y_pred))

# Save model
joblib.dump(model, "extratrees_model.pkl")


✅ ExtraTrees Accuracy: 0.4706
🕒 Training Time: 0.24 seconds
              precision    recall  f1-score   support

           0       0.62      0.91      0.74        11
           1       0.85      0.92      0.88        12
           2       0.25      0.19      0.21        16
           3       0.36      0.45      0.40        11
           4       0.23      0.17      0.19        18

    accuracy                           0.47        68
   macro avg       0.46      0.53      0.49        68
weighted avg       0.43      0.47      0.44        68



['extratrees_model.pkl']

Training: SVM

In [32]:
import pandas as pd
import time
import joblib
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
df = pd.read_csv("transformed_land_mines.csv")

# Prepare data
X = df.drop(columns=["M","V"])
y = df["M"]-1

# Split into train-test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize model
model = SVC(kernel="rbf", probability=True)

# Train
start_time = time.time()
model.fit(X_train, y_train)
end_time = time.time()

# Evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"✅ SVM Accuracy: {accuracy:.4f}")
print(f"🕒 Training Time: {end_time - start_time:.2f} seconds")
print(classification_report(y_test, y_pred))

# Save model
joblib.dump(model, "svm_model.pkl")


✅ SVM Accuracy: 0.3676
🕒 Training Time: 0.02 seconds
              precision    recall  f1-score   support

           0       0.24      0.82      0.37        11
           1       1.00      0.92      0.96        12
           2       0.20      0.12      0.15        16
           3       0.50      0.09      0.15        11
           4       0.29      0.11      0.16        18

    accuracy                           0.37        68
   macro avg       0.44      0.41      0.36        68
weighted avg       0.42      0.37      0.33        68



['svm_model.pkl']

LightGBM with Optuna

In [None]:
import optuna
import lightgbm as lgb
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import pandas as pd

df = pd.read_csv("transformed_land_mines.csv")
# Prepare data
X = df.drop(columns=["M","V"])
y = df["M"]-1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define objective function
def objective(trial):
    params = {
        'objective': 'multiclass',
        'num_class': len(set(y)),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.03, 0.3),
        'num_leaves': trial.suggest_int('num_leaves', 50, 200),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
    }
    
    model = lgb.LGBMClassifier(**params)
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    return cross_val_score(model, X_train, y_train, cv=kf, scoring='accuracy').mean()

# Run Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Train final model
best_params = study.best_params
model = lgb.LGBMClassifier(**best_params)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print(f"Final Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Best Parameters:", best_params)


XGBoost with Optuna

XGBoost-2

In [13]:
import optuna
import pandas as pd
import time
import joblib
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

# Load dataset
df = pd.read_csv("transformed_land_mines.csv")

# Prepare data
X = df.drop(columns=["M", "V"])
y = df["M"] - 1

# Split into train-test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define Optuna objective function
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
    }

    model = XGBClassifier(eval_metric="logloss", **params)
    
    # Use Stratified K-Fold cross-validation
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    scores = []
    for train_idx, val_idx in kf.split(X_train, y_train):
        X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        model.fit(X_train_fold, y_train_fold)
        y_pred_fold = model.predict(X_val_fold)
        scores.append(accuracy_score(y_val_fold, y_pred_fold))
    
    return sum(scores) / len(scores)

# Run Optuna hyperparameter tuning
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=25)  # Reduced trials for faster tuning

# Train final model with best parameters
best_params = study.best_params
model = XGBClassifier(eval_metric="logloss", **best_params)

start_time = time.time()
model.fit(X_train, y_train)
end_time = time.time()

# Evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"✅ XGBoost Accuracy: {accuracy:.4f}")
print(f"🕒 Training Time: {end_time - start_time:.2f} seconds")
print("Best Parameters:", best_params)
print(classification_report(y_test, y_pred))

# Save model
joblib.dump(model, "xgboost_optimized.pkl")


[I 2025-03-24 16:08:42,354] A new study created in memory with name: no-name-ec2f5ecb-3a96-4b91-86fd-629da7a7ce1b
[I 2025-03-24 16:08:42,946] Trial 0 finished with value: 0.47407407407407404 and parameters: {'n_estimators': 262, 'max_depth': 12, 'learning_rate': 0.08191234991587981, 'min_child_weight': 9, 'subsample': 0.5609289280168472, 'colsample_bytree': 0.9957820201645056, 'lambda': 0.0236145022073437, 'alpha': 2.0673503150017725e-08, 'booster': 'gbtree'}. Best is trial 0 with value: 0.47407407407407404.
[I 2025-03-24 16:08:46,072] Trial 1 finished with value: 0.4851851851851852 and parameters: {'n_estimators': 111, 'max_depth': 9, 'learning_rate': 0.04808861551365078, 'min_child_weight': 10, 'subsample': 0.8881956238107318, 'colsample_bytree': 0.8543781209700614, 'lambda': 0.05768598893849582, 'alpha': 2.3624256991243655e-07, 'booster': 'dart'}. Best is trial 1 with value: 0.4851851851851852.
[I 2025-03-24 16:08:57,194] Trial 2 finished with value: 0.5074074074074074 and parameter

✅ XGBoost Accuracy: 0.5147
🕒 Training Time: 0.23 seconds
Best Parameters: {'n_estimators': 299, 'max_depth': 10, 'learning_rate': 0.03946174871215509, 'min_child_weight': 5, 'subsample': 0.8992495783237072, 'colsample_bytree': 0.7879434563155175, 'lambda': 0.0005141790009235495, 'alpha': 0.6403997245259925, 'booster': 'gbtree'}
              precision    recall  f1-score   support

           0       0.71      0.80      0.75        15
           1       0.88      1.00      0.93        14
           2       0.15      0.15      0.15        13
           3       0.62      0.38      0.48        13
           4       0.14      0.15      0.15        13

    accuracy                           0.51        68
   macro avg       0.50      0.50      0.49        68
weighted avg       0.51      0.51      0.51        68



['xgboost_optimized.pkl']