In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from snapml import SnapBoostingMachineRegressor
from sklearn.metrics import mean_squared_error
import joblib

# Load dataset
df = pd.read_csv("transformed_land_mines.csv")

# Prepare features and target
X = df.drop(columns=["M", "V"])
y = df["M"]

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into train-test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# ---------------------- #
# Model 1: Random Forest #
# ---------------------- #
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_pred)
rf_cv_mse = -np.mean(cross_val_score(rf, X_train, y_train, cv=5, scoring='neg_mean_squared_error'))

print(f"[Random Forest] CV MSE: {rf_cv_mse:.4f}, Test MSE: {rf_mse:.4f}")
joblib.dump(rf, "random_forest_model.pkl")

# ----------------------- #
# Model 2: Extra Trees    #
# ----------------------- #
et = ExtraTreesRegressor(n_estimators=100, random_state=42)
et.fit(X_train, y_train)
et_pred = et.predict(X_test)
et_mse = mean_squared_error(y_test, et_pred)
et_cv_mse = -np.mean(cross_val_score(et, X_train, y_train, cv=5, scoring='neg_mean_squared_error'))

print(f"[Extra Trees] CV MSE: {et_cv_mse:.4f}, Test MSE: {et_mse:.4f}")
joblib.dump(et, "extra_trees_model.pkl")

# -------------------------------------- #
# Model 3: Snap Boosting Machine (IBM)   #
# -------------------------------------- #
snap = SnapBoostingMachineRegressor(num_round=100, learning_rate=0.1, max_depth=6, random_state=42)
snap.fit(X_train, y_train.values)  # Convert y_train to a NumPy array
snap_pred = snap.predict(X_test)
snap_mse = mean_squared_error(y_test, snap_pred)

# Manual cross-validation for SnapML
snap_cv_scores = []
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for train_idx, val_idx in kf.split(X_train):
    X_cv_train, X_cv_val = X_train[train_idx], X_train[val_idx]
    y_cv_train, y_cv_val = y_train.values[train_idx], y_train.values[val_idx]  # Convert to NumPy arrays

    snap_cv = SnapBoostingMachineRegressor(
        num_round=100,
        learning_rate=0.1,
        max_depth=6,
        random_state=42
    )
    snap_cv.fit(X_cv_train, y_cv_train)
    snap_cv_pred = snap_cv.predict(X_cv_val)
    snap_cv_scores.append(mean_squared_error(y_cv_val, snap_cv_pred))

snap_cv_mse = np.mean(snap_cv_scores)
print(f"[Snap Boosting Machine] CV MSE: {snap_cv_mse:.4f}, Test MSE: {snap_mse:.4f}")
joblib.dump(snap, "snap_boosting_model.pkl")


[Random Forest] CV MSE: 1.5382, Test MSE: 1.3941
[Extra Trees] CV MSE: 1.6033, Test MSE: 1.2386
[Snap Boosting Machine] CV MSE: 1.7984, Test MSE: 1.7250


['snap_boosting_model.pkl']

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import accuracy_score, log_loss
import joblib

# Load dataset
df = pd.read_csv("transformed_land_mines.csv")

# Prepare features and target
X = df.drop(columns=["M", "V"])
y = df["V"]  # Assuming "V" is the target variable for classification

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into train-validation-test sets
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Define the XGBoost classifier
xg_clf = xgb.XGBClassifier(
    objective='binary:logistic',  # Use 'multi:softprob' for multi-class classification
    learning_rate=0.1,
    max_depth=6,
    n_estimators=1000,
    random_state=42
)

# Train the model with early stopping
eval_set = [(X_val, y_val)]
xg_clf.fit(
    X_train, y_train,
    eval_set=eval_set,
    eval_metric='logloss',
    early_stopping_rounds=10,
    verbose=True
)

# Evaluate the model
xg_pred = xg_clf.predict(X_test)
xg_accuracy = accuracy_score(y_test, xg_pred)
xg_logloss = log_loss(y_test, xg_clf.predict_proba(X_test))

# Cross-validation
xg_cv_scores = []
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for train_idx, val_idx in kf.split(X_train):
    X_cv_train, X_cv_val = X_train[train_idx], X_train[val_idx]
    y_cv_train, y_cv_val = y_train.values[train_idx], y_train.values[val_idx]

    xg_cv = xgb.XGBClassifier(
        objective='binary:logistic',
        learning_rate=0.1,
        max_depth=6,
        n_estimators=xg_clf.best_ntree_limit,  # Use the best number of trees found
        random_state=42
    )
    xg_cv.fit(X_cv_train, y_cv_train)
    xg_cv_pred = xg_cv.predict_proba(X_cv_val)
    xg_cv_scores.append(log_loss(y_cv_val, xg_cv_pred))

xg_cv_logloss = np.mean(xg_cv_scores)
print(f"[XGBoost Classifier with Early Stopping] CV Log Loss: {xg_cv_logloss:.4f}, Test Accuracy: {xg_accuracy:.4f}, Test Log Loss: {xg_logloss:.4f}")
joblib.dump(xg_clf, "xgboost_classifier_early_stopping_model.pkl")


TypeError: XGBClassifier.fit() got an unexpected keyword argument 'eval_metric'

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error
import joblib

# Load dataset
df = pd.read_csv("transformed_land_mines.csv")

# Prepare features and target
X = df.drop(columns=["M", "V"])
y = df["M"]

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into train-validation-test sets
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Early stopping parameters
n_estimators = 100
tolerance = 5  # Number of iterations to wait before stopping
min_delta = 0.001  # Minimum change to qualify as an improvement

# Initialize variables for early stopping
best_score = np.inf
best_iteration = 0
patience = 0

# Train the ExtraTreesRegressor with early stopping
et = ExtraTreesRegressor(warm_start=True, random_state=42)

for i in range(1, n_estimators + 1):
    et.set_params(n_estimators=i)
    et.fit(X_train, y_train)

    # Predict on the validation set
    val_pred = et.predict(X_val)
    val_mse = mean_squared_error(y_val, val_pred)

    # Check for improvement
    if best_score - val_mse > min_delta:
        best_score = val_mse
        best_iteration = i
        patience = 0
    else:
        patience += 1

    # Early stopping condition
    if patience >= tolerance:
        print(f"Early stopping at iteration {i} with best iteration {best_iteration}")
        break

# Train the final model with the best number of estimators
et_final = ExtraTreesRegressor(n_estimators=best_iteration, random_state=42)
et_final.fit(X_train, y_train)

# Evaluate the final model
et_pred = et_final.predict(X_test)
et_mse = mean_squared_error(y_test, et_pred)

# Cross-validation
et_cv_scores = []
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for train_idx, val_idx in kf.split(X_train):
    X_cv_train, X_cv_val = X_train[train_idx], X_train[val_idx]
    y_cv_train, y_cv_val = y_train.values[train_idx], y_train.values[val_idx]

    et_cv = ExtraTreesRegressor(n_estimators=best_iteration, random_state=42)
    et_cv.fit(X_cv_train, y_cv_train)
    et_cv_pred = et_cv.predict(X_cv_val)
    et_cv_scores.append(mean_squared_error(y_cv_val, et_cv_pred))

et_cv_mse = np.mean(et_cv_scores)
print(f"[Extra Trees with Early Stopping] CV MSE: {et_cv_mse:.4f}, Test MSE: {et_mse:.4f}")
joblib.dump(et_final, "extra_trees_early_stopping_model.pkl")


Early stopping at iteration 8 with best iteration 3
[Extra Trees with Early Stopping] CV MSE: 1.9768, Test MSE: 1.8322


['extra_trees_early_stopping_model.pkl']

In [9]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesClassifier, RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import accuracy_score, mean_squared_error
import joblib

# Load dataset
df = pd.read_csv("transformed_land_mines.csv")

# Prepare features and target
X = df.drop(columns=["M", "V"])
y_classification = df["V"].apply(lambda x: 1 if x > 0 else 0)  # Ensure binary classification
y_regression = df["M"]  # Target for regression

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into train-validation-test sets
X_train, X_temp, y_train_class, y_temp_class = train_test_split(X_scaled, y_classification, test_size=0.3, random_state=42)
X_val, X_test, y_val_class, y_test_class = train_test_split(X_temp, y_temp_class, test_size=0.5, random_state=42)

X_train, X_temp, y_train_reg, y_temp_reg = train_test_split(X_scaled, y_regression, test_size=0.3, random_state=42)
X_val, X_test, y_val_reg, y_test_reg = train_test_split(X_temp, y_temp_reg, test_size=0.5, random_state=42)

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters for XGBoost Classifier
    xgb_params = {
        'objective': 'binary:logistic',
        'learning_rate': trial.suggest_loguniform('xgb_learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('xgb_max_depth', 3, 10),
        'n_estimators': trial.suggest_int('xgb_n_estimators', 50, 500),
        'random_state': 42
    }

    dtrain = xgb.DMatrix(X_train, label=y_train_class)
    dval = xgb.DMatrix(X_val, label=y_val_class)
    evals = [(dtrain, 'train'), (dval, 'eval')]
    evals_result = {}

    xg_clf = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=xgb_params['n_estimators'],
        evals=evals,
        evals_result=evals_result,
        early_stopping_rounds=10,
        verbose_eval=False
    )

    xg_pred = xg_clf.predict(xgb.DMatrix(X_test))
    xg_pred_binary = [1 if pred > 0.5 else 0 for pred in xg_pred]
    xg_accuracy = accuracy_score(y_test_class, xg_pred_binary)

    # Suggest hyperparameters for Extra Trees Classifier
    et_params = {
        'n_estimators': trial.suggest_int('et_n_estimators', 50, 500),
        'max_depth': trial.suggest_int('et_max_depth', 3, 10),
        'random_state': 42
    }

    et_clf = ExtraTreesClassifier(**et_params)
    et_clf.fit(X_train, y_train_class)
    et_pred = et_clf.predict(X_test)
    et_accuracy = accuracy_score(y_test_class, et_pred)

    # Suggest hyperparameters for Random Forest Regressor
    rf_params = {
        'n_estimators': trial.suggest_int('rf_n_estimators', 50, 500),
        'max_depth': trial.suggest_int('rf_max_depth', 3, 10),
        'random_state': 42
    }

    rf_reg = RandomForestRegressor(**rf_params)
    rf_reg.fit(X_train, y_train_reg)
    rf_pred = rf_reg.predict(X_test)
    rf_mse = mean_squared_error(y_test_reg, rf_pred)

    # Combine metrics for optimization
    combined_metric = xg_accuracy + et_accuracy - rf_mse
    return combined_metric

# Create a study and optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Output the best trial
trial = study.best_trial
print("Best Trial:")
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# Save the best models
best_xgb_params = {
    'objective': 'binary:logistic',
    'learning_rate': trial.params['xgb_learning_rate'],
    'max_depth': trial.params['xgb_max_depth'],
    'n_estimators': trial.params['xgb_n_estimators'],
    'random_state': 42
}
best_xg_clf = xgb.train(
    best_xgb_params,
    xgb.DMatrix(X_train, label=y_train_class),
    num_boost_round=best_xgb_params['n_estimators']
)
joblib.dump(best_xg_clf, "best_xgboost_classifier.pkl")

best_et_params = {
    'n_estimators': trial.params['et_n_estimators'],
    'max_depth': trial.params['et_max_depth'],
    'random_state': 42
}
best_et_clf = ExtraTreesClassifier(**best_et_params)
best_et_clf.fit(X_train, y_train_class)
joblib.dump(best_et_clf, "best_extra_trees_classifier.pkl")

best_rf_params = {
    'n_estimators': trial.params['rf_n_estimators'],
    'max_depth': trial.params['rf_max_depth'],
    'random_state': 42
}
best_rf_reg = RandomForestRegressor(**best_rf_params)
best_rf_reg.fit(X_train, y_train_reg)
joblib.dump(best_rf_reg, "best_random_forest_regressor.pkl")


[I 2025-04-08 06:24:50,987] A new study created in memory with name: no-name-6cec9a82-7c3d-44e5-bc09-54512edf3974
  'learning_rate': trial.suggest_loguniform('xgb_learning_rate', 0.01, 0.3),
Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()
[W 2025-04-08 06:24:50,992] Trial 0 failed with parameters: {'xgb_learning_rate': 0.14451110017480837, 'xgb_max_depth': 9, 'xgb_n_estimators': 457} because of the following error: XGBoostError('[06:24:50] /workspace/src/objective/./regression_loss.h:69: Check failed: base_score > 0.0f && base_score < 1.0f: base_score must be in (0,1) for logistic loss, got: 1\nStack trace:\n  [bt] (0) /workspaces/FDS_MiniProject/fds/lib/python3.12/site-packages/xgboost/lib/libxgboost.so(+0x2a6acc) [0x769e700a6acc]\n  [bt] (1) /workspaces/FDS_MiniProject/fds/lib/python3.12/site-packages/xgboost/lib/libxgboost.so(+0xed8329) [0x769e70cd8329]\n  [bt] (2) /workspaces/FDS_MiniProject/fds/lib/python3.12/site-packages/xgboost/l

[W 2025-04-08 06:24:50,997] Trial 0 failed with value None.


XGBoostError: [06:24:50] /workspace/src/objective/./regression_loss.h:69: Check failed: base_score > 0.0f && base_score < 1.0f: base_score must be in (0,1) for logistic loss, got: 1
Stack trace:
  [bt] (0) /workspaces/FDS_MiniProject/fds/lib/python3.12/site-packages/xgboost/lib/libxgboost.so(+0x2a6acc) [0x769e700a6acc]
  [bt] (1) /workspaces/FDS_MiniProject/fds/lib/python3.12/site-packages/xgboost/lib/libxgboost.so(+0xed8329) [0x769e70cd8329]
  [bt] (2) /workspaces/FDS_MiniProject/fds/lib/python3.12/site-packages/xgboost/lib/libxgboost.so(+0x681223) [0x769e70481223]
  [bt] (3) /workspaces/FDS_MiniProject/fds/lib/python3.12/site-packages/xgboost/lib/libxgboost.so(+0x6815ec) [0x769e704815ec]
  [bt] (4) /workspaces/FDS_MiniProject/fds/lib/python3.12/site-packages/xgboost/lib/libxgboost.so(+0x68bb3b) [0x769e7048bb3b]
  [bt] (5) /workspaces/FDS_MiniProject/fds/lib/python3.12/site-packages/xgboost/lib/libxgboost.so(XGBoosterUpdateOneIter+0x77) [0x769e6ffb6ba7]
  [bt] (6) /lib/x86_64-linux-gnu/libffi.so.7(+0x6ff5) [0x769ec02e0ff5]
  [bt] (7) /lib/x86_64-linux-gnu/libffi.so.7(+0x640a) [0x769ec02e040a]
  [bt] (8) /home/codespace/.python/current/lib/python3.12/lib-dynload/_ctypes.cpython-312-x86_64-linux-gnu.so(+0x13a46) [0x769ec0303a46]

