In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import joblib
from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import GridSearchCV


In [6]:
##" Import dataset and split for training"

# Step 2: Load combined features and labels
X = np.load('../data/features_combined.npy')
y = pd.read_csv('../data/labels.csv')  # assuming binary classification labels (0 or 1)

# Step 3: Handle missing values (NaNs)
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Step 4: Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 5: Train/Test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.75, random_state=42, stratify=y
)

In [5]:
# --- Random Forest hyperparameter grid ---
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False]
}

rf = RandomForestClassifier(random_state=42)
rf_grid_search = GridSearchCV(
    estimator=rf,
    param_grid=rf_param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2
)

rf_grid_search.fit(X_train, y_train)
print("Best Random Forest hyperparameters:", rf_grid_search.best_params_)


Fitting 3 folds for each of 810 candidates, totalling 2430 fits


810 fits failed out of a total of 2430.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
411 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\shadr\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\shadr\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "c:\Users\shadr\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\shadr\AppData\Local\Programs\Python\Python312\

Best Random Forest hyperparameters: {'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}


In [6]:
best_rf = rf_grid_search.best_estimator_
rf_preds = best_rf.predict(X_test)
print("Random Forest optimized results:")
print("Accuracy:", accuracy_score(y_test, rf_preds))
print(classification_report(y_test, rf_preds))

joblib.dump(best_rf, "../models/random_forest_optimized.pkl")

Random Forest optimized results:
Accuracy: 0.801292861157954
              precision    recall  f1-score   support

           0       0.78      0.84      0.81     12304
           1       0.83      0.76      0.80     12602

    accuracy                           0.80     24906
   macro avg       0.80      0.80      0.80     24906
weighted avg       0.80      0.80      0.80     24906



['../models/random_forest_optimized.pkl']

In [7]:
import joblib
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score


# Load the model from the file
loaded_model = joblib.load("../models/random_forest_optimized.pkl")

# Now you can use the loaded model to make predictions
rf_preds = loaded_model.predict(X_test)
rf_probs = loaded_model.predict_proba(X_test)[:, 1]  # Probability for positive class

print(rf_preds)

print("Random Forest optimized results:")
print("Accuracy:", accuracy_score(y_test, rf_preds))
print("AUC (AUROC):", roc_auc_score(y_test, rf_probs))
print(classification_report(y_test, rf_preds))


[0 0 0 ... 0 0 0]
Random Forest optimized results:
Accuracy: 0.801292861157954
AUC (AUROC): 0.8832673595424922
              precision    recall  f1-score   support

           0       0.78      0.84      0.81     12304
           1       0.83      0.76      0.80     12602

    accuracy                           0.80     24906
   macro avg       0.80      0.80      0.80     24906
weighted avg       0.80      0.80      0.80     24906



In [None]:

# For RandomForest
rf_preds = best_rf.predict(X_test)
rf_probs = best_rf.predict_proba(X_test)[:, 1]  # Probability for positive class

print("Random Forest optimized results:")
print("Accuracy:", accuracy_score(y_test, rf_preds))
print("AUC (AUROC):", roc_auc_score(y_test, rf_probs))
print(classification_report(y_test, rf_preds))

In [11]:
# --- XGBoost hyperparameter randomsearch


xgb_param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.5, 0.7, 1.0],
    'colsample_bytree': [0.5, 0.7, 1.0],
    'gamma': [0, 0.1, 0.3],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2]
}




# Step 3: Initialize XGBClassifier
xgb = XGBClassifier(
    objective='binary:logistic',
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1
)


xgb_random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=xgb_param_dist,
    n_iter=20,               # number of parameter settings to try
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42
)
# Step 5: Fit RandomizedSearchCV on the smaller subset
xgb_random_search.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [12]:


# Step 6: Output best hyperparameters
print("Best XGBoost hyperparameters found:")
print(xgb_random_search.best_params_)

# Step 7: Evaluate best model on the full test set
best_xgb = xgb_random_search.best_estimator_
xgb_preds = best_xgb.predict(X_test)
xgb_probs = best_xgb.predict_proba(X_test)[:, 1]



Best XGBoost hyperparameters found:
{'subsample': 0.7, 'reg_lambda': 1, 'reg_alpha': 0.1, 'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 1.0}


In [13]:
print("XGBoost optimized results on test set:")
print("Accuracy:", accuracy_score(y_test, xgb_preds))
print("AUC (AUROC):", roc_auc_score(y_test, xgb_probs))
print(classification_report(y_test, xgb_preds))

# Step 8: Save the best model
joblib.dump(best_xgb, "../models/xgboost_optimized.pkl")

XGBoost optimized results on test set:
Accuracy: 0.7754757889665141
AUC (AUROC): 0.8543059989394215
              precision    recall  f1-score   support

           0       0.76      0.80      0.78     12304
           1       0.79      0.76      0.77     12602

    accuracy                           0.78     24906
   macro avg       0.78      0.78      0.78     24906
weighted avg       0.78      0.78      0.78     24906



['../models/xgboost_optimized.pkl']