In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Define paths to your CSV datasets
dataset_path = "DATASET-balanced.csv"

# Read data from CSV files
df = pd.read_csv(dataset_path)
# Define desired size for datasets
target_size = 2000 

# Stratified sampling to maintain class balance
smaller_df = df.sample(target_size, random_state=42) 

# Combine features and labels from both datasets
features = smaller_df.drop("Classname", axis=1)
labels = smaller_df["Classname"]


# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2)

le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.fit_transform(y_test)


In [30]:

clf_1 = RandomForestClassifier( 
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=False,
    oob_score=False,
    class_weight=None,
    ccp_alpha=0.0)


# Train the classifier using the training data
clf_1.fit(X_train, y_train_encoded)

# Make predictions on the test data
y_pred = clf_1.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test_encoded, y_pred)
report = classification_report(y_test_encoded, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)

Accuracy: 0.98
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       196
           1       0.98      0.99      0.98       204

    accuracy                           0.98       400
   macro avg       0.98      0.98      0.98       400
weighted avg       0.98      0.98      0.98       400



In [31]:
import numpy as np
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score, roc_curve, auc

# Calculate F1 score
f1 = f1_score(y_test_encoded, y_pred)
print(f"F1 Score: {f1:.2f}")

# Calculate EER (Equal Error Rate)
fpr, tpr, thresholds = roc_curve(y_test_encoded, clf_1.predict_proba(X_test)[:, 1])
eer = fpr[np.argmin(np.absolute((fpr - tpr)))]
print(f"EER: {eer:.2f}")

# Optionally, calculate AUC (Area Under the ROC Curve)
auc = roc_auc_score(y_test_encoded, y_pred)
print(f"AUC: {auc:.2f}")

tn1, fp1, fn1, tp1 = confusion_matrix(y_test_encoded, y_pred, labels=[0, 1]).ravel()
TDCF1 = 10*fp1 + fn1
print(f'TDCF for model: {TDCF1}')


F1 Score: 0.98
EER: 0.00
AUC: 0.98
TDCF for model: 52


In [32]:
clf_2 = GradientBoostingClassifier()


from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)

y_test_encoded = le.fit_transform(y_test)
# Train the classifier using the training data
clf_2.fit(X_train, y_train_encoded)

# Make predictions on the test data
y_pred = clf_2.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test_encoded, y_pred)
report = classification_report(y_test_encoded, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)

Accuracy: 0.96
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.96       196
           1       0.97      0.96      0.96       204

    accuracy                           0.96       400
   macro avg       0.96      0.96      0.96       400
weighted avg       0.96      0.96      0.96       400



In [33]:
import numpy as np
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score, roc_curve, auc

# Calculate F1 score
f1 = f1_score(y_test_encoded, y_pred)
print(f"F1 Score: {f1:.2f}")

# Calculate EER (Equal Error Rate)
fpr, tpr, thresholds = roc_curve(y_test_encoded, clf_2.predict_proba(X_test)[:, 1])
eer = fpr[np.argmin(np.absolute((fpr - tpr)))]
print(f"EER: {eer:.2f}")

# Optionally, calculate AUC (Area Under the ROC Curve)
auc = roc_auc_score(y_test_encoded, y_pred)
print(f"AUC: {auc:.2f}")

tn1, fp1, fn1, tp1 = confusion_matrix(y_test_encoded, y_pred, labels=[0, 1]).ravel()
TDCF1 = 10*fp1 + fn1
print(f'TDCF for model: {TDCF1}')


F1 Score: 0.96
EER: 0.00
AUC: 0.96
TDCF for model: 79


In [34]:
clf_3 =CatBoostClassifier(verbose=0)

# Train the classifier using the training data
clf_3.fit(X_train, y_train_encoded)

# Make predictions on the test data
y_pred = clf_3.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test_encoded, y_pred)
report = classification_report(y_test_encoded, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)

Accuracy: 0.99
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       196
           1       0.99      0.99      0.99       204

    accuracy                           0.99       400
   macro avg       0.99      0.99      0.99       400
weighted avg       0.99      0.99      0.99       400



In [35]:
import numpy as np
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score, roc_curve, auc

# Calculate F1 score
f1 = f1_score(y_test_encoded, y_pred)
print(f"F1 Score: {f1:.2f}")

# Calculate EER (Equal Error Rate)
fpr, tpr, thresholds = roc_curve(y_test_encoded, clf_3.predict_proba(X_test)[:, 1])
eer = fpr[np.argmin(np.absolute((fpr - tpr)))]
print(f"EER: {eer:.2f}")

# Optionally, calculate AUC (Area Under the ROC Curve)
auc = roc_auc_score(y_test_encoded, y_pred)
print(f"AUC: {auc:.2f}")

tn1, fp1, fn1, tp1 = confusion_matrix(y_test_encoded, y_pred, labels=[0, 1]).ravel()
TDCF1 = 10*fp1 + fn1
print(f'TDCF for model: {TDCF1}')


F1 Score: 0.99
EER: 0.00
AUC: 0.99
TDCF for model: 32


In [36]:
# Convert the dataset into DMatrix format (required by XGBoost)
dtrain = xgboost.DMatrix(X_train, label=y_train_encoded)
dtest = xgboost.DMatrix(X_test, label=y_test_encoded)
params = {
    "objective": "multi:softmax",
    "num_class": 3,
    "max_depth": 3,
    "eta": 0.3
}
# Train the XGBoost model
num_round = 1000
clf_4 = xgboost.train(params,dtrain, num_round)

# Make predictions
y_pred = clf_4.predict(dtest)

# Evaluate accuracy
accuracy = accuracy_score(y_test_encoded, y_pred)
print(f"XGBoost Accuracy: {accuracy:.2f}")



XGBoost Accuracy: 0.98


In [37]:
import numpy as np
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score, roc_curve, auc

# Calculate F1 score
f1 = f1_score(y_test_encoded, y_pred)
print(f"F1 Score: {f1:.2f}")

# Calculate EER (Equal Error Rate)
fpr, tpr, thresholds = roc_curve(y_test_encoded, clf_4.predict_proba(X_test)[:, 1])
eer = fpr[np.argmin(np.absolute((fpr - tpr)))]
print(f"EER: {eer:.2f}")

# Optionally, calculate AUC (Area Under the ROC Curve)
auc = roc_auc_score(y_test_encoded, y_pred)
print(f"AUC: {auc:.2f}")

tn1, fp1, fn1, tp1 = confusion_matrix(y_test_encoded, y_pred, labels=[0, 1]).ravel()
TDCF1 = 10*fp1 + fn1
print(f'TDCF for model: {TDCF1}')


F1 Score: 0.98
EER: 0.00
AUC: 0.98
TDCF for model: 53


In [38]:
from lightgbm import LGBMClassifier


clf_5 = LGBMClassifier()

# Train the classifier using the training data
clf_5.fit(X_train, y_train_encoded)

# Make predictions on the test data
y_pred = clf_5.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test_encoded, y_pred)
report = classification_report(y_test_encoded, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)

Accuracy: 0.98
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       196
           1       0.98      0.99      0.98       204

    accuracy                           0.98       400
   macro avg       0.98      0.98      0.98       400
weighted avg       0.98      0.98      0.98       400



In [39]:
import numpy as np
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score, roc_curve, auc

# Calculate F1 score
f1 = f1_score(y_test_encoded, y_pred)
print(f"F1 Score: {f1:.2f}")

# Calculate EER (Equal Error Rate)
fpr, tpr, thresholds = roc_curve(y_test_encoded, clf_5.predict_proba(X_test)[:, 1])
eer = fpr[np.argmin(np.absolute((fpr - tpr)))]
print(f"EER: {eer:.2f}")

# Optionally, calculate AUC (Area Under the ROC Curve)
auc = roc_auc_score(y_test_encoded, y_pred)
print(f"AUC: {auc:.2f}")

tn1, fp1, fn1, tp1 = confusion_matrix(y_test_encoded, y_pred, labels=[0, 1]).ravel()
TDCF1 = 10*fp1 + fn1
print(f'TDCF for model: {TDCF1}')


F1 Score: 0.98
EER: 0.00
AUC: 0.98
TDCF for model: 52


In [40]:
estimators = [
    ("RandomForest", clf_1),
    ("GradientBoost", clf_2),
    ("CatBoost", clf_3)#,
   # ("XGBoost",clf_4),
   # ("Lgbm",clf_5)
]

In [41]:
from sklearn.ensemble import BaggingClassifier, StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
# Create the ensembles
voting_clf = VotingClassifier(estimators=estimators, voting="soft")
stacking_clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())  # Pass an instance of LogisticRegression
base_model = RandomForestClassifier()
bagging_clf = BaggingClassifier(base_model, n_estimators=10, random_state=42)



In [49]:
# Train the ensembles
voting_clf.fit(X_train, y_train_encoded)
stacking_clf.fit(X_train, y_train_encoded)
bagging_clf.fit(X_train, y_train_encoded)

# Make predictions
y_pred_voting = voting_clf.predict(X_test)
y_pred_stacking = stacking_clf.predict(X_test)
y_pred_bagging = bagging_clf.predict(X_test)

# Evaluate the accuracy of the ensemble
accuracy_voting = accuracy_score(y_test_encoded, y_pred_voting)
print(f"Voting (Soft) Ensemble Accuracy: {accuracy_voting:.2f}")
accuracy_stacking = accuracy_score(y_test_encoded, y_pred_stacking)
print(f"Stacking Ensemble Accuracy: {accuracy_stacking:.2f}")
accuracy_bagging = accuracy_score(y_test_encoded, y_pred_bagging)
print(f"Bagging Ensemble Accuracy: {accuracy_bagging:.2f}")

Voting (Soft) Ensemble Accuracy: 0.98
Stacking Ensemble Accuracy: 0.99
Bagging Ensemble Accuracy: 0.97


Voting (Soft) Ensemble Accuracy: 0.97
Stacking Ensemble Accuracy: 0.97
Bagging Ensemble Accuracy: 0.96

In [50]:
import numpy as np
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score, roc_curve, auc

# Calculate F1 score
f1 = f1_score(y_test_encoded, y_pred_voting)
print(f"F1 Score: {f1:.2f}")

# Calculate EER (Equal Error Rate)
fpr, tpr, thresholds = roc_curve(y_test_encoded, voting_clf.predict_proba(X_test)[:, 1])
eer = fpr[np.argmin(np.absolute((fpr - tpr)))]
print(f"EER: {eer:.2f}")

# Optionally, calculate AUC (Area Under the ROC Curve)
auc = roc_auc_score(y_test_encoded, y_pred_voting)
print(f"AUC: {auc:.2f}")

tn1, fp1, fn1, tp1 = confusion_matrix(y_test_encoded, y_pred_voting, labels=[0, 1]).ravel()
TDCF1 = 10*fp1 + fn1
print(f'TDCF for model: {TDCF1}')


F1 Score: 0.99
EER: 0.00
AUC: 0.98
TDCF for model: 42


In [51]:
import numpy as np
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score, roc_curve, auc

# Calculate F1 score
f1 = f1_score(y_test_encoded, y_pred_stacking)
print(f"F1 Score: {f1:.2f}")

# Calculate EER (Equal Error Rate)
fpr, tpr, thresholds = roc_curve(y_test_encoded, stacking_clf.predict_proba(X_test)[:, 1])
eer = fpr[np.argmin(np.absolute((fpr - tpr)))]
print(f"EER: {eer:.2f}")

# Optionally, calculate AUC (Area Under the ROC Curve)
auc = roc_auc_score(y_test_encoded, y_pred_stacking)
print(f"AUC: {auc:.2f}")

tn1, fp1, fn1, tp1 = confusion_matrix(y_test_encoded, y_pred_stacking, labels=[0, 1]).ravel()
TDCF1 = 10*fp1 + fn1
print(f'TDCF for model: {TDCF1}')


F1 Score: 0.99
EER: 0.00
AUC: 0.99
TDCF for model: 31


In [52]:
import numpy as np
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score, roc_curve, auc

# Calculate F1 score
f1 = f1_score(y_test_encoded, y_pred_voting)
print(f"F1 Score: {f1:.2f}")

# Calculate EER (Equal Error Rate)
fpr, tpr, thresholds = roc_curve(y_test_encoded, bagging_clf.predict_proba(X_test)[:, 1])
eer = fpr[np.argmin(np.absolute((fpr - tpr)))]
print(f"EER: {eer:.2f}")

# Optionally, calculate AUC (Area Under the ROC Curve)
auc = roc_auc_score(y_test_encoded, y_pred_bagging)
print(f"AUC: {auc:.2f}")

tn1, fp1, fn1, tp1 = confusion_matrix(y_test_encoded, y_pred_bagging, labels=[0, 1]).ravel()
TDCF1 = 10*fp1 + fn1
print(f'TDCF for model: {TDCF1}')


F1 Score: 0.99
EER: 0.00
AUC: 0.97
TDCF for model: 67
