In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import (roc_auc_score,roc_curve,auc,accuracy_score,f1_score,precision_score,recall_score,confusion_matrix)
import joblib
from pathlib import Path

In [None]:
# Project layout: the root is the parent of the current working directory,
# with the "data" and "output" folders sitting directly under it.
PROJECT_ROOT = Path.cwd().parent

DATA_DIR = PROJECT_ROOT.joinpath("data")
OUTPUT_DIR = PROJECT_ROOT.joinpath("output")

# Create the output folder if it is missing; an existing folder is not an error.
OUTPUT_DIR.mkdir(exist_ok=True)

In [None]:
# Load the selected-feature training table and report how the "Class" labels are distributed.
data = pd.read_csv(OUTPUT_DIR / "final_train_data_selected.csv")

class_counts = data["Class"].value_counts()
class_percentages = data["Class"].value_counts(normalize=True) * 100
print("Class distribution:\n", class_counts)
print("\npercentage of each class:\n", class_percentages)

# One bar per class label, height = number of rows carrying that label.
fig, ax = plt.subplots(figsize=(5, 4))
sns.countplot(x="Class", data=data, palette="crest", ax=ax)
ax.set_title("class distribution in dataset")
ax.set_xlabel("class label")
ax.set_ylabel("number of samples")
fig.tight_layout()
plt.show()

In [None]:
# Build the feature matrix X and the binary target y from the selected-feature table.
data = pd.read_csv(OUTPUT_DIR / "final_train_data_selected.csv")
target_column = "Class"
label_map = {"Benign": 0, "Trojan": 1}

X = data.drop(columns=[target_column])
y = data[target_column].map(label_map)

# Series.map yields NaN for every value that is not a key of label_map (including
# missing labels). Fail here with a clear message instead of letting NaN targets
# reach the split / training cells.
unmapped_labels = data.loc[y.isna(), target_column].unique().tolist()
if unmapped_labels:
    raise ValueError(
        f"Unexpected values in '{target_column}' (expected {list(label_map)}): {unmapped_labels}"
    )

In [None]:
# Feature overview: collect every non-target column name and draw one unit-length bar per name.
features = data.drop(columns=[target_column])
feature_names = list(features.columns)

print(f"total features: {len(feature_names)}")

# Each feature gets a constant count of 1, so the chart is effectively a labelled list.
feature_df = pd.DataFrame(
    {
        "Feature": feature_names,
        "Count": [1 for _ in feature_names],
    }
)

fig, ax = plt.subplots(figsize=(10, 20))
sns.barplot(x="Count", y="Feature", data=feature_df, palette="crest", ax=ax)
ax.set_title("all features in train_data.csv")
ax.set_xlabel("count (each=1 feature)")
ax.set_ylabel("feature name")
fig.tight_layout()
plt.show()

In [None]:
# Hold out 20% of the rows as a test set; the split is stratified on y and seeded for repeatability.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

In [None]:
# Disabled alternative configurations of the same three estimators (lr, rf, xgb).
# Nothing in this cell executes; the active definitions are in the next cell.
#lr=LogisticRegression(max_iter=1000,random_state=42)
#rf=RandomForestClassifier(n_estimators=200,random_state=42, n_jobs=-1)
#xgb=XGBClassifier(n_estimators=300,learning_rate=0.05,max_depth=6,subsample=0.8, colsample_bytree=0.8,eval_metric="logloss", random_state=42)

In [None]:
# Estimators compared in this notebook. All three share random_state=42 so runs are repeatable.

# L2-regularised logistic regression fitted with the saga solver.
lr=LogisticRegression(C=0.01, penalty='l2',max_iter=1000,solver='saga',random_state=42)

# Random forest of 600 trees, using all available cores (n_jobs=-1).
rf=RandomForestClassifier(n_estimators=600,max_depth=30,min_samples_split=10,min_samples_leaf=1,
                          max_features='sqrt',random_state=42,n_jobs=-1)

# Gradient-boosted trees with the histogram tree method.
# The keyword must be spelled colsample_bytree: XGBClassifier accepts arbitrary keyword
# arguments, so a misspelled name is not rejected and the intended 0.7 column
# subsampling would simply not be applied.
xgb=XGBClassifier(n_estimators=800,learning_rate=0.1,max_depth=8,subsample=0.9,colsample_bytree=0.7,
                  min_child_weight=10,reg_lambda=5.0,gamma=0.1,eval_metric="logloss",tree_method="hist", random_state=42,n_jobs=-1)

In [None]:
# Display name -> estimator; insertion order fixes the order used by every later loop and plot.
models = dict(
    [
        ("Logistic regression", lr),
        ("Random forest", rf),
        ("XGBoost", xgb),
    ]
)

In [None]:
# Stratified, shuffled 5-fold cross-validation on the training split, scored by ROC-AUC.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results_cv = []

for name in models:
    fold_scores = cross_val_score(
        models[name],
        X_train,
        y_train,
        scoring="roc_auc",
        cv=cv,
        n_jobs=-1,
    )
    # Summarise the per-fold scores as mean and standard deviation.
    mean_auc, std_auc = fold_scores.mean(), fold_scores.std()
    results_cv.append([name, mean_auc, std_auc])
    print(f"{name}: cv mean auc= {mean_auc:.4f}±{std_auc:.3f}")

# One row per model: name, mean CV AUC, standard deviation across folds.
cv_columns = ["model", "mean_cv_auc", "std"]
results_cv_df = pd.DataFrame(results_cv, columns=cv_columns)
#results_cv_df.to_csv(OUTPUT_DIR / "cv_auc_results.csv", index=False)
print("\ncross-validation auc summary:")
print(results_cv_df)

In [None]:
# Horizontal bar chart of the mean cross-validated ROC-AUC, one bar per model.
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(
    x="mean_cv_auc",
    y="model",
    data=results_cv_df,
    orient="h",
    palette="crest",
    ax=ax,
)
ax.set_title("mean cross-validation auc for each model")
ax.set_xlabel("mean roc-auc (5-fold cv)")
fig.tight_layout()
plt.show()

In [None]:
# Fit each model on the training split, score it on the test split, and persist it with joblib.
metrics = []

# Scorers that take hard class predictions, in the order they appear in each metrics row.
label_scorers = (accuracy_score, precision_score, recall_score, f1_score)

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is the probability of the class encoded as 1.
    y_prob = model.predict_proba(X_test)[:, 1]

    acc, prec, rec, f1 = (scorer(y_test, y_pred) for scorer in label_scorers)
    # ROC-AUC is computed from the probabilities rather than the hard predictions.
    auc_val = roc_auc_score(y_test, y_prob)

    metrics.append([name, acc, prec, rec, f1, auc_val])
    print(f"\n{name} – accuracy: {acc:.4f}, f1: {f1:.4f}, roc-auc: {auc_val:.4f}")

    # File name is the lower-cased display name with spaces replaced by underscores.
    file_stem = name.lower().replace(" ", "_")
    model_path = OUTPUT_DIR / f"{file_stem}_best_model.joblib"
    joblib.dump(model, model_path)

    print(f"Saved {name} model to {model_path}")


In [None]:
# Tabulate the per-model test metrics; column order matches the rows appended to `metrics`.
metric_columns = ["model", "accuracy", "precision", "recall", "f1_score", "roc_auc"]
metrics_df = pd.DataFrame(metrics, columns=metric_columns)
#metrics_df.to_csv(OUTPUT_DIR / "detailed_model_metrics.csv",index=False)
print(metrics_df)

In [None]:
# Overlay the test-set ROC curve of every fitted model on a single set of axes.
fig, ax = plt.subplots(figsize=(6, 5))
for name, model in models.items():
    y_prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, lw=2, label=f"{name} (AUC={roc_auc:.3f})")

# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], 'k--', lw=1)
ax.set_xlabel("false positive rate")
ax.set_ylabel("true positive rate")
ax.set_title("roc curves of logistic regression,random forest,and xgboost")
ax.legend(loc="lower right")
fig.tight_layout()
plt.show()

In [None]:
# Draw one annotated confusion-matrix heatmap per model, each in its own figure.
for name, model in models.items():
    cm = confusion_matrix(y_test, model.predict(X_test))

    fig, ax = plt.subplots(figsize=(4, 3))
    # fmt='d' prints the cell counts as integers; the colour bar is omitted.
    sns.heatmap(cm, annot=True, fmt='d', cmap='crest', cbar=False, ax=ax)
    ax.set_title(f"confusion matrix–{name.lower()}")
    ax.set_xlabel("predicted")
    ax.set_ylabel("actual")
    fig.tight_layout()
    plt.show()

In [None]:
# Grouped bar chart comparing every metric column of metrics_df across models.
# The list below drives the plot, and the title names the same metrics, so the
# chart and its title cannot drift apart (roc_auc was advertised but not plotted).
compared_metrics = ["accuracy", "precision", "recall", "f1_score", "roc_auc"]

plt.figure(figsize=(7,4))
# Long format: one row per (model, metric) pair, with the metric name in "variable".
metrics_melted = metrics_df.melt(id_vars="model", value_vars=compared_metrics)
sns.barplot(data=metrics_melted, x="value", y="model", hue="variable", palette="crest")
plt.xlabel("score value")
plt.ylabel("model")
plt.title("comparison of accuracy, precision, recall, f1-score, and roc-auc across models")
plt.legend(title="")
plt.tight_layout()
plt.show()