In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from pathlib import Path

In [None]:
# The project root is taken to be the parent of the current working directory.
PROJECT_ROOT = Path.cwd().parents[0]

# Input data and generated artefacts live in sibling folders under the root.
DATA_DIR = PROJECT_ROOT.joinpath("data")
OUTPUT_DIR = PROJECT_ROOT.joinpath("output")

# Create the output folder if it is missing; an existing folder is left as is.
OUTPUT_DIR.mkdir(exist_ok=True)

In [None]:
# Load the training table from the output folder and separate features/target.
data = pd.read_csv(OUTPUT_DIR / "final_train_data_selected.csv")
target_column = "Class"
X = data.drop(columns=[target_column])

# Encode the two class labels as 0/1. Series.map turns every value that is not
# a key of the mapping into NaN, so check for that here and report the
# offending labels instead of letting NaNs reach the split or the models.
label_map = {"Benign": 0, "Trojan": 1}
y = data[target_column].map(label_map)
unmapped = y.isna()
if unmapped.any():
    bad_labels = data.loc[unmapped, target_column].unique().tolist()
    raise ValueError(
        f"Unexpected values in column '{target_column}' "
        f"(expected one of {list(label_map)}): {bad_labels}"
    )

# Stratified 80/20 hold-out split with a fixed seed so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Settings shared by all hyper-parameter searches in this notebook.
random_state = 42      # single seed reused for the CV splitter, models and searches
n_jobs = -1            # -1 = use all available CPU cores
scoring = "roc_auc"    # model-selection metric

# 5-fold stratified cross-validation, shuffled with the shared seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

# One dict per model (name, best params, mean CV AUC) is appended here.
best_params_list = list()

In [None]:
print("\n logistic r best parameters")

# Exhaustive grid over penalty type and inverse regularisation strength C
# (2 x 5 = 10 candidates, each evaluated with the shared CV splitter).
# The saga solver supports both the l1 and the l2 penalty in the grid.
lr = LogisticRegression(max_iter=5000, solver="saga", random_state=random_state)
lr_grid = {
    "penalty": ["l1", "l2"],
    "C": [0.01, 0.1, 1, 10, 100],
}
lr_gs = GridSearchCV(
    lr,
    lr_grid,
    cv=cv,
    scoring=scoring,
    n_jobs=n_jobs,
    refit=True,  # refit the best candidate on all of X_train
    verbose=1,
)
lr_gs.fit(X_train, y_train)

# Remove any entry left over from an earlier run of this cell so that
# re-running it does not add a duplicate row to the results summary.
best_params_list[:] = [
    entry for entry in best_params_list if entry["model"] != "logistic regression"
]
best_params_list.append(
    {
        "model": "logistic regression",
        "best_params": lr_gs.best_params_,
        "mean_cv_auc": lr_gs.best_score_,
    }
)
print("best params:", lr_gs.best_params_)
print("best cv auc:", lr_gs.best_score_)

In [None]:
print("\n random forest best parameters")

# The forest itself is kept single-threaded: the grid search below already
# spreads the candidate fits over all cores (n_jobs=n_jobs), and letting each
# worker spawn its own pool of threads as well oversubscribes the CPU.
rf = RandomForestClassifier(random_state=random_state, n_jobs=1)

# Full grid: 3 * 4 * 3 * 3 * 2 = 216 candidates, i.e. 1080 fits with 5 folds.
rf_grid = {
    "n_estimators": [200, 400, 600],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2"],
}
rf_gs = GridSearchCV(
    rf,
    rf_grid,
    cv=cv,
    scoring=scoring,
    n_jobs=n_jobs,
    refit=True,  # refit the best candidate on all of X_train
    verbose=1,
)
rf_gs.fit(X_train, y_train)

# Remove any entry left over from an earlier run of this cell so that
# re-running it does not add a duplicate row to the results summary.
best_params_list[:] = [
    entry for entry in best_params_list if entry["model"] != "random forest"
]
best_params_list.append(
    {
        "model": "random forest",
        "best_params": rf_gs.best_params_,
        "mean_cv_auc": rf_gs.best_score_,
    }
)
print("best params:", rf_gs.best_params_)
print("best cv auc:", rf_gs.best_score_)

In [None]:
print("\n xgb best parameters")

# The booster is kept single-threaded: the randomized search below already
# runs the candidate fits in parallel (n_jobs=n_jobs), and the XGBoost docs
# warn that parallelising both levels causes thread contention.
xgb = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    tree_method="hist",
    random_state=random_state,
    n_jobs=1,
)

# Discrete candidate values; RandomizedSearchCV samples n_iter combinations.
xgb_dist = {
    "learning_rate": [0.02, 0.03, 0.05, 0.07, 0.1],
    "max_depth": [3, 4, 5, 6, 8],
    "subsample": [0.6, 0.7, 0.8, 0.9, 1.0],
    "colsample_bytree": [0.6, 0.7, 0.8, 0.9, 1.0],
    "min_child_weight": [1, 3, 5, 7, 10],
    "reg_lambda": [0.0, 0.5, 1.0, 2.0, 5.0],
    "gamma": [0, 0.1, 0.3, 0.5, 1.0],
    "n_estimators": [200, 400, 600, 800],
}

# 40 sampled candidates * 5 folds = 200 fits; seeded so the sample is repeatable.
xgb_rs = RandomizedSearchCV(
    xgb,
    xgb_dist,
    n_iter=40,
    cv=cv,
    scoring=scoring,
    n_jobs=n_jobs,
    random_state=random_state,
    refit=True,  # refit the best candidate on all of X_train
    verbose=1,
)

xgb_rs.fit(X_train, y_train)

# Remove any entry left over from an earlier run of this cell so that
# re-running it does not add a duplicate row to the results summary.
best_params_list[:] = [entry for entry in best_params_list if entry["model"] != "xgb"]
best_params_list.append(
    {
        "model": "xgb",
        "best_params": xgb_rs.best_params_,
        "mean_cv_auc": xgb_rs.best_score_,
    }
)
print("best params:", xgb_rs.best_params_)
print("best cv auc:", xgb_rs.best_score_)

# Turn the collected per-model results into a table (one row per entry in
# best_params_list) and write it to the output folder without the index.
best_params_df = pd.DataFrame(data=best_params_list)
best_params_df.to_csv(
    OUTPUT_DIR / "best_model_params.csv",
    index=False,
)

# Final confirmation plus a plain-text dump of the summary table.
print("search complete.")
print(best_params_df)