In [1]:
import os
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
import numpy as np

In [2]:
from utils.preprocess_data_KDD_plus_test import preprocess_data
from utils.plot import plot_roc_curve,plot_threshold_metrics, save_classification_reports_for_thresholds

In [3]:
# Input locations, all relative to the notebook's working directory.
# NOTE(review): both CSV names start with "KDDTest+", so the train/test splits
# appear to be carved out of the same source file - confirm against the data prep step.
train_file_path = os.path.join('data', 'KDDTest+_train.csv')
test_file_path = os.path.join('data', 'KDDTest+_test.csv')

# Folder name handed to preprocess_data as `preprocessing_models_folder`.
preprocessing_models_folder = 'preprocessing_pipeline'

In [4]:
# Build the train/test feature matrices and label vectors from the two CSVs.
# Arguments stay keyword-only here because preprocess_data's signature lives in
# utils.preprocess_data_KDD_plus_test and is not visible from this notebook.
X_train, X_test, y_train, y_test = preprocess_data(
    train_file_path=train_file_path,
    test_file_path=test_file_path,
    preprocessing_models_folder=preprocessing_models_folder,
)

In [5]:
# Sanity check: both splits should expose the same number of feature columns.
(X_train.shape, X_test.shape)

((17980, 17), (4495, 17))

In [6]:
# Resolve the notebook's working directory once, then derive the folder that
# collects the logistic-regression artefacts.
current_directory = os.path.abspath(os.curdir)
output_folder = os.path.join(current_directory, "logistic_regression")

In [7]:
def _precision_recall_at_thresholds(y_true, y_prob, thresholds):
    """Compute positive-class precision and recall at each decision threshold.

    A sample counts as predicted-positive when its probability is >= the
    threshold. The positive class is taken to be the label ``1``.

    Args:
        y_true: 1-D array-like of ground-truth labels.
        y_prob: 1-D array-like of predicted positive-class probabilities,
            aligned with ``y_true``.
        thresholds: iterable of probability cut-offs.

    Returns:
        ``(precisions, recalls)``: two lists aligned with ``thresholds``.
        An entry is NaN when the metric is undefined (no predicted positives
        for precision, no actual positives for recall).
    """
    is_positive = np.asarray(y_true).ravel() == 1
    scores = np.asarray(y_prob).ravel()
    n_positive = np.count_nonzero(is_positive)

    precisions, recalls = [], []
    for threshold in thresholds:
        flagged = scores >= threshold
        n_flagged = np.count_nonzero(flagged)
        true_positives = np.count_nonzero(flagged & is_positive)
        # Guard the divisions explicitly so an empty selection yields NaN
        # instead of a RuntimeWarning from averaging an empty array.
        precisions.append(true_positives / n_flagged if n_flagged else np.nan)
        recalls.append(true_positives / n_positive if n_positive else np.nan)
    return precisions, recalls


def train_logistic_regression(X_train, X_test, y_train, y_test, output_folder='output'):
    """Train a logistic regression model and write its evaluation artefacts.

    Steps, in order:
      1. Fit ``LogisticRegression(max_iter=1000)`` on the training split and
         dump it with joblib to ``<output_folder>/logistic_regression_model.pkl``.
      2. Write the classification report and confusion matrix of each split to
         ``train_report.txt`` / ``test_report.txt``.
      3. Compute ROC curve + AUC per split and pass them to ``plot_roc_curve``.
      4. Compute precision and recall of the positive class over the threshold
         grid ``np.arange(0.1, 1.0, 0.05)`` and pass each series to
         ``plot_threshold_metrics``.
      5. Call ``save_classification_reports_for_thresholds`` for each split.

    Args:
        X_train, X_test: feature matrices accepted by scikit-learn estimators.
        y_train, y_test: label vectors aligned with the feature matrices.
            The threshold analysis treats label ``1`` as the positive class and
            uses column 1 of ``predict_proba`` as its probability
            (NOTE(review): this assumes binary 0/1 labels - confirm against
            the preprocessing step).
        output_folder: directory receiving every artefact; created if missing.

    Returns:
        None. Results are written to disk and a summary line is printed.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Initialize and train the logistic regression model
    logreg = LogisticRegression(max_iter=1000)
    logreg.fit(X_train, y_train)

    # Save the trained model
    model_path = os.path.join(output_folder, "logistic_regression_model.pkl")
    joblib.dump(logreg, model_path)

    # Hard predictions (default decision rule) for the reports
    y_pred_train = logreg.predict(X_train)
    y_pred_test = logreg.predict(X_test)

    # Probability of the second class, used for ROC and threshold analysis
    y_prob_train = logreg.predict_proba(X_train)[:, 1]
    y_prob_test = logreg.predict_proba(X_test)[:, 1]

    # Classification reports
    report_train = classification_report(y_train, y_pred_train)
    report_test = classification_report(y_test, y_pred_test)

    # Confusion matrices
    cm_train = confusion_matrix(y_train, y_pred_train)
    cm_test = confusion_matrix(y_test, y_pred_test)

    # Save reports to text files
    with open(os.path.join(output_folder, "train_report.txt"), "w") as f:
        f.write("Classification Report (Train):\n")
        f.write(report_train)
        f.write("\nConfusion Matrix (Train):\n")
        f.write(str(cm_train))

    with open(os.path.join(output_folder, "test_report.txt"), "w") as f:
        f.write("Classification Report (Test):\n")
        f.write(report_test)
        f.write("\nConfusion Matrix (Test):\n")
        f.write(str(cm_test))

    # ROC curve for training data
    fpr_train, tpr_train, _ = roc_curve(y_train, y_prob_train)
    roc_auc_train = auc(fpr_train, tpr_train)
    plot_roc_curve(fpr_train, tpr_train, roc_auc_train, 'train', output_folder)

    # ROC curve for test data
    fpr_test, tpr_test, _ = roc_curve(y_test, y_prob_test)
    roc_auc_test = auc(fpr_test, tpr_test)
    plot_roc_curve(fpr_test, tpr_test, roc_auc_test, 'test', output_folder)

    # Threshold analysis for train and test data
    thresholds = np.arange(0.1, 1.0, 0.05)

    # Precision = TP / predicted positives, recall = TP / actual positives.
    # (Previously the "Precision" series was the share of samples flagged
    # positive and the "Recall" series was in fact precision.)
    train_precision_at_threshold, train_recall_at_threshold = _precision_recall_at_thresholds(
        y_train, y_prob_train, thresholds
    )
    test_precision_at_threshold, test_recall_at_threshold = _precision_recall_at_thresholds(
        y_test, y_prob_test, thresholds
    )

    # Plot Precision and Recall vs Threshold for train data
    plot_threshold_metrics(thresholds, train_precision_at_threshold, 'Precision', 'train', output_folder)
    plot_threshold_metrics(thresholds, train_recall_at_threshold, 'Recall', 'train', output_folder)

    # Plot Precision and Recall vs Threshold for test data
    plot_threshold_metrics(thresholds, test_precision_at_threshold, 'Precision', 'test', output_folder)
    plot_threshold_metrics(thresholds, test_recall_at_threshold, 'Recall', 'test', output_folder)

    # Save classification reports for different thresholds
    save_classification_reports_for_thresholds(thresholds, y_train, y_prob_train, output_folder, 'train')
    save_classification_reports_for_thresholds(thresholds, y_test, y_prob_test, output_folder, 'test')

    print(f"Logistic Regression model, reports, ROC curves, and threshold metrics saved in {output_folder}")

In [8]:
# Pass the notebook's `output_folder` explicitly; without it the function falls
# back to its default relative 'output' directory and `output_folder` goes unused.
train_logistic_regression(X_train, X_test, y_train, y_test, output_folder=output_folder)

Logistic Regression model, reports, ROC curves, and threshold metrics saved in /Users/himanshupradhan/coding/Projects/Major Project/IDS/KDD+ Dataset/logistic_regression
