In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import webbrowser
import io
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    f1_score,
    confusion_matrix,
)
import time

def load_and_combine_data(train_path, test_path):
    """Read two CSV files and stack them into a single DataFrame.

    Args:
        train_path: Source of the first CSV; handed directly to ``pd.read_csv``.
        test_path: Source of the second CSV; handed directly to ``pd.read_csv``.

    Returns:
        A DataFrame holding the rows read from ``test_path`` followed by the
        rows read from ``train_path``, with the index renumbered from 0.
    """
    train_frame = pd.read_csv(train_path)
    test_frame = pd.read_csv(test_path)
    # Rows from the second file are placed ahead of rows from the first.
    parts = [test_frame, train_frame]
    return pd.concat(parts, ignore_index=True)

def preprocess_data(df):
    """Drop unused columns, log-scale spread-out numerics, and cap category counts.

    The frame is modified in place and is also returned.

    Steps:
      1. Remove the ``id`` and ``attack_cat`` columns.
      2. For every numeric column with more than 50 distinct values, apply
         ``np.log1p`` when its minimum is 0 and ``np.log`` when its minimum is
         positive. Columns that contain negative values are left untouched,
         because a logarithm would silently turn those entries into NaN.
      3. For every non-numeric column with more than 6 distinct values, keep
         the 5 most frequent values and replace every other value with ``"-"``.

    Args:
        df: DataFrame containing at least the ``id`` and ``attack_cat`` columns.

    Returns:
        The same DataFrame object after the transformations above.

    Raises:
        KeyError: If ``id`` or ``attack_cat`` is not present in ``df``.
    """
    # Drop unnecessary columns
    df.drop(["id", "attack_cat"], axis=1, inplace=True)

    # Log transformation for numeric features
    numeric = df.select_dtypes(include=[np.number])
    for feature in numeric.columns:
        column = numeric[feature]
        if column.nunique() <= 50:
            continue
        lowest = column.min()
        if lowest > 0:
            df[feature] = np.log(df[feature])
        elif lowest == 0:
            # log1p keeps the zeros finite (log(0) would be -inf).
            df[feature] = np.log1p(df[feature])
        # lowest < 0: no logarithm is defined for the negative entries, so the
        # column is deliberately left as-is instead of being filled with NaN.

    # Process categorical features
    categorical = df.select_dtypes(exclude=[np.number])
    for feature in categorical.columns:
        if categorical[feature].nunique() > 6:
            # value_counts() is sorted by frequency, so head() is the top 5.
            most_common = df[feature].value_counts().head().index
            df[feature] = np.where(
                df[feature].isin(most_common), df[feature], "-"
            )
    return df

def encode_and_split(df):
    """One-hot encode the categorical columns, split 80/20, and standardize the rest.

    Args:
        df: Preprocessed DataFrame. The last column is used as the target;
            the feature columns at positions 1, 2 and 3 are one-hot encoded.
            NOTE(review): presumably these are the protocol/service/state
            columns — confirm against the dataset layout.

    Returns:
        ``(X_train, X_test, y_train, y_test)``. ``X_train``/``X_test`` are
        NumPy arrays whose leading columns are the one-hot indicators and
        whose remaining (passthrough) columns are standardized with
        statistics fitted on the training split only. ``y_train``/``y_test``
        are slices of the target column.
    """
    # Features and target
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    # One-hot encoding for categorical features
    categorical_columns = [1, 2, 3]
    ct = ColumnTransformer(
        transformers=[("encoder", OneHotEncoder(), categorical_columns)],
        remainder="passthrough",
        # Always return a dense array: np.array() on a sparse matrix yields a
        # 0-d object array that cannot be split or sliced below.
        sparse_threshold=0,
    )
    X = np.array(ct.fit_transform(X))

    # The encoder's output comes first in the stacked matrix. Count its columns
    # from the fitted categories instead of hard-coding the offset, so the
    # scaler neither skips a numeric column nor touches an indicator column
    # when the number of categories changes.
    n_encoded = sum(
        len(categories)
        for categories in ct.named_transformers_["encoder"].categories_
    )

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0, stratify=y
    )

    # Scale numeric features (everything after the one-hot block)
    if n_encoded < X_train.shape[1]:
        sc = StandardScaler()
        X_train[:, n_encoded:] = sc.fit_transform(X_train[:, n_encoded:])
        X_test[:, n_encoded:] = sc.transform(X_test[:, n_encoded:])

    return X_train, X_test, y_train, y_test

def feature_selection(X, y):
    """Score every feature with a chi-squared test and chart the 20 best.

    NOTE(review): ``SelectKBest``, ``chi2`` and ``go`` are not imported in the
    visible part of this file — confirm they are brought into scope elsewhere
    (e.g. an earlier notebook cell). scikit-learn documents ``chi2`` as
    requiring non-negative feature values — confirm ``X`` satisfies that.

    Args:
        X: DataFrame of features; ``X.columns`` supplies the feature names.
        y: Target labels aligned with the rows of ``X``.

    Returns:
        ``(feature_score, fig)`` where ``feature_score`` is a DataFrame with
        ``feature`` and ``score`` columns sorted by ascending score, and
        ``fig`` is the bar chart of the 20 highest-scoring features.

    Side effects:
        Writes ``features.png`` and ``top_20_features.html`` to the current
        working directory.
    """
    best_features = SelectKBest(score_func=chi2, k='all')
    fit = best_features.fit(X, y)

    # Create a DataFrame for feature scores
    feature_score = pd.DataFrame(
        {'feature': list(X.columns), 'score': fit.scores_}
    )
    feature_score.sort_values(by=['score'], ascending=True, inplace=True)

    # The frame is sorted ascending, so the best features sit at the end.
    # Rows without a score (NaN sorts last) are excluded from the chart, and
    # the selection is reversed so the highest score is plotted first.
    top_features = feature_score.dropna(subset=['score']).tail(20).iloc[::-1]

    # Plot the top 20 features as vertical bars: names on x, scores on y,
    # matching the axis titles below.
    fig = go.Figure(go.Bar(
        x=top_features['feature'],
        y=top_features['score'],
    ))

    fig.update_layout(
        title="Top 20 Features by Score",
        xaxis_title="Features",
        height=500,
        yaxis_title="Scores",
        xaxis_tickangle=45  # Tilt feature names for better visibility
    )

    # Save the figure as a static image and as a standalone HTML page.
    # write_image only handles image formats; HTML needs write_html.
    fig.write_image("features.png")
    fig.write_html("top_20_features.html")
    return feature_score, fig

def train_and_evaluate_model(X_train, X_test, y_train, y_test):
    """Fit a random forest on the training split, score it on the test split, and print a summary.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        ``(model, y_predictions, accuracy, recall, precision, f1s)``. The four
        metrics are expressed as percentages; recall, precision and F1 use
        weighted averaging.
    """
    t_begin = time.time()
    forest = RandomForestClassifier(
        n_estimators=100, n_jobs=-1, random_state=0, bootstrap=True
    )
    forest.fit(X_train, y_train)
    t_fitted = time.time()

    predicted = forest.predict(X_test)
    t_predicted = time.time()

    # Metrics, each scaled to a percentage
    accuracy = accuracy_score(y_test, predicted) * 100
    recall, precision, f1s = (
        scorer(y_test, predicted, average="weighted") * 100
        for scorer in (recall_score, precision_score, f1_score)
    )

    summary_lines = (
        "\nModel Performance:",
        f"Accuracy: {accuracy:.4f}",
        f"Recall: {recall:.4f}",
        f"Precision: {precision:.4f}",
        f"F1-Score: {f1s:.4f}",
        f"Training Time: {t_fitted - t_begin:.2f}s",
        f"Prediction Time: {t_predicted - t_fitted:.2f}s",
        f"Total Time: {t_predicted - t_begin:.2f}s",
    )
    for line in summary_lines:
        print(line)

    return forest, predicted, accuracy, recall, precision, f1s

def calculate_far_and_confusion_matrix(y_test, y_predictions):
    """Build the confusion matrix and derive the False Alarm Rate from it.

    The matrix is unpacked into exactly four cells, so the function only
    works when the confusion matrix is 2x2.

    Args:
        y_test: True labels.
        y_predictions: Predicted labels.

    Returns:
        ``(far, cm)`` where ``far`` is FP / (FP + TN) as a percentage and
        ``cm`` is the confusion matrix. The FAR is also printed.
    """
    cm = confusion_matrix(y_test, y_predictions)

    # Flattened row-major order of a 2x2 matrix: TN, FP, FN, TP.
    true_neg, false_pos, _false_neg, _true_pos = cm.flatten()

    # Share of actual negatives that were flagged as positive
    negatives_total = false_pos + true_neg
    far = (false_pos / negatives_total) * 100
    print(f"\nFalse Alarm Rate (FAR): {far:.4f}")

    return far, cm

def save_html_report(metrics, confusion_matrix, far,dataset_info,df):
    """Render the confusion-matrix heatmap and write the HTML performance report.

    Args:
        metrics: Tuple ``(accuracy, recall, precision, f1s)`` of numbers.
        confusion_matrix: Matrix of integer counts drawn as a 2x2 heatmap
            (tick labels are "False"/"True").
        far: False Alarm Rate to display.
        dataset_info: Accepted for backward compatibility; not used in the
            report.
        df: DataFrame whose shape and dtype counts are shown in the
            "Dataset Info" box.

    Side effects:
        Writes ``confusion_matrix.png`` and ``report.html`` to the current
        working directory. The report also links ``attack.png`` and
        ``features.png``, which this function does not create.
    """
    accuracy, recall, precision, f1s = metrics

    # Save confusion matrix as an image. Draw on an explicit figure and close
    # it in ``finally`` so a plotting error does not leave the figure open.
    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        sns.heatmap(
            confusion_matrix, annot=True, fmt="d", cmap="Blues",
            xticklabels=["False", "True"], yticklabels=["False", "True"],
            ax=ax,
        )
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        ax.set_title("Confusion Matrix")
        fig.savefig("confusion_matrix.png")
    finally:
        plt.close(fig)

    num_rows, num_columns = df.shape
    dataset_shape_info = f"Dataset contains {num_rows} rows and {num_columns} columns."

    datatype_counts = df.dtypes.value_counts().to_frame()
    datatype_html = datatype_counts.to_html()

    # The page declares charset UTF-8, so write it as UTF-8 rather than the
    # platform's default encoding.
    with open("report.html", "w", encoding="utf-8") as f:
        f.write(f"""
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="UTF-8">
            <meta name="viewport" content="width=device-width, initial-scale=1.0">
            <title>Model Performance Report</title>
            <style>
                body {{ font-family: Arial, sans-serif; margin: 20px; background-color: #f9f9f9; color: #333; }}
                h1, h2 {{ color: #2c3e50; }}
                .container {{
                    display: flex;
                    flex-wrap: wrap;
                    justify-content: space-around;
                    gap: 20px;
                }}
                .box {{
                    background-color: #ffffff;
                    border: 1px solid #ddd;
                    border-radius: 8px;
                    padding: 20px;
                    box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1);
                    width: 45%;
                    min-width: 300px;
                    box-sizing: border-box;
                }}
                .box img {{
                    max-width: 100%;
                    height: auto;
                    display: block;
                    margin: 0 auto;
                }}
                ul {{ list-style-type: none; padding: 0; }}
                ul li {{ padding: 5px 0; }}
                ul li strong {{ color: #16a085; }}
                .header {{
                    text-align: center;
                    padding: 10px;
                    background-color: #34495e;
                    color: white;
                    border-radius: 8px 8px 0 0;
                }}
            </style>
        </head>
        <body>
            <h1 style="text-align: center;">Model Performance Report</h1>
            <div class="container">
                <!-- Dataset Info -->
                <div class="box">
                    <div class="header">Dataset Info</div>
                    <p><strong>Shape:</strong> {dataset_shape_info}</p>
                    <h3>Column Data Types</h3>
                    {datatype_html}
                    <img src="attack.png" alt="9 attack categories">
                </div>

                <!-- Feature Selection -->
                <div class="box">
                    <div class="header">Feature Selection</div>
                    <h3>Top 20 Features</h3>
                    <img src="features.png" alt="Top 20 Features">
                </div>

                <!-- Metrics -->
                <div class="box">
                    <div class="header">Model Metrics</div>
                    <ul>
                        <li><strong>Accuracy:</strong> {accuracy:.4f}</li>
                        <li><strong>Recall:</strong> {recall:.4f}</li>
                        <li><strong>Precision:</strong> {precision:.4f}</li>
                        <li><strong>F1-Score:</strong> {f1s:.4f}</li>
                        <li><strong>False Alarm Rate (FAR):</strong> {far:.4f}</li>
                    </ul>
                </div>

                <!-- Confusion Matrix -->
                <div class="box">
                    <div class="header">Confusion Matrix</div>
                    <img src="confusion_matrix.png" alt="Confusion Matrix">
                </div>
            </div>
        </body>
        </html>
        """)

def main():
    """Run the whole pipeline: load, preprocess, encode/split, train, evaluate, report.

    Reads ``UNSW_NB15_training-set.csv`` and ``UNSW_NB15_testing-set.csv``
    from the current working directory, prints progress and metrics to
    stdout, writes ``report.html`` (plus the confusion-matrix image) and then
    asks the default web browser to open the report.

    NOTE(review): the report embeds ``features.png`` and ``attack.png``, but
    nothing called from here generates them (``feature_selection`` is never
    invoked) — confirm they are produced elsewhere.
    """
    # Local import: only needed here, to build the report's file URI.
    from pathlib import Path

    # Load and preprocess the dataset
    print("Loading data...")
    df = load_and_combine_data("UNSW_NB15_training-set.csv", "UNSW_NB15_testing-set.csv")

    # Capture dataset info (df.info() writes to a stream instead of returning text)
    with io.StringIO() as buffer:
        df.info(buf=buffer)
        dataset_info = buffer.getvalue()

    print("Preprocessing data...")
    df = preprocess_data(df)

    # Encode features and split data
    print("Encoding and splitting data...")
    X_train, X_test, y_train, y_test = encode_and_split(df)

    # Train and evaluate the model
    print("Training and evaluating model...")
    model, y_predictions, accuracy, recall, precision, f1s = train_and_evaluate_model(
        X_train, X_test, y_train, y_test
    )

    # Calculate FAR and confusion matrix
    print("Evaluating performance...")
    far, cm = calculate_far_and_confusion_matrix(y_test, y_predictions)

    # Save HTML report
    print("Saving report...")
    save_html_report((accuracy, recall, precision, f1s), cm, far,dataset_info,df)
    print("Report saved as 'report.html'.")

    # Hand the browser an absolute file:// URI; opening a bare relative
    # filename with webbrowser.open is documented as unsupported/non-portable.
    webbrowser.open(Path("report.html").resolve().as_uri())

# Run the pipeline only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


Loading data...
Preprocessing data...
Encoding and splitting data...
Training and evaluating model...

Model Performance:
Accuracy: 95.0616
Recall: 95.0616
Precision: 95.0763
F1-Score: 95.0674
Training Time: 24.37s
Prediction Time: 0.27s
Total Time: 24.64s
Evaluating performance...

False Alarm Rate (FAR): 6.2957
Saving report...
Report saved as 'report.html'.
