In [None]:
from sklearn.feature_selection import SelectFromModel,VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, RepeatedStratifiedKFold, cross_val_score, train_test_split, learning_curve
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_curve, auc, confusion_matrix)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import joblib
from sklearn.tree import export_text, plot_tree



# Load and prepare data
df = pd.read_csv('Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv')
print(df.shape)

# Split features and target: every column except the last is a feature,
# the last column is the class label.
y = df.iloc[:, -1]
# Encode the two expected labels as integers; any other label value is
# left unchanged by replace().
y = y.replace({'BENIGN': 0, 'DDoS': 1})

# Clean the feature frame without in-place mutation: the original code
# modified a slice of df with inplace=True, which is chained assignment
# (SettingWithCopyWarning, version-dependent behaviour). replace()/fillna()
# return fresh frames, so df is never written to.
X = df.iloc[:, :-1].replace([np.inf, -np.inf], np.nan)
# Impute missing values (including the former +/-inf) with the column mean.
X = X.fillna(X.mean())

# Compute variances
variances = X.var()

# Visualize variances
plt.figure(figsize=(10, 6))
plt.hist(variances, bins=50, edgecolor='k')
plt.title("Feature Variance Distribution")
plt.xlabel("Variance")
plt.ylabel("Number of Features")
plt.grid()
plt.show()

# Exploratory variance filter. Note that main() applies its own filter with
# threshold=0.01; this one (0.1) only feeds the module-level variables below.
threshold = 0.1
selector_variance = VarianceThreshold(threshold=threshold)
X_reduced = selector_variance.fit_transform(X)
print(f"Number of features selected after variance thresholding: {X_reduced.shape[1]}")

# Recompute and plot variances of reduced features
reduced_variances = pd.DataFrame(X_reduced).var()

plt.figure(figsize=(10, 6))
plt.bar(range(len(reduced_variances)), reduced_variances)
plt.title("Variance of Features After Further Reduction")
plt.xlabel("Features (Index)")
plt.ylabel("Variance")
plt.show()


# Scale features
# NOTE(review): the scaler is fitted on all rows before the split below, so
# test-set statistics influence the scaling of X_train. main() does not use
# these variables; confirm whether any later cell does before relying on them.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_reduced)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42)

def apply_variance_threshold(X, feature_names, threshold=0.01):
    """Drop low-variance features and plot which features survived.

    Args:
        X: 2-D feature matrix (DataFrame or array) with one column per
            feature.
        feature_names: Names of the columns of ``X``, in column order.
        threshold: Variance cut-off handed to ``VarianceThreshold``.

    Returns:
        Tuple ``(X_reduced, selector, selected_features)``: the transformed
        matrix, the fitted ``VarianceThreshold`` and the names of the
        columns that were kept.

    Raises:
        ValueError: If ``feature_names`` does not have one entry per column.
    """
    n_features = X.shape[1]
    if len(feature_names) != n_features:
        raise ValueError("Mismatch between feature names and number of columns in X.")

    selector = VarianceThreshold(threshold=threshold)
    X_reduced = selector.fit_transform(X)
    # Evaluate the boolean keep-mask once instead of once per feature.
    support = selector.get_support()
    selected_features = [name for name, keep in zip(feature_names, support) if keep]

    print(f"Reduced features from {X.shape[1]} to {X_reduced.shape[1]} using VarianceThreshold")

    # Plot variance of features after applying variance threshold.
    # Going through a DataFrame yields per-column variances for both
    # DataFrame and ndarray input; converting to an array makes the indexing
    # below purely positional (integer lookup on a label-indexed Series is
    # deprecated).
    feature_variances = np.asarray(pd.DataFrame(X).var())
    positions = np.arange(n_features)
    plt.figure(figsize=(12, 6))
    plt.bar(positions, feature_variances, label="Original Features", alpha=0.5)
    # Draw the kept features at their original index so the highlighted bars
    # sit on top of the matching "Original Features" bars.
    plt.bar(positions[support], feature_variances[support],
            label="Selected Features", alpha=0.8, color='orange')
    plt.title('Feature Variances After Variance Threshold')
    plt.xlabel('Feature Index')
    plt.ylabel('Variance')
    plt.legend()
    plt.tight_layout()
    plt.show()

    return X_reduced, selector, selected_features

def select_features(X, y, feature_names, threshold='median'):
    """Select features by random-forest importance and plot the survivors.

    Fits ``SelectFromModel`` around a 50-tree ``RandomForestClassifier``
    (``random_state=42``) and keeps the features whose importance passes
    ``threshold``.

    Args:
        X: 2-D feature matrix with one column per feature.
        y: Target labels aligned with the rows of ``X``.
        feature_names: Names of the columns of ``X``, in column order.
        threshold: Importance cut-off forwarded to ``SelectFromModel``.

    Returns:
        Tuple ``(selector, selected_features)``: the fitted selector and the
        names of the features it keeps.

    Raises:
        ValueError: If ``feature_names`` does not have one entry per column.
    """
    if len(feature_names) != X.shape[1]:
        raise ValueError("Mismatch between feature names and number of columns in X.")

    selector = SelectFromModel(
        RandomForestClassifier(n_estimators=50, random_state=42),
        threshold=threshold
    )
    selector.fit(X, y)
    # get_support() re-derives the mask from the forest's importances on
    # every call, so compute it once and reuse it.
    support = selector.get_support()
    selected_features = [name for name, keep in zip(feature_names, support) if keep]

    print(f"Selected {len(selected_features)} features out of {X.shape[1]} using threshold={threshold}")

    # Plot feature importances for selected features
    feature_importances = selector.estimator_.feature_importances_
    selected_importances = feature_importances[support]
    plt.figure(figsize=(12, 6))
    plt.bar(range(len(selected_features)), selected_importances, label="Feature Importances", color='skyblue')
    plt.title('Feature Importances After Feature Selection')
    plt.xlabel('Selected Feature Index')
    plt.ylabel('Importance')
    plt.tight_layout()
    plt.show()

    return selector, selected_features


def train_improved_model(X, y, feature_names, threshold='median', n_iter=20, cv=5):
    """Tune a random forest on a stratified 30% subsample of the data.

    Steps: draw a stratified 30% subsample, split it 80/20 into train/test,
    standardise, select features by forest importance, run a randomised
    hyper-parameter search scored by F1, then print test-set metrics.

    Args:
        X: 2-D feature matrix with one column per feature.
        y: Binary target labels aligned with the rows of ``X``.
        feature_names: Names of the columns of ``X``, in column order.
        threshold: Importance cut-off forwarded to ``select_features``.
        n_iter: Number of parameter settings sampled by the search.
        cv: Cross-validation setting forwarded to ``RandomizedSearchCV``.

    Returns:
        Tuple ``(best_model, selector, scaler, selected_feature_names)``.
        The scaler and selector are fitted on the training part of the
        subsample only.
    """
    # 2 * 3 * 2 * 2 = 24 candidate combinations in total.
    param_dist = {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    }

    # Work on a 30% subsample; the remaining 70% is discarded here.
    X_sample, _, y_sample, _ = train_test_split(X, y, train_size=0.3, random_state=42, stratify=y)
    # Stratify this split as well, so train and test keep the class ratio of
    # the subsample, consistent with how the subsample itself was drawn.
    X_train, X_test, y_train, y_test = train_test_split(
        X_sample, y_sample, test_size=0.2, random_state=42, stratify=y_sample
    )

    # Fit preprocessing on the training rows only, then apply it to the test rows.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    selector, selected_feature_names = select_features(X_train_scaled, y_train, feature_names, threshold=threshold)
    X_train_selected = selector.transform(X_train_scaled)
    X_test_selected = selector.transform(X_test_scaled)

    # oob_score=True makes the refitted best model expose oob_score_ below.
    rf = RandomForestClassifier(random_state=42, oob_score=True, n_jobs=-1)

    random_search = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv,
        scoring='f1',
        n_jobs=-1,
        random_state=42,
        verbose=1
    )
    random_search.fit(X_train_selected, y_train)

    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X_test_selected)

    print("\nBest Parameters:", random_search.best_params_)
    print("\nMean CV F1 Score:", random_search.best_score_)
    print("\nTest Set Metrics:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))
    print("\nOut-of-bag score:", best_model.oob_score_)

    return best_model, selector, scaler, selected_feature_names

def plot_feature_importance(model, feature_names, top_n=None):
    """Bar-plot the most important features of a fitted model.

    Args:
        model: Object exposing a ``feature_importances_`` array.
        feature_names: One name per entry of ``feature_importances_``.
        top_n: How many of the highest-ranked features to show; all of them
            when ``None``.

    Raises:
        ValueError: If the number of names and importances differ.
    """
    importances = model.feature_importances_
    name_count = len(feature_names)

    # Names and importances must line up one-to-one.
    if name_count != len(importances):
        raise ValueError("Mismatch between feature names and importances lengths.")

    limit = name_count if top_n is None else top_n

    # Rank features from most to least important and keep the first `limit`.
    ranking = np.argsort(importances)[::-1][:limit]
    bar_heights = importances[ranking]
    bar_labels = [feature_names[idx] for idx in ranking]
    bar_positions = range(len(bar_heights))

    plt.figure(figsize=(10, 6))
    plt.bar(bar_positions, bar_heights, align='center')
    plt.xticks(bar_positions, bar_labels, rotation=45, ha='right')
    plt.title('Top Feature Importances')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.tight_layout()
    plt.show()



def save_feature_names(feature_names, file_path='selected_features.txt'):
    """Write the feature names to a text file, one name per line.

    Args:
        feature_names: Iterable of feature names; non-string names (for
            example integer column labels) are written via ``str()``.
        file_path: Destination file; overwritten if it already exists.
    """
    # Explicit UTF-8 keeps the file identical across platforms instead of
    # depending on the locale's default encoding.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(f"{feature}\n" for feature in feature_names)
    print(f"Selected feature names saved to '{file_path}'")

def main(X, y, feature_names):
    """Run the full workflow: filter, train, re-validate, plot and persist.

    Args:
        X: Feature matrix with one column per entry of ``feature_names``.
        y: Binary target labels aligned with the rows of ``X``.
        feature_names: Names of the columns of ``X``, in column order.

    Side effects:
        Prints metrics, shows plots, writes the selected feature names via
        ``save_feature_names`` and dumps the fitted components to
        'pipeline_components_with_features.joblib'.
    """
    # Stage 1: remove near-constant columns.
    X_reduced, variance_selector, reduced_feature_names = apply_variance_threshold(
        X, feature_names, threshold=0.01
    )

    # Stage 2: scale, select by importance and tune the forest.
    best_model, selector, scaler, selected_feature_names = train_improved_model(
        X_reduced, y, reduced_feature_names,
        threshold='mean', n_iter=20, cv=5
    )

    # Stage 3: repeated stratified CV (5 folds x 3 repeats) on the complete
    # data set, pushed through the already-fitted scaler and selector.
    X_model_input = selector.transform(scaler.transform(X_reduced))
    repeated_folds = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
    robust_cv_scores = cross_val_score(
        best_model, X_model_input, y, cv=repeated_folds, scoring='f1'
    )

    print("\nRobustness Metrics:")
    for caption, value in (
        ("Robust CV Scores:", robust_cv_scores),
        ("Mean CV Score:", np.mean(robust_cv_scores)),
        ("CV Score Standard Deviation:", np.std(robust_cv_scores)),
    ):
        print(caption, value)

    # Stage 4: persist the feature list and show the top importances.
    save_feature_names(selected_feature_names)
    plot_feature_importance(best_model, selected_feature_names, top_n=20)

    # Stage 5: bundle every fitted component into a single artefact.
    joblib.dump(
        {
            'model': best_model,
            'scaler': scaler,
            'selector': selector,
            'variance_selector': variance_selector,
            'selected_features': selected_feature_names,
        },
        'pipeline_components_with_features.joblib',
    )
    print("Pipeline saved as 'pipeline_components_with_features.joblib'")

# Entry point: run the workflow on the module-level X / y prepared above,
# passing the DataFrame's column labels as the feature names.
if __name__ == "__main__":
    main(X, y, feature_names=X.columns.tolist())
    