# Task 5.2DHD

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, zero_one_loss, classification_report


def print_score(classifier_name: str, y_test_data, y_pred_data):
    """Print a short evaluation report for one set of predictions.

    Writes to stdout, in order: the classifier name, the zero-one loss
    (4 decimals), the accuracy (2 decimals), the classification report
    and the confusion matrix. Nothing is returned.

    Args:
        classifier_name: Heading printed above the metrics.
        y_test_data: Ground-truth labels.
        y_pred_data: Predicted labels, aligned with ``y_test_data``.
    """
    # Each metric is computed right before it is printed so that the
    # output already produced is kept if a later metric fails.
    print(f"\n{classifier_name}\n")

    loss = zero_one_loss(y_test_data, y_pred_data)
    print(f"Zero One Loss: {loss:.4f}")

    accuracy = accuracy_score(y_test_data, y_pred_data)
    print(f"Accuracy: {accuracy:.2f}")

    report = classification_report(y_test_data, y_pred_data)
    print("\nClassification Report:\n", report)

    matrix = confusion_matrix(y_test_data, y_pred_data)
    print("\nConfusion Matrix:\n", matrix)


# Cleaned preprocessing function
def preprocess_data(df: pd.DataFrame) -> tuple:
    """Split the raw dataset into a feature frame and a target column.

    The input frame is left untouched; all changes happen on a copy.

    Steps:
      * drop the ``num_outbound_cmds`` column,
      * recode the value ``2`` in ``su_attempted`` to ``0``
        (presumably so the column is strictly 0/1 -- confirm against
        the dataset description),
      * order the feature columns as nominal, then numeric, then binary.

    Args:
        df: Raw data containing at least the nominal and binary columns
            named below, ``num_outbound_cmds`` and ``attack_category``.

    Returns:
        ``(features, target)`` where ``features`` is the reordered
        feature DataFrame and ``target`` is the ``attack_category``
        column.
    """
    nominal = ['protocol_type', 'service', 'flag']
    binary = ['land', 'logged_in', 'root_shell',
              'su_attempted', 'is_host_login', 'is_guest_login']
    label_columns = ['attack_category', 'attack_type']

    # Data cleaning, performed on a private copy of the caller's frame.
    cleaned = df.copy().drop(columns='num_outbound_cmds')
    cleaned['su_attempted'] = cleaned['su_attempted'].replace({2: 0})

    # Everything that is neither nominal, binary nor a label column is
    # treated as numeric; the frame's own column order is kept.
    non_numeric = set(nominal) | set(binary) | set(label_columns)
    numeric = [name for name in cleaned.columns if name not in non_numeric]

    features = cleaned[nominal + numeric + binary]
    target = cleaned['attack_category']
    return features, target


# Load and preprocess data
df = pd.read_csv('data/Week_5_NSL-KDD-Dataset/kdd_merged.csv')
X, y = preprocess_data(df)

# Encode target variable: category names -> integer class ids.
# label_encoder.classes_ maps the ids back to the original names.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Create preprocessing pipeline: one-hot encode the three nominal columns
# and pass every other column through unchanged.
preprocessor = ColumnTransformer([
    ('encoder', OneHotEncoder(
        handle_unknown='ignore',  # categories unseen during fit encode as all zeros
        sparse_output=False  # Return array instead of sparse matrix
    ), ['protocol_type', 'service', 'flag'])
], remainder='passthrough')  # Automatically passes through numeric/binary cols

# Create complete pipeline. Encoding lives inside the pipeline so the
# encoder is fitted on each training split only, never on test rows.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        class_weight='balanced',  # weight classes inversely to their frequency
        random_state=42
    ))
])

# Repeated stratified hold-out evaluation: four independent random 70/30
# splits that preserve class proportions. Unlike k-fold cross-validation,
# the test sets of different splits may overlap.
sss = StratifiedShuffleSplit(n_splits=4, test_size=0.3, random_state=42)
scores = []

for train_index, test_index in sss.split(X, y_encoded):
    # X is a DataFrame (positional .iloc); y_encoded is a NumPy array.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]

    # fit() refits every pipeline step, so nothing carries over between splits.
    pipeline.fit(X_train, y_train)
    pred = pipeline.predict(X_test)
    scores.append(accuracy_score(y_test, pred))
    # The report is labelled with the model actually used (a random forest);
    # the previous "SVN" label did not match the classifier in the pipeline.
    print_score("Random Forest with Train Index:" + str(train_index), y_test, pred)

print("Cross-validation scores:", scores)
print(f"Mean accuracy: {sum(scores) / len(scores):.4f}")


SVN with Train Index:[ 71900  60960  50914 ... 130465  82432  64740]

Zero One Loss: 0.0153
Accuracy: 0.98

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99     23117
           1       1.00      0.99      0.99     16069
           2       0.97      0.98      0.98      4224
           3       0.68      0.99      0.80      1071
           4       0.90      0.87      0.88        75

    accuracy                           0.98     44556
   macro avg       0.91      0.96      0.93     44556
weighted avg       0.99      0.98      0.99     44556


Confusion Matrix:
 [[22704    30   109   273     1]
 [    9 15897    10   153     0]
 [    0     0  4146    77     1]
 [    0     0     2  1064     5]
 [    1     0     0     9    65]]

SVN with Train Index:[ 50476  10894 134662 ...    828   3769  32323]

Zero One Loss: 0.0149
Accuracy: 0.99

Classification Report:
               precision    recall  f1-score   support

    