In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import warnings

In [2]:
# Notebook-wide settings: mute warnings, show every DataFrame column when
# displaying frames, and apply seaborn's "darkgrid" style to later plots.
# NOTE(review): ignoring every warning also hides deprecation notices --
# consider narrowing the filter to specific categories.
warnings.filterwarnings(action='ignore')
pd.set_option('display.max_columns', None)
sns.set(style="darkgrid")

In [None]:
from google.colab import drive

# Mount Google Drive, then read the training and test CSV files from it.
drive.mount('/content/drive')

train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/My Drive/Datasets/NIDS/Train_data.csv",
        "/content/drive/My Drive/Datasets/NIDS/Test_data.csv",
    )
)

In [None]:
# Preview the first four rows and report the dimensions of each split.
train_rows, train_cols = train.shape
print(train.head(4))
print(f"Training data has {train_rows} rows & {train_cols} columns")

test_rows, test_cols = test.shape
print(test.head(4))
print(f"Testing data has {test_rows} rows & {test_cols} columns")

In [5]:
# Remove the `num_outbound_cmds` column from both splits.
# NOTE(review): presumably dropped because it carries no signal -- confirm.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution is a no-op, not a KeyError.
train = train.drop(columns=['num_outbound_cmds'], errors='ignore')
test = test.drop(columns=['num_outbound_cmds'], errors='ignore')

In [6]:
# Standardise every int64/float64 column. The scaler learns its statistics
# from the training frame only; the test frame reuses those statistics.
# NOTE(review): the scaler is fitted before the train/validation split that
# happens later in the notebook, so validation rows influence the mean/std.
num_cols = train.select_dtypes(include=['float64', 'int64']).columns
scaler = StandardScaler().fit(train[num_cols])
train[num_cols] = scaler.transform(train[num_cols])
test[num_cols] = scaler.transform(test[num_cols])

In [7]:
# Label-encode every object-typed feature column (the target `class` is
# excluded here and handled separately).
cat_cols = train.select_dtypes(include=['object']).columns.drop('class')

# Keep one fitted encoder per column so the exact same mapping can be reused
# (or persisted) later; previously each encoder was overwritten by the next
# loop iteration and lost.
feature_encoders = {}
for col in cat_cols:
    # Fit on train + test together so that categories appearing only in the
    # test frame do not make `transform` fail.
    combined_data = pd.concat([train[col], test[col]], axis=0)
    col_encoder = LabelEncoder().fit(combined_data)

    train[col] = col_encoder.transform(train[col])
    test[col] = col_encoder.transform(test[col])
    feature_encoders[col] = col_encoder

# Later code calls `encoder.fit_transform` on the target column. Hand it a
# fresh instance so that refit cannot overwrite an entry in
# `feature_encoders`, and so `encoder` exists even when `cat_cols` is empty.
encoder = LabelEncoder()

In [8]:
# Encode the target with its own LabelEncoder instead of re-fitting whatever
# encoder instance the categorical-feature loop happened to leave behind
# (which would be undefined if there were no categorical feature columns).
# The name `encoder` is kept because the export cell persists it.
encoder = LabelEncoder()
train['class'] = encoder.fit_transform(train['class'])

In [9]:
# Separate the encoded target vector from the feature matrix.
y = train['class']
X = train.drop(columns='class')

In [10]:
# Hold out 30% of the labelled rows for validation; the fixed seed makes the
# partition reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2,
)


In [None]:
# Train a random forest (seeded for reproducibility) on the training portion.
model = RandomForestClassifier(random_state=0)
model.fit(X=X_train, y=y_train)


In [12]:
def evaluate_model(model, X_train, y_train, X_val, y_val, cv=10):
    """Print cross-validation, training and validation metrics for a classifier.

    Args:
        model: Estimator exposing ``predict``; it is also passed to
            ``cross_val_score`` together with the training data.
        X_train: Training features.
        y_train: Training labels.
        X_val: Held-out validation features.
        y_val: Held-out validation labels.
        cv: Cross-validation setting forwarded to ``cross_val_score``.
            Defaults to 10, the value that used to be hard-coded.

    Returns:
        None. All results are written to stdout.
    """
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv)
    print("Cross Validation Mean Score:", cv_scores.mean())

    train_accuracy = metrics.accuracy_score(y_train, model.predict(X_train))
    print("Training Accuracy:", train_accuracy)

    # Predict on the validation set once and reuse the result for every
    # metric instead of re-running inference for each report.
    val_predictions = model.predict(X_val)

    val_accuracy = metrics.accuracy_score(y_val, val_predictions)
    print("Validation Accuracy:", val_accuracy)
    print("Classification Report:\n", metrics.classification_report(y_val, val_predictions))
    print("Confusion Matrix:\n", metrics.confusion_matrix(y_val, val_predictions))


In [13]:
# Report cross-validation, training and validation metrics for the fitted model.
evaluate_model(
    model=model,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)

Cross Validation Mean Score: 0.9971079110411418
Training Accuracy: 1.0
Validation Accuracy: 0.9969568668960043
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      3498
           1       1.00      1.00      1.00      4060

    accuracy                           1.00      7558
   macro avg       1.00      1.00      1.00      7558
weighted avg       1.00      1.00      1.00      7558

Confusion Matrix:
 [[3485   13]
 [  10 4050]]


In [None]:
# Pair each feature name with the fitted model's `feature_importances_` value
# and order the rows from most to least important.
importances = pd.DataFrame(
    {'feature': X.columns, 'importance': model.feature_importances_}
)
importances = importances.sort_values(by='importance', ascending=False)

# Horizontal bar chart: one bar per feature, longest bar first.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=importances, x='importance', y='feature', ax=ax)
ax.set_title("Feature Importance")
plt.show()

In [None]:
import joblib
from pathlib import Path

# Persist the fitted artefacts to Google Drive.
# NOTE(review): the data-loading cell reads from ".../My Drive/..." while this
# cell writes under ".../MyDrive/..." -- confirm both resolve to the same root.
model_dir = Path("/content/drive/MyDrive/models/NIDS")
# Create the target folder up front so a missing directory does not abort the
# export after training has already finished.
model_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(model, model_dir / "finalmodel.pkl")
joblib.dump(scaler, model_dir / "scaler.pkl")
# NOTE(review): `encoder` was last fitted on the `class` column, so this file
# holds only the target encoding; the encoders fitted on the categorical
# feature columns are not saved by this cell.
joblib.dump(encoder, model_dir / "encoder.pkl")