In [9]:
# Anomaly-based Network Intrusion Detection System (NIDS) - Data Collection & Preprocessing

## Importing Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
import joblib

# Display plots inline
%matplotlib inline

## Loading the Training Dataset (KDD Cup 99 - 10% Subset)
train_data = pd.read_csv('kddcup.data_10_percent', header=None)
print('Training Data Loaded Successfully')
print(f'Shape of Training Data: {train_data.shape}')

## Loading the Testing Dataset (KDD Cup 99 - 10% Subset Corrected)
test_data = pd.read_csv('kddcup.data_10_percent_corrected', header=None)
print('Testing Data Loaded Successfully')
print(f'Shape of Testing Data: {test_data.shape}')

## Naming Columns (Based on KDD Cup 99 Dataset Description)
columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
    'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate',
    'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
    'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label'
]
train_data.columns = columns
test_data.columns = columns

## Checking for Missing Values
print('Missing Values in Training Data:', train_data.isnull().sum().sum())
print('Missing Values in Testing Data:', test_data.isnull().sum().sum())

## Removing Duplicates
train_data.drop_duplicates(inplace=True)
test_data.drop_duplicates(inplace=True)

## Encoding Categorical Features
train_data = pd.get_dummies(train_data, columns=['protocol_type', 'service', 'flag'])
test_data = pd.get_dummies(test_data, columns=['protocol_type', 'service', 'flag'])

## Aligning Columns of Training and Testing Data
train_data, test_data = train_data.align(test_data, join='inner', axis=1)

## Splitting Data into Features and Labels
X_train = train_data.drop(['label'], axis=1)
y_train = train_data['label']

X_test = test_data.drop(['label'], axis=1)
y_test = test_data['label']

## Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Feature Selection - Selecting Top 20 Features
selector = SelectKBest(score_func=f_classif, k=20)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)

## Saving Selected Feature Names
selected_mask = selector.get_support()
selected_features = X_train.columns[selected_mask].tolist()
joblib.dump(selected_features, 'model_columns.pkl')
print("Selected feature names saved to 'model_columns.pkl'")

## Splitting Training Data for Model Validation
X_train_final, X_val, y_train_final, y_val = train_test_split(X_train_selected, y_train, test_size=0.2, random_state=42)

print('✅ Data Preprocessing Completed Successfully')


Training Data Loaded Successfully
Shape of Training Data: (494021, 42)
Testing Data Loaded Successfully
Shape of Testing Data: (494021, 42)
Missing Values in Training Data: 0
Missing Values in Testing Data: 0
Selected feature names saved to 'model_columns.pkl'
✅ Data Preprocessing Completed Successfully


  f = msb / msw


In [12]:
# Anomaly-based Network Intrusion Detection System (NIDS) - Model Building

## Importing Necessary Libraries for Model Building
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib


## Initializing Models
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'SVM': SVC(kernel='linear'),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Logistic Regression': LogisticRegression(max_iter=1000)
}


## Training and Evaluating Models
model_performance = {}

for model_name, model in models.items():
    # Training the model
    model.fit(X_train_final, y_train_final)
    
    # Making predictions on the validation set
    y_pred = model.predict(X_val)
    
    # Calculating accuracy
    accuracy = accuracy_score(y_val, y_pred)
    model_performance[model_name] = accuracy
    
    # Displaying the results
    print(f'\nModel: {model_name}')
    print(f'Accuracy: {accuracy * 100:.2f}%')
    print('Classification Report:')
    print(classification_report(y_val, y_pred))
    print('Confusion Matrix:')
    print(confusion_matrix(y_val, y_pred))


## Displaying Best Model
best_model_name = max(model_performance, key=model_performance.get)
best_model = models[best_model_name]
print(f'\nBest Performing Model: {best_model_name} with Accuracy: {model_performance[best_model_name] * 100:.2f}%')


## Saving the Best Model
joblib.dump(best_model, 'best_model.pkl')
print('Best Model Saved Successfully as best_model.pkl')


Model: Random Forest
Accuracy: 99.60%
Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                  precision    recall  f1-score   support

           back.       1.00      1.00      1.00       214
buffer_overflow.       0.75      0.50      0.60         6
      ftp_write.       0.00      0.00      0.00         3
   guess_passwd.       1.00      0.91      0.95        11
           imap.       0.67      1.00      0.80         2
        ipsweep.       0.70      0.74      0.72       134
           land.       1.00      1.00      1.00         1
       multihop.       0.00      0.00      0.00         0
        neptune.       1.00      1.00      1.00     10326
           nmap.       0.71      0.38      0.50        39
         normal.       1.00      1.00      1.00     17610
           perl.       0.00      0.00      0.00         1
            phf.       0.00      0.00      0.00         1
            pod.       1.00      1.00      1.00        38
      portsweep.       0.99      0.95      0.97        91
        rootkit.       0.00      0.00      0.00         1
          sat

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                  precision    recall  f1-score   support

           back.       1.00      1.00      1.00       214
buffer_overflow.       0.67      0.67      0.67         6
      ftp_write.       0.00      0.00      0.00         3
   guess_passwd.       0.91      0.91      0.91        11
           imap.       1.00      0.50      0.67         2
        ipsweep.       0.70      0.73      0.72       134
           land.       1.00      1.00      1.00         1
       multihop.       0.00      0.00      0.00         0
        neptune.       1.00      1.00      1.00     10326
           nmap.       0.50      0.38      0.43        39
         normal.       1.00      1.00      1.00     17610
           perl.       0.00      0.00      0.00         1
            phf.       1.00      1.00      1.00         1
            pod.       1.00      1.00      1.00        38
      portsweep.       0.99      0.93      0.96        91
        rootkit.       0.00      0.00      0.00         1
          sat

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                  precision    recall  f1-score   support

           back.       0.97      0.13      0.23       214
buffer_overflow.       0.00      0.00      0.00         6
      ftp_write.       0.00      0.00      0.00         3
   guess_passwd.       1.00      0.18      0.31        11
           imap.       0.67      1.00      0.80         2
        ipsweep.       0.66      0.98      0.79       134
           land.       1.00      1.00      1.00         1
        neptune.       1.00      1.00      1.00     10326
           nmap.       1.00      0.10      0.19        39
         normal.       0.98      1.00      0.99     17610
           perl.       0.00      0.00      0.00         1
            phf.       0.00      0.00      0.00         1
            pod.       1.00      1.00      1.00        38
      portsweep.       0.94      0.86      0.90        91
        rootkit.       0.00      0.00      0.00         1
          satan.       0.97      0.86      0.91       170
          smu

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                  precision    recall  f1-score   support

           back.       0.82      0.95      0.88       214
buffer_overflow.       0.67      0.67      0.67         6
      ftp_write.       0.00      0.00      0.00         3
   guess_passwd.       0.83      0.91      0.87        11
           imap.       1.00      1.00      1.00         2
        ipsweep.       0.71      0.78      0.74       134
           land.       1.00      1.00      1.00         1
        neptune.       1.00      1.00      1.00     10326
           nmap.       0.83      0.26      0.39        39
         normal.       1.00      0.99      1.00     17610
           perl.       0.00      0.00      0.00         1
            phf.       0.00      0.00      0.00         1
            pod.       1.00      1.00      1.00        38
      portsweep.       0.95      0.91      0.93        91
        rootkit.       0.00      0.00      0.00         1
          satan.       0.98      0.95      0.96       170
          smu

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                  precision    recall  f1-score   support

           back.       0.90      0.13      0.23       214
buffer_overflow.       1.00      0.17      0.29         6
      ftp_write.       0.00      0.00      0.00         3
   guess_passwd.       1.00      0.18      0.31        11
           imap.       1.00      1.00      1.00         2
        ipsweep.       0.63      0.96      0.76       134
           land.       1.00      1.00      1.00         1
        neptune.       1.00      1.00      1.00     10326
           nmap.       1.00      0.10      0.19        39
         normal.       0.98      1.00      0.99     17610
           perl.       0.00      0.00      0.00         1
            phf.       0.00      0.00      0.00         1
            pod.       1.00      1.00      1.00        38
      portsweep.       0.97      0.81      0.89        91
        rootkit.       0.00      0.00      0.00         1
          satan.       0.99      0.86      0.92       170
          smu

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
