# Random Forest Implementation

## Preprocessing

In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

In [4]:
# Define the preprocess_df function
def preprocess_df(df):
    # Define features to drop, including IP addresses, ports, and other specified features
    features_to_drop = ['src_ip', 'dst_ip', 'src_port', 'service','dst_port', 'ssl_version', 'ssl_cipher', 'ssl_subject', 'ssl_issuer','dns_query','dns_qclass','dns_qtype','dns_rcode','http_request_body_len','http_version', 'http_trans_depth','http_method','http_uri','http_response_body_len','http_status_code','http_user_agent','http_orig_mime_types','http_resp_mime_types','weird_name','weird_addl','weird_notice']

    # Check if 'ts' column exists, and drop it if it does
    if 'ts' in df.columns:
        features_to_drop.append('ts')

    df = df.drop(columns=features_to_drop)

    # Exclude 'type' from the list of categorical columns for encoding
    categorical_cols = df.select_dtypes(include=['object', 'bool']).columns.tolist()
    categorical_cols.remove('type')  # Exclude 'type' column

    # Fill missing numeric values with the median
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_cols] = df[numeric_cols].apply(lambda x: x.fillna(x.median()))

    # Fill missing categorical values with the mode (excluding 'type')
    df[categorical_cols] = df[categorical_cols].apply(lambda x: x.fillna(x.mode()[0]))

    # Encode the categorical variables
    df = pd.get_dummies(df, columns=categorical_cols)

    # Proceed with scaling the numeric features
    scaler = StandardScaler()
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    return df

In [5]:
# Load the dataset
df = pd.read_csv('train_test_network.csv')

# Preprocess the data using the preprocess_df function
df = preprocess_df(df)

## Training model

In [6]:
# Separate the features (X) from the target variable (y)
X = df.drop('type', axis=1)  # Features
y = df['type']  # Target variable

# Split the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the size of the splits
print(f"Training set size: {X_train.shape[0]} rows")
print(f"Testing set size: {X_test.shape[0]} rows")

Training set size: 168834 rows
Testing set size: 42209 rows


In [7]:
# Train the Random Forest model
rf_classifier = RandomForestClassifier(n_estimators=5, criterion='entropy', random_state=42)
rf_classifier.fit(X_train, y_train)

In [8]:
# Evaluate the model on the testing set
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:\n', report)

Accuracy: 0.98
Classification Report:
               precision    recall  f1-score   support

    backdoor       1.00      1.00      1.00      3919
        ddos       0.98      0.97      0.98      4065
         dos       0.99      0.99      0.99      3934
   injection       0.96      0.97      0.97      3978
        mitm       0.77      0.78      0.78       213
      normal       1.00      1.00      1.00     10021
    password       0.99      0.98      0.98      3968
  ransomware       0.91      0.98      0.95      4047
    scanning       0.99      0.99      0.99      4015
         xss       0.95      0.90      0.93      4049

    accuracy                           0.98     42209
   macro avg       0.96      0.96      0.96     42209
weighted avg       0.98      0.98      0.98     42209



## Visualization

In [9]:
# Undestanding the number of trees in the random forest
num_trees = rf_classifier.n_estimators
print("Number of trees in the Random Forest:", num_trees)

Number of trees in the Random Forest: 5


In [8]:
# # Plotting individual trees in the Random Forest
# plt.figure(figsize=(20, 10))
# for i in range(3):
#     plt.subplot(1, 3, i+1)
#     plot_tree(rf_classifier.estimators_[i], filled=True, feature_names=X.columns, class_names=y.unique())
#     plt.title(f'Decision Tree {i+1}')
# plt.show()


## Testing

The following datasets can be found at: https://research.unsw.edu.au/projects/unsw-nb15-dataset

Network_dataset_10 contains instances of normal traffic and dos attacks

In [10]:
def test_on_dataset(num):
    number = num
    # Load and preprocess the new dataset
    new_data = pd.read_csv('Network_dataset_' + number + '.csv')
    new_data = preprocess_df(new_data)

    # Separate features (X_new_data) from the target variable (y_new_data)
    X_new_data = new_data.drop('type', axis=1)
    y_new_data = new_data['type']

    # Ensure feature names match those seen during training
    new_X = X_new_data.reindex(columns=X.columns, fill_value=0)

    # Predict using the trained model
    new_y_pred = rf_classifier.predict(new_X)

    # Evaluate the model
    new_accuracy = accuracy_score(y_new_data, new_y_pred)
    new_report = classification_report(y_new_data, new_y_pred)

    # Print the accuracy and classification report
    print(f'Accuracy of the Random Forest model on the new dataset '+ number +': {new_accuracy:.2f}')
    print('Classification Report for the new dataset '+ number +':\n', new_report)

In [10]:
test_on_dataset('1')
## error: cannot reindex on an axis with duplicate labels

  new_data = pd.read_csv('Network_dataset_' + number + '.csv')


ValueError: cannot reindex on an axis with duplicate labels

In [11]:
test_on_dataset('2')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of the Random Forest model on the new dataset 2: {new_accuracy:.2f}
Classification Report for the new dataset 2:
               precision    recall  f1-score   support

    backdoor       0.00      0.00      0.00         0
        ddos       0.00      0.00      0.00         0
         dos       0.00      0.00      0.00         0
   injection       0.00      0.00      0.00         0
        mitm       0.00      0.00      0.00         0
      normal       1.00      1.00      1.00      5717
    password       0.00      0.00      0.00         0
    scanning       0.00      0.00      0.00    994283
         xss       0.00      0.00      0.00         0

    accuracy                           0.01   1000000
   macro avg       0.11      0.11      0.11   1000000
weighted avg       0.01      0.01      0.01   1000000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
test_on_dataset('3')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of the Random Forest model on the new dataset 3: {new_accuracy:.2f}
Classification Report for the new dataset 3:
               precision    recall  f1-score   support

    backdoor       0.00      0.00      0.00         0
        ddos       0.00      0.00      0.00         0
         dos       0.00      0.00      0.00         0
   injection       0.00      0.00      0.00         0
        mitm       0.00      0.00      0.00         0
      normal       1.00      1.00      1.00      2820
    password       0.00      0.00      0.00         0
    scanning       0.00      0.00      0.00    997180
         xss       0.00      0.00      0.00         0

    accuracy                           0.00   1000000
   macro avg       0.11      0.11      0.11   1000000
weighted avg       0.00      0.00      0.00   1000000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
test_on_dataset('4')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of the Random Forest model on the new dataset 4: {new_accuracy:.2f}
Classification Report for the new dataset 4:
               precision    recall  f1-score   support

    backdoor       0.00      0.00      0.00         0
        ddos       0.00      0.00      0.00         0
         dos       0.00      0.00      0.00         0
   injection       0.00      0.00      0.00         0
        mitm       0.00      0.00      0.00         0
      normal       1.00      1.00      1.00      6256
    password       0.00      0.00      0.00         0
    scanning       0.00      0.00      0.00    993744
         xss       0.00      0.00      0.00         0

    accuracy                           0.01   1000000
   macro avg       0.11      0.11      0.11   1000000
weighted avg       0.01      0.01      0.01   1000000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
test_on_dataset('5')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of the Random Forest model on the new dataset 5: {new_accuracy:.2f}
Classification Report for the new dataset 5:
               precision    recall  f1-score   support

    backdoor       0.00      0.00      0.00         0
        ddos       0.00      0.00      0.00         0
         dos       0.00      0.00      0.00         0
   injection       0.00      0.00      0.00         0
        mitm       0.00      0.00      0.00         0
      normal       1.00      0.98      0.99      3657
    password       0.00      0.00      0.00         0
    scanning       0.00      0.00      0.00    996343
         xss       0.00      0.00      0.00         0

    accuracy                           0.00   1000000
   macro avg       0.11      0.11      0.11   1000000
weighted avg       0.00      0.00      0.00   1000000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
test_on_dataset('6')
## crashed kernel

In [11]:
test_on_dataset('7')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of the Random Forest model on the new dataset 7: {new_accuracy:.2f}
Classification Report for the new dataset 7:
               precision    recall  f1-score   support

    backdoor       0.00      0.00      0.00         0
        ddos       0.00      0.00      0.00         0
         dos       0.00      0.00      0.00         0
   injection       0.00      0.00      0.00         0
        mitm       0.00      0.00      0.00         0
      normal       1.00      0.98      0.99      7499
    password       0.00      0.00      0.00         0
    scanning       1.00      0.00      0.00    992501
         xss       0.00      0.00      0.00         0

    accuracy                           0.01   1000000
   macro avg       0.22      0.11      0.11   1000000
weighted avg       1.00      0.01      0.01   1000000



  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
test_on_dataset('8')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of the Random Forest model on the new dataset 8: {new_accuracy:.2f}
Classification Report for the new dataset 8:
               precision    recall  f1-score   support

    backdoor       0.00      0.00      0.00         0
        ddos       0.00      0.00      0.00         0
         dos       0.77      0.93      0.85    590432
   injection       0.00      0.00      0.00         0
        mitm       0.00      0.00      0.00         0
      normal       0.98      0.99      0.99     21306
    password       0.00      0.00      0.00         0
    scanning       0.00      0.00      0.00    388262
         xss       0.00      0.00      0.00         0

    accuracy                           0.57   1000000
   macro avg       0.19      0.21      0.20   1000000
weighted avg       0.48      0.57      0.52   1000000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
test_on_dataset('9')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of the Random Forest model on the new dataset 9: {new_accuracy:.2f}
Classification Report for the new dataset 9:
               precision    recall  f1-score   support

    backdoor       0.00      0.00      0.00         0
        ddos       0.00      0.00      0.00         0
         dos       1.00      0.95      0.98    975261
   injection       0.00      0.00      0.00         0
        mitm       0.00      0.00      0.00         0
      normal       1.00      1.00      1.00     24739
    password       0.00      0.00      0.00         0
         xss       0.00      0.00      0.00         0

    accuracy                           0.95   1000000
   macro avg       0.25      0.24      0.25   1000000
weighted avg       1.00      0.95      0.98   1000000



  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
test_on_dataset('10')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of the Random Forest model on the new dataset 10: {new_accuracy:.2f}
Classification Report for the new dataset 10:
               precision    recall  f1-score   support

    backdoor       0.00      0.00      0.00         0
        ddos       0.00      0.00      0.00         0
         dos       1.00      0.95      0.98    969998
   injection       0.00      0.00      0.00         0
        mitm       0.00      0.00      0.00         0
      normal       1.00      1.00      1.00     30002
         xss       0.00      0.00      0.00         0

    accuracy                           0.96   1000000
   macro avg       0.29      0.28      0.28   1000000
weighted avg       1.00      0.96      0.98   1000000



  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
test_on_dataset('11')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of the Random Forest model on the new dataset 11: {new_accuracy:.2f}
Classification Report for the new dataset 11:
               precision    recall  f1-score   support

    backdoor       0.00      0.00      0.00         0
        ddos       0.00      0.00      0.00         0
         dos       0.91      0.98      0.94    839637
   injection       1.00      0.15      0.27    125195
        mitm       0.00      0.00      0.00         0
      normal       0.98      1.00      0.99     35168
    password       0.00      0.00      0.00         0
  ransomware       0.00      0.00      0.00         0
         xss       0.00      0.00      0.00         0

    accuracy                           0.88   1000000
   macro avg       0.32      0.24      0.24   1000000
weighted avg       0.93      0.88      0.86   1000000



  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
test_on_dataset('12')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of the Random Forest model on the new dataset 12: {new_accuracy:.2f}
Classification Report for the new dataset 12:
               precision    recall  f1-score   support

    backdoor       0.00      0.00      0.00         0
        ddos       0.37      0.22      0.28    639730
         dos       0.00      0.00      0.00         0
   injection       0.06      0.00      0.00    327464
        mitm       0.00      0.00      0.00         0
      normal       1.00      1.00      1.00     32806
    password       0.00      0.00      0.00         0
    scanning       0.00      0.00      0.00         0
         xss       0.00      0.00      0.00         0

    accuracy                           0.17   1000000
   macro avg       0.16      0.14      0.14   1000000
weighted avg       0.29      0.17      0.21   1000000



  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
test_on_dataset('13')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of the Random Forest model on the new dataset 13: {new_accuracy:.2f}
Classification Report for the new dataset 13:
               precision    recall  f1-score   support

    backdoor       0.00      0.00      0.00         0
        ddos       1.00      0.18      0.30    999083
         dos       0.00      0.00      0.00         0
   injection       0.00      0.00      0.00         0
        mitm       0.00      0.00      0.00         0
      normal       1.00      1.00      1.00       917
    password       0.00      0.00      0.00         0
    scanning       0.00      0.00      0.00         0
         xss       0.00      0.00      0.00         0

    accuracy                           0.18   1000000
   macro avg       0.22      0.13      0.14   1000000
weighted avg       1.00      0.18      0.30   1000000



  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
test_on_dataset('14')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of the Random Forest model on the new dataset 14: {new_accuracy:.2f}
Classification Report for the new dataset 14:
               precision    recall  f1-score   support

        ddos       1.00      0.19      0.32    999417
         dos       0.00      0.00      0.00         0
   injection       0.00      0.00      0.00         0
        mitm       0.00      0.00      0.00         0
      normal       1.00      0.99      1.00       583
    password       0.00      0.00      0.00         0
    scanning       0.00      0.00      0.00         0
         xss       0.00      0.00      0.00         0

    accuracy                           0.19   1000000
   macro avg       0.25      0.15      0.16   1000000
weighted avg       1.00      0.19      0.32   1000000



  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
test_on_dataset('15')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of the Random Forest model on the new dataset 15: {new_accuracy:.2f}
Classification Report for the new dataset 15:
               precision    recall  f1-score   support

    backdoor       0.00      0.00      0.00         0
        ddos       1.00      0.19      0.31    998940
         dos       0.00      0.00      0.00         0
   injection       0.00      0.00      0.00         0
      normal       1.00      0.99      1.00      1060
    password       0.00      0.00      0.00         0
    scanning       0.00      0.00      0.00         0
         xss       0.00      0.00      0.00         0

    accuracy                           0.19   1000000
   macro avg       0.25      0.15      0.16   1000000
weighted avg       1.00      0.19      0.32   1000000



  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
test_on_dataset('16')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of the Random Forest model on the new dataset 16: {new_accuracy:.2f}
Classification Report for the new dataset 16:
               precision    recall  f1-score   support

    backdoor       0.00      0.00      0.00         0
        ddos       1.00      0.15      0.26    998109
         dos       0.00      0.00      0.00         0
   injection       0.00      0.00      0.00         0
        mitm       0.00      0.00      0.00         0
      normal       1.00      0.99      1.00      1891
    password       0.00      0.00      0.00         0
    scanning       0.00      0.00      0.00         0
         xss       0.00      0.00      0.00         0

    accuracy                           0.15   1000000
   macro avg       0.22      0.13      0.14   1000000
weighted avg       1.00      0.15      0.26   1000000



  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
test_on_dataset('17')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of the Random Forest model on the new dataset 17: {new_accuracy:.2f}
Classification Report for the new dataset 17:
               precision    recall  f1-score   support

        ddos       1.00      0.64      0.78    966289
         dos       0.00      0.00      0.00         0
   injection       0.00      0.00      0.00         0
        mitm       0.00      0.00      0.00         0
      normal       1.00      1.00      1.00     33711
    password       0.00      0.00      0.00         0
         xss       0.00      0.00      0.00         0

    accuracy                           0.65   1000000
   macro avg       0.29      0.23      0.25   1000000
weighted avg       1.00      0.65      0.79   1000000



  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
test_on_dataset('18')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of the Random Forest model on the new dataset 18: {new_accuracy:.2f}
Classification Report for the new dataset 18:
               precision    recall  f1-score   support

    backdoor       0.00      0.00      0.00         0
        ddos       0.15      0.11      0.12    563440
         dos       0.00      0.00      0.00         0
   injection       0.00      0.00      0.00         0
        mitm       0.00      0.00      0.00         0
      normal       1.00      1.00      1.00     49436
    password       0.78      0.02      0.03    387124
    scanning       0.00      0.00      0.00         0
         xss       0.00      0.00      0.00         0

    accuracy                           0.11   1000000
   macro avg       0.22      0.12      0.13   1000000
weighted avg       0.44      0.11      0.13   1000000



  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
test_on_dataset('19')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of the Random Forest model on the new dataset 19: {new_accuracy:.2f}
Classification Report for the new dataset 19:
               precision    recall  f1-score   support

    backdoor       0.00      0.00      0.00         0
        ddos       0.00      0.00      0.00         0
         dos       0.00      0.00      0.00         0
   injection       0.00      0.00      0.00         0
        mitm       0.00      0.00      0.00         0
      normal       1.00      1.00      1.00     12708
    password       1.00      0.05      0.09    987292
         xss       0.00      0.00      0.00         0

    accuracy                           0.06   1000000
   macro avg       0.25      0.13      0.14   1000000
weighted avg       1.00      0.06      0.11   1000000



  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
test_on_dataset('20')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of the Random Forest model on the new dataset 20: {new_accuracy:.2f}
Classification Report for the new dataset 20:
               precision    recall  f1-score   support

    backdoor       0.00      0.00      0.00         0
        ddos       0.00      0.00      0.00         0
         dos       0.00      0.00      0.00         0
   injection       0.00      0.00      0.00         0
        mitm       0.00      0.00      0.00         0
      normal       0.98      1.00      0.99     21082
    password       0.61      0.31      0.41    344152
  ransomware       0.00      0.00      0.00         0
    scanning       0.00      0.00      0.00         0
         xss       0.94      0.27      0.42    634766

    accuracy                           0.30   1000000
   macro avg       0.25      0.16      0.18   1000000
weighted avg       0.82      0.30      0.43   1000000



  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
test_on_dataset('21')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of the Random Forest model on the new dataset 21: {new_accuracy:.2f}
Classification Report for the new dataset 21:
               precision    recall  f1-score   support

    backdoor       0.00      0.00      0.00         0
        ddos       0.00      0.00      0.00         0
         dos       0.00      0.00      0.00         0
   injection       0.00      0.00      0.00         0
        mitm       0.00      0.00      0.00         0
      normal       1.00      1.00      1.00     14407
    password       0.00      0.00      0.00         0
  ransomware       0.00      0.00      0.00         0
    scanning       0.00      0.00      0.00         0
         xss       1.00      0.40      0.57    985593

    accuracy                           0.41   1000000
   macro avg       0.20      0.14      0.16   1000000
weighted avg       1.00      0.41      0.58   1000000



  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
test_on_dataset('22')
## error: cannot reindex on an axis with duplicate labels

  new_data = pd.read_csv('Network_dataset_' + number + '.csv')


ValueError: cannot reindex on an axis with duplicate labels

In [27]:
test_on_dataset('23')
## error: cannot reindex on an axis with duplicate labels

  new_data = pd.read_csv('Network_dataset_' + number + '.csv')


ValueError: cannot reindex on an axis with duplicate labels