# Random Forest Implementation

## Preprocessing

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

In [2]:
# Define the preprocess_df function
def preprocess_df(df):
    # Define features to drop, including IP addresses, ports, and other specified features
    features_to_drop = ['src_ip', 'dst_ip', 'src_port', 'service','dst_port', 'ssl_version', 'ssl_cipher', 'ssl_subject', 'ssl_issuer','dns_query','dns_qclass','dns_qtype','dns_rcode','http_request_body_len','http_version', 'http_trans_depth','http_method','http_uri','http_response_body_len','http_status_code','http_user_agent','http_orig_mime_types','http_resp_mime_types','weird_name','weird_addl','weird_notice']

    # Check if 'ts' column exists, and drop it if it does
    if 'ts' in df.columns:
        features_to_drop.append('ts')

    df = df.drop(columns=features_to_drop)

    # Exclude 'type' from the list of categorical columns for encoding
    categorical_cols = df.select_dtypes(include=['object', 'bool']).columns.tolist()
    categorical_cols.remove('type')  # Exclude 'type' column

    # Fill missing numeric values with the median
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_cols] = df[numeric_cols].apply(lambda x: x.fillna(x.median()))

    # Fill missing categorical values with the mode (excluding 'type')
    df[categorical_cols] = df[categorical_cols].apply(lambda x: x.fillna(x.mode()[0]))

    # Encode the categorical variables
    df = pd.get_dummies(df, columns=categorical_cols)

    # Proceed with scaling the numeric features
    scaler = StandardScaler()
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    return df

In [3]:
# Load the dataset
df = pd.read_csv('train_test_network.csv')

# Preprocess the data using the preprocess_df function
df = preprocess_df(df)

## Training model

In [4]:
# Separate the features (X) from the target variable (y)
X = df.drop('type', axis=1)  # Features
y = df['type']  # Target variable

# Split the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the size of the splits
print(f"Training set size: {X_train.shape[0]} rows")
print(f"Testing set size: {X_test.shape[0]} rows")

Training set size: 168834 rows
Testing set size: 42209 rows


In [5]:
# Train the Random Forest model
rf_classifier = RandomForestClassifier(n_estimators=5, criterion='entropy', random_state=42)
rf_classifier.fit(X_train, y_train)

In [6]:
# Evaluate the model on the testing set
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:\n', report)

Accuracy: 0.98
Classification Report:
               precision    recall  f1-score   support

    backdoor       1.00      1.00      1.00      3919
        ddos       0.98      0.97      0.98      4065
         dos       0.99      0.99      0.99      3934
   injection       0.96      0.97      0.97      3978
        mitm       0.77      0.78      0.78       213
      normal       1.00      1.00      1.00     10021
    password       0.99      0.98      0.98      3968
  ransomware       0.91      0.98      0.95      4047
    scanning       0.99      0.99      0.99      4015
         xss       0.95      0.90      0.93      4049

    accuracy                           0.98     42209
   macro avg       0.96      0.96      0.96     42209
weighted avg       0.98      0.98      0.98     42209



## Visualization

In [7]:
# Undestanding the number of trees in the random forest
num_trees = rf_classifier.n_estimators
print("Number of trees in the Random Forest:", num_trees)

Number of trees in the Random Forest: 5


In [8]:
# # Plotting individual trees in the Random Forest
# plt.figure(figsize=(20, 10))
# for i in range(3):
#     plt.subplot(1, 3, i+1)
#     plot_tree(rf_classifier.estimators_[i], filled=True, feature_names=X.columns, class_names=y.unique())
#     plt.title(f'Decision Tree {i+1}')
# plt.show()


## Testing

The following datasets can be found at: https://research.unsw.edu.au/projects/unsw-nb15-dataset

Network_dataset_10 contains instances of normal traffic and dos attacks

In [9]:
def test_on_dataset(num):
       
    number = num
    # Load and preprocess the new dataset
    new_data = pd.read_csv('Network_dataset_' + number + '.csv')
    new_data = preprocess_df(new_data)

    # Separate features (X_new_data) from the target variable (y_new_data)
    X_new_data = new_data.drop('type', axis=1)
    y_new_data = new_data['type']
    
    try:
        # Ensure feature names match those seen during training
        new_X = X_new_data.reindex(columns=X.columns, fill_value=0)

        # Predict using the trained model
        new_y_pred = rf_classifier.predict(new_X)

        # Evaluate the model
        new_accuracy = accuracy_score(y_new_data, new_y_pred)
        new_report = classification_report(y_new_data, new_y_pred)

        # Print the accuracy and classification report
        print(f'Accuracy of the Random Forest model on the new dataset '+ number +': {new_accuracy:.2f}')
        print('Classification Report for the new dataset '+ number +':\n', new_report)
    except ValueError:

        print("Couldn't load dataset, Duplicate Indexes detected:")
        duplicates = X_new_data[X_new_data.index.duplicated()] 
        print(duplicates)
        


In [10]:
test_on_dataset('1')
## error: cannot reindex on an axis with duplicate labels

# Interesting output

  new_data = pd.read_csv('Network_dataset_' + number + '.csv')


Couldn't load dataset, Duplicate Indexes detected:
Empty DataFrame
Columns: [duration, dst_bytes, missed_bytes, src_pkts, src_ip_bytes, dst_pkts, dst_ip_bytes, label, proto_icmp, proto_tcp, proto_udp, src_bytes_0, src_bytes_1, src_bytes_2, src_bytes_3, src_bytes_4, src_bytes_6, src_bytes_7, src_bytes_8, src_bytes_11, src_bytes_12, src_bytes_16, src_bytes_17, src_bytes_18, src_bytes_19, src_bytes_20, src_bytes_21, src_bytes_22, src_bytes_23, src_bytes_24, src_bytes_26, src_bytes_27, src_bytes_28, src_bytes_29, src_bytes_30, src_bytes_31, src_bytes_32, src_bytes_33, src_bytes_34, src_bytes_35, src_bytes_36, src_bytes_37, src_bytes_38, src_bytes_39, src_bytes_40, src_bytes_41, src_bytes_42, src_bytes_43, src_bytes_44, src_bytes_45, src_bytes_46, src_bytes_47, src_bytes_48, src_bytes_49, src_bytes_50, src_bytes_51, src_bytes_52, src_bytes_53, src_bytes_54, src_bytes_55, src_bytes_56, src_bytes_57, src_bytes_58, src_bytes_59, src_bytes_60, src_bytes_61, src_bytes_62, src_bytes_63, src_bytes

In [11]:
test_on_dataset('2')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of the Random Forest model on the new dataset 2: {new_accuracy:.2f}
Classification Report for the new dataset 2:
               precision    recall  f1-score   support

    backdoor       0.00      0.00      0.00         0
        ddos       0.00      0.00      0.00         0
         dos       0.00      0.00      0.00         0
   injection       0.00      0.00      0.00         0
        mitm       0.00      0.00      0.00         0
      normal       1.00      1.00      1.00      5717
    password       0.00      0.00      0.00         0
    scanning       0.00      0.00      0.00    994283
         xss       0.00      0.00      0.00         0

    accuracy                           0.01   1000000
   macro avg       0.11      0.11      0.11   1000000
weighted avg       0.01      0.01      0.01   1000000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
test_on_dataset('3')

FileNotFoundError: [Errno 2] No such file or directory: 'Network_dataset_3.csv'

In [None]:
test_on_dataset('4')

In [None]:
test_on_dataset('5')

In [None]:
test_on_dataset('6')
## crashed kernel

In [None]:
test_on_dataset('7')

In [None]:
test_on_dataset('8')

In [None]:
test_on_dataset('9')

In [None]:
test_on_dataset('10')

In [None]:
test_on_dataset('11')

In [None]:
test_on_dataset('12')

In [None]:
test_on_dataset('13')

In [None]:
test_on_dataset('14')

In [None]:
test_on_dataset('15')

In [None]:
test_on_dataset('16')

In [None]:
test_on_dataset('17')

In [None]:
test_on_dataset('18')

In [None]:
test_on_dataset('19')

In [None]:
test_on_dataset('20')

In [None]:
test_on_dataset('21')

In [None]:
test_on_dataset('22')
## error: cannot reindex on an axis with duplicate labels

In [None]:
test_on_dataset('23')
## error: cannot reindex on an axis with duplicate labels