# Random Forest Implementation

## Preprocessing

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

In [None]:
# Define the preprocess_df function
def preprocess_df(df, target_col='type'):
    """Impute, one-hot encode and standardise a raw DataFrame.

    Steps, in order:
      1. missing values in float64/int64 columns are replaced by the column median;
      2. missing values in object/bool columns are replaced by the column mode;
      3. object/bool columns are one-hot encoded with ``pd.get_dummies``;
      4. float64/int64 columns are standardised with a ``StandardScaler``.

    The target column is left untouched by every step.

    Args:
        df: Input DataFrame. It is not modified; a copy is processed.
        target_col: Name of the label column to exclude from imputation,
            encoding and scaling. Defaults to ``'type'``. The column does not
            have to be present.

    Returns:
        A new DataFrame with the transformed feature columns and, if it was
        present, the unchanged target column.

    Note:
        A new scaler is fitted on every call, so each DataFrame passed in is
        standardised with its own mean and variance.
    """
    # Work on a copy so the caller's frame is never modified as a side effect
    df = df.copy()

    # Feature columns by dtype; the target is filtered out of both lists so it
    # is neither one-hot encoded nor scaled, and a missing target is not an error
    categorical_cols = [
        col for col in df.select_dtypes(include=['object', 'bool']).columns
        if col != target_col
    ]
    numeric_cols = [
        col for col in df.select_dtypes(include=['float64', 'int64']).columns
        if col != target_col
    ]

    # Fill missing numeric values with the median
    if numeric_cols:
        df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

    # Fill missing categorical values with the mode; an all-missing column has
    # no mode, so it is left as it is instead of raising
    for col in categorical_cols:
        modes = df[col].mode()
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

    # Encode the categorical variables
    if categorical_cols:
        df = pd.get_dummies(df, columns=categorical_cols)

    # Scale the numeric features; skipped when there is nothing to fit on,
    # because StandardScaler rejects input with zero columns or zero rows
    if numeric_cols and len(df) > 0:
        scaler = StandardScaler()
        df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    return df

In [None]:
# Read the training/testing CSV and pass it straight through preprocess_df;
# the prepared frame is what the following cells use as df
df = preprocess_df(pd.read_csv('train_test_network.csv'))

## Training model

In [None]:
# 'type' is the target variable; every other column is used as a feature
y = df['type']
X = df.drop(columns='type')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report how many rows landed in each partition
print(f"Training set size: {X_train.shape[0]} rows")
print(f"Testing set size: {X_test.shape[0]} rows")

Training set size: 168834 rows
Testing set size: 42209 rows


In [None]:
# Build a forest of 5 trees using the entropy split criterion and a fixed
# seed, then fit it on the training split
rf_classifier = RandomForestClassifier(
    n_estimators=5,
    criterion='entropy',
    random_state=42,
)
rf_classifier.fit(X_train, y_train)

In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

def select_features(X_train, y_train, num_features_to_select):
    """Pick feature columns with recursive feature elimination (RFE).

    The eliminator wraps a 100-tree random forest with a fixed seed and is
    fitted on the supplied training data.

    Args:
        X_train: Training features; must expose ``.columns`` (e.g. a
            DataFrame) so the retained columns can be looked up by label.
        y_train: Training target values.
        num_features_to_select: Number of features RFE should keep.

    Returns:
        The entries of ``X_train.columns`` that RFE retained.
    """
    selector = RFE(
        RandomForestClassifier(n_estimators=100, random_state=42),
        n_features_to_select=num_features_to_select,
    )
    selector.fit(X_train, y_train)
    # support_ is a boolean mask over the input columns, not a list of positions
    return X_train.columns[selector.support_]

In [None]:
# Run recursive feature elimination on the training split, keeping 10 features
selected_feature_names = select_features(
    X_train,
    y_train,
    num_features_to_select=10,
)

# Show the heading and the retained column labels on separate lines
print("Selected Feature Names:", selected_feature_names, sep="\n")

In [None]:
# Predict labels for the held-out test split
y_pred = rf_classifier.predict(X_test)

# Per-class report and overall accuracy against the true labels
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:\n', report)

## Visualization

In [None]:
# Understanding the number of trees in the random forest:
# read the configured tree count from the classifier and print it
num_trees = rf_classifier.n_estimators
print("Number of trees in the Random Forest:", num_trees)

In [None]:
# Plotting individual trees in the Random Forest (at most the first three)
plt.figure(figsize=(20, 10))

# plot_tree labels class index i with class_names[i], so the names must follow
# the fitted, sorted order in rf_classifier.classes_. y.unique() returns labels
# in order of first appearance, which can attach the wrong label to a leaf.
# Plain lists are passed because some scikit-learn releases reject a pandas
# Index or NumPy array for these two parameters.
tree_feature_names = list(X.columns)
tree_class_names = [str(label) for label in rf_classifier.classes_]

# Slicing instead of a fixed range(3) avoids an IndexError on smaller forests
for i, tree in enumerate(rf_classifier.estimators_[:3]):
    plt.subplot(1, 3, i + 1)
    plot_tree(
        tree,
        filled=True,
        feature_names=tree_feature_names,
        class_names=tree_class_names,
    )
    plt.title(f'Decision Tree {i+1}')
plt.show()


## Testing

The following datasets can be found at: https://research.unsw.edu.au/projects/unsw-nb15-dataset

Network_dataset_10 contains instances of normal traffic and DoS attacks.

In [None]:
# Read dataset 10 and run it through the same preprocessing routine.
# NOTE(review): preprocess_df fits a fresh StandardScaler on each call, so this
# file is standardised with its own statistics, not the training set's —
# confirm this is intended.
new_data = preprocess_df(pd.read_csv('Network_dataset_10.csv'))

# Target labels (y_new_data) and the remaining feature columns (X_new_data)
y_new_data = new_data['type']
X_new_data = new_data.drop(columns='type')

# Align to the training columns: columns not seen in training are dropped and
# training columns missing here are added, filled with 0
new_X = X_new_data.reindex(columns=X.columns, fill_value=0)

# Predict with the already-fitted forest
new_y_pred = rf_classifier.predict(new_X)

# Compare the predictions with the true labels
new_report = classification_report(y_new_data, new_y_pred)
new_accuracy = accuracy_score(y_new_data, new_y_pred)

# Print the accuracy and classification report
print(f'Accuracy of the Random Forest model on the new dataset 10: {new_accuracy:.2f}')
print('Classification Report for the new dataset 10:\n', new_report)

In [None]:
# Read dataset 17 and run it through the same preprocessing routine.
# NOTE(review): preprocess_df fits a fresh StandardScaler on each call, so this
# file is standardised with its own statistics, not the training set's —
# confirm this is intended.
new_data = preprocess_df(pd.read_csv('Network_dataset_17.csv'))

# Target labels (y_new_data) and the remaining feature columns (X_new_data)
y_new_data = new_data['type']
X_new_data = new_data.drop(columns='type')

# Align to the training columns: columns not seen in training are dropped and
# training columns missing here are added, filled with 0
new_X = X_new_data.reindex(columns=X.columns, fill_value=0)

# Predict with the already-fitted forest
new_y_pred = rf_classifier.predict(new_X)

# Compare the predictions with the true labels
new_report = classification_report(y_new_data, new_y_pred)
new_accuracy = accuracy_score(y_new_data, new_y_pred)

# Print the accuracy and classification report
print(f'Accuracy of the Random Forest model on the new dataset 17: {new_accuracy:.2f}')
print('Classification Report for the new dataset 17:\n', new_report)

In [None]:
# Read dataset 11 and run it through the same preprocessing routine.
# NOTE(review): preprocess_df fits a fresh StandardScaler on each call, so this
# file is standardised with its own statistics, not the training set's —
# confirm this is intended.
new_data = preprocess_df(pd.read_csv('Network_dataset_11.csv'))

# Target labels (y_new_data) and the remaining feature columns (X_new_data)
y_new_data = new_data['type']
X_new_data = new_data.drop(columns='type')

# Align to the training columns: columns not seen in training are dropped and
# training columns missing here are added, filled with 0
new_X = X_new_data.reindex(columns=X.columns, fill_value=0)

# Predict with the already-fitted forest
new_y_pred = rf_classifier.predict(new_X)

# Compare the predictions with the true labels
new_report = classification_report(y_new_data, new_y_pred)
new_accuracy = accuracy_score(y_new_data, new_y_pred)

# Print the accuracy and classification report
print(f'Accuracy of the Random Forest model on the new dataset 11: {new_accuracy:.2f}')
print('Classification Report for the new dataset 11:\n', new_report)