Random Forest

In [4]:
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset
file_path = 'cyber_dataset.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Clean columns with string representations of lists
def clean_column(column):
    return column.apply(lambda x: float(ast.literal_eval(x)[0]) if isinstance(x, str) and '[' in x else float(x))

# Apply cleaning to relevant columns
columns_to_clean = ['DnsAnswerTTL', 'NumberOfAnswers', 'DnsResponseCode', 'DnsOpCode']
for col in columns_to_clean:
    data[col] = clean_column(data[col])

# Define features and target
features = ['DnsAnswerTTL', 'NumberOfAnswers', 'DnsResponseCode', 'DnsOpCode']
data['anomaly'] = np.where((data['sus'] == 1) | (data['evil'] == 1), 1, 0)  # Label anomalies
data = data.dropna(subset=features)  # Ensure no missing values
X = data[features].astype(float)
y = data['anomaly']

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build and train the Random Forest model with tuned hyperparameters
rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=200,  # Increased number of trees
    max_depth=10,      # Limit depth of the trees
    min_samples_split=5,  # Minimum samples required to split
    min_samples_leaf=2   # Minimum samples in leaf nodes
)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy (Random Forest): {accuracy:.4f}")

# Print classification report and confusion matrix
print("\nClassification Report (Random Forest):")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix (Random Forest):")
print(confusion_matrix(y_test, y_pred))


Test Accuracy (Random Forest): 0.9231

Classification Report (Random Forest):
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        12
           1       0.00      0.00      0.00         1

    accuracy                           0.92        13
   macro avg       0.46      0.50      0.48        13
weighted avg       0.85      0.92      0.89        13


Confusion Matrix (Random Forest):
[[12  0]
 [ 1  0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Decision tree

In [5]:
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset
file_path = 'cyber_dataset.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Clean columns with string representations of lists
def clean_column(column):
    return column.apply(lambda x: float(ast.literal_eval(x)[0]) if isinstance(x, str) and '[' in x else float(x))

# Apply cleaning to relevant columns
columns_to_clean = ['DnsAnswerTTL', 'NumberOfAnswers', 'DnsResponseCode', 'DnsOpCode']
for col in columns_to_clean:
    data[col] = clean_column(data[col])

# Define features and target
features = ['DnsAnswerTTL', 'NumberOfAnswers', 'DnsResponseCode', 'DnsOpCode']
data['anomaly'] = np.where((data['sus'] == 1) | (data['evil'] == 1), 1, 0)  # Label anomalies
data = data.dropna(subset=features)  # Ensure no missing values
X = data[features].astype(float)
y = data['anomaly']

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build and train the Decision Tree model with tuned hyperparameters
dt_model = DecisionTreeClassifier(
    random_state=42,
    max_depth=8,              # Reduced tree depth
    min_samples_split=10,     # Minimum samples required to split
    min_samples_leaf=4,       # Minimum samples in leaf nodes
    criterion='entropy'       # Use entropy instead of Gini
)
dt_model.fit(X_train, y_train)

# Make predictions
y_pred = dt_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy (Decision Tree): {accuracy:.4f}")

# Print classification report and confusion matrix
print("\nClassification Report (Decision Tree):")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix (Decision Tree):")
print(confusion_matrix(y_test, y_pred))


Test Accuracy (Decision Tree): 0.9231

Classification Report (Decision Tree):
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        12
           1       0.00      0.00      0.00         1

    accuracy                           0.92        13
   macro avg       0.46      0.50      0.48        13
weighted avg       0.85      0.92      0.89        13


Confusion Matrix (Decision Tree):
[[12  0]
 [ 1  0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
