# Decision Tree Implementation

## Preprocessing

In [9]:
import pandas as pd

# Load the dataset
data_path = 'train_test_network.csv'
df = pd.read_csv(data_path)

# Define features to drop, including IP addresses, ports, and other specified features
features_to_drop = ['src_ip', 'dst_ip', 'src_port', 'dst_port', 'ssl_version', 'ssl_cipher', 'ssl_subject', 'ssl_issuer']
df = df.drop(columns=features_to_drop)

# Exclude 'type' from the list of categorical columns for encoding
categorical_cols = df.select_dtypes(include=['object', 'bool']).columns.tolist()
categorical_cols.remove('type')  # Exclude 'type' column

# Fill missing numeric values with the median
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_cols] = df[numeric_cols].apply(lambda x: x.fillna(x.median()))

# Fill missing categorical values with the mode (excluding 'type')
df[categorical_cols] = df[categorical_cols].apply(lambda x: x.fillna(x.mode()[0]))

# Encode the categorical variables (excluding 'type')
df = pd.get_dummies(df, columns=categorical_cols)

# Now, 'type' is not altered and is ready to be used as the target variable for model training.

# Proceed with scaling the numeric features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Your dataset is now ready for training and testing the model.


## Training model

In [10]:
from sklearn.model_selection import train_test_split

# Separate the features (X) from the target variable (y)
X = df.drop('type', axis=1)  # Features
y = df['type']  # Target variable

# Split the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the size of the splits
print(f"Training set size: {X_train.shape[0]} rows")
print(f"Testing set size: {X_test.shape[0]} rows")


Training set size: 168834 rows
Testing set size: 42209 rows


In [11]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(random_state=42)

# Train the model on the training set
dt_classifier.fit(X_train, y_train)

# Display a message to confirm training completion
print("Decision Tree model training completed.")


Decision Tree model training completed.


In [12]:
from sklearn.metrics import accuracy_score, classification_report

# Predict the labels for the test set
y_pred = dt_classifier.predict(X_test)

# Calculate and print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the Decision Tree model on the test set: {accuracy:.2f}")

# Generate a classification report
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)


Accuracy of the Decision Tree model on the test set: 0.98
Classification Report:
               precision    recall  f1-score   support

    backdoor       1.00      1.00      1.00      3919
        ddos       0.98      0.97      0.98      4065
         dos       0.99      0.98      0.99      3934
   injection       0.96      0.97      0.97      3978
        mitm       0.70      0.79      0.74       213
      normal       1.00      1.00      1.00     10021
    password       0.99      0.97      0.98      3968
  ransomware       0.91      0.98      0.95      4047
    scanning       0.99      0.99      0.99      4015
         xss       0.95      0.90      0.92      4049

    accuracy                           0.98     42209
   macro avg       0.95      0.96      0.95     42209
weighted avg       0.98      0.98      0.98     42209

