In [44]:
#Importing Libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [45]:
# Load the training dataset from CSV into a DataFrame.
# NOTE(review): the path is an absolute Colab-style location; the notebook
# will only run where this file exists at exactly this path.
df_train = pd.read_csv(filepath_or_buffer="/content/traindata.csv")

In [46]:
# Encode the categorical columns as numeric codes, overwriting them in df_train.
# With handle_unknown="use_encoded_value", categories that were not present
# when the encoder was fitted are mapped to -1 by later transform() calls.
# NOTE(review): the encoder is fitted on the whole frame, i.e. before the
# train/test split performed in a later cell, so the held-out rows also
# influence the category mapping - confirm this is acceptable.
categorical_cols = ["protocol_type", "service", "flag"]
encoder = OrdinalEncoder(unknown_value=-1, handle_unknown="use_encoded_value")
df_train[categorical_cols] = encoder.fit_transform(df_train[categorical_cols])

In [47]:
# Separate the feature columns from the target column 'Class'.
# Note: x_train / Y_train hold the FULL dataset. The train/test split in the
# next cell creates X_train / y_train, which differ from these names only by
# letter case - take care not to mix them up.
x_train = df_train.drop(columns='Class')
Y_train = df_train['Class']

In [48]:
# Hold out 40% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    x_train,
    Y_train,
    test_size=0.4,
    random_state=42,
)

In [49]:
# Standardise every feature column (this includes the ordinal-encoded
# categorical columns, since the whole frame is passed in).
# The scaler's statistics are learned from the training split only and then
# applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [50]:
# Feature selection driven by random-forest feature importances.
# A 100-tree forest (fixed seed) is fitted on the scaled training split.
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train_scaled, y_train)

# prefit=True: reuse the already-fitted forest instead of refitting it.
# threshold="median": keep features whose importance is at least the median
# importance across all features.
selector = SelectFromModel(estimator=rf, prefit=True, threshold="median")

# Apply the same column mask to both splits.
X_train_selected, X_test_selected = (
    selector.transform(matrix) for matrix in (X_train_scaled, X_test_scaled)
)

# Report how many columns survived the selection.
print("Original Features:", X_train_scaled.shape[1])
print("Selected Features:", X_train_selected.shape[1])

Original Features: 43
Selected Features: 22


In [51]:
# Fit a regularised decision tree on the selected features.
# Depth, split/leaf size limits and cost-complexity pruning (ccp_alpha) all
# constrain tree growth; random_state fixes any tie-breaking randomness.
tree = DecisionTreeClassifier(
    max_depth=8,
    min_samples_split=10,
    min_samples_leaf=5,
    ccp_alpha=0.009,
    random_state=0,
).fit(X_train_selected, y_train)

# score() returns mean accuracy; compare the two splits to spot overfitting.
train_accuracy = tree.score(X_train_selected, y_train)
test_accuracy = tree.score(X_test_selected, y_test)

print("Accuracy on training set: {:.3f}".format(train_accuracy))
print("Accuracy on test set: {:.3f}".format(test_accuracy))

Accuracy on training set: 0.970
Accuracy on test set: 0.968


In [53]:
# Evaluate the fitted tree with the F1-score on each split.
# f1_score is called with its default settings (no `average` / `pos_label`
# passed) - presumably 'Class' is a binary label; verify against the data.

# Training split: predict, then score.
y_train_pred = tree.predict(X_train_selected)
train_f1 = f1_score(y_true=y_train, y_pred=y_train_pred)

# Held-out split: predict, then score.
y_test_pred = tree.predict(X_test_selected)
test_f1 = f1_score(y_true=y_test, y_pred=y_test_pred)

print("F1 Score on Training Set: {:.3f}".format(train_f1))
print("F1 Score on Test Set: {:.3f}".format(test_f1))

F1 Score on Training Set: 0.969
F1 Score on Test Set: 0.967
