In [102]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.base import ClassifierMixin, BaseEstimator
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [103]:
# Membaca semua file csv training
additional_features_df = pd.read_csv('../dataset/train/additional_features_train.csv')
basic_features_df = pd.read_csv('../dataset/train/basic_features_train.csv')
content_features_df = pd.read_csv('../dataset/train/content_features_train.csv')
flow_features_df = pd.read_csv('../dataset/train/flow_features_train.csv')
labels_df = pd.read_csv('../dataset/train/labels_train.csv')
time_features_df = pd.read_csv('../dataset/train/time_features_train.csv')

# Menggabungkan data training dan testing untuk analisis EDA menyeluruh
data = pd.merge(basic_features_df, additional_features_df, on="id")
data = pd.merge(data, content_features_df, on="id")
data = pd.merge(data, flow_features_df, on="id")
data = pd.merge(data, labels_df, on="id")
data = pd.merge(data, time_features_df, on="id")

# Drop NaN values
data = data.dropna()
data = data.drop_duplicates()
data = data.drop(columns=['id'])

X = data.drop(columns=['label', 'attack_cat'])
X = X.select_dtypes(include=[np.number])

le = LabelEncoder()
y = data['attack_cat']
y = le.fit_transform(y)

data.head()

Unnamed: 0,state,dur,sbytes,dbytes,sttl,dttl,sloss,dloss,service,sload,...,proto,attack_cat,label,sjit,djit,sinpkt,dinpkt,tcprtt,synack,ackdat
14,INT,8e-06,114.0,0.0,254.0,0.0,0.0,0.0,dns,57000000.0,...,udp,Generic,1,0.0,0.0,0.008,0.0,0.0,0.0,0.0
39,INT,1e-05,200.0,0.0,254.0,0.0,0.0,0.0,-,80000000.0,...,cpnx,Exploits,1,0.0,0.0,0.01,0.0,0.0,0.0,0.0
47,INT,8e-06,114.0,0.0,254.0,0.0,0.0,0.0,dns,57000000.0,...,udp,Generic,1,0.0,0.0,0.008,0.0,0.0,0.0,0.0
48,FIN,1.010866,7820.0,15736.0,31.0,29.0,30.0,32.0,-,61381.03,...,tcp,Normal,0,503.287312,501.260949,8.351438,8.148089,0.000632,0.000499,0.000133
73,FIN,0.84033,880.0,9528.0,62.0,252.0,2.0,4.0,http,7539.895,...,tcp,Exploits,1,5354.646413,5330.620386,93.37,59.661925,0.113626,0.064436,0.04919


In [104]:
clf = DecisionTreeClassifier(criterion="entropy", random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
predict_proba = clf.predict_proba(X_test)
cross_val = cross_val_score(clf, X, y, cv=5)
print(f"Score: {score}")
print(f"Cross Val: {cross_val}")

df = pd.DataFrame()
df['attack_cat'] = le.inverse_transform(y_test)
df['predicted'] = le.inverse_transform(clf.predict(X_test))
df.head()

Score: 0.774690637403689
Cross Val: [0.76675228 0.77305627 0.77305627 0.77585804 0.77725893]


Unnamed: 0,attack_cat,predicted
0,Exploits,DoS
1,Exploits,Exploits
2,Exploits,Exploits
3,Reconnaissance,Reconnaissance
4,Normal,Normal


In [105]:
class ID3(BaseEstimator, ClassifierMixin):
    def __init__(self, max_depth=5):
        self.max_depth = max_depth
    
    def _entropy(self, y):
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return -np.sum(probabilities * np.log2(probabilities + 1e-10))
    
    def _information_gain(self, X, y, feature_idx):
        total_entropy = self._entropy(y)
        
        unique_values = np.unique(X[:, feature_idx])
        
        weighted_entropy = 0
        for value in unique_values:
            mask = X[:, feature_idx] == value
            sub_y = y[mask]
            
            prob = len(sub_y) / len(y)
            weighted_entropy += prob * self._entropy(sub_y)
        
        return total_entropy - weighted_entropy
    
    def _build_tree(self, X, y, depth=0):
        unique_classes = np.unique(y)
        
        if len(unique_classes) == 1 or depth == self.max_depth:
            return np.argmax(np.bincount(y)) if len(y) > 0 else None
        
        if X.shape[1] == 0:
            return np.argmax(np.bincount(y))
        
        max_gain = -1
        best_feature = None
        for i in range(X.shape[1]):
            gain = self._information_gain(X, y, i)
            if gain > max_gain:
                max_gain = gain
                best_feature = i
        
        if max_gain == 0:
            return np.argmax(np.bincount(y))

        
        node = {'feature': best_feature}
        node['children'] = {}
        
        for value in np.unique(X[:, best_feature]):
            mask = X[:, best_feature] == value
            
            X_subset = np.delete(X[mask], best_feature, axis=1)
            y_subset = y[mask]
            
            subtree = self._build_tree(
                X_subset, 
                y_subset, 
                depth=depth+1
            )
            
            node['children'][value] = subtree
        
        return node
    
    def fit(self, X, y):
        X, y = check_X_y(X, y)

        self.tree_ = self._build_tree(X, y)
        
        self.n_features_in_ = X.shape[1]
        self.classes_ = np.unique(y)
        
        return self
    
    def predict(self, X):
        X = check_array(X)
        check_is_fitted(self, ['tree_', 'classes_'])
        
        predictions = [self._predict_single(x) for x in X]
        
        return predictions
    
    def predict_proba(self, X):
        X = check_array(X)
        check_is_fitted(self, ['tree_', 'classes_'])
        
        predictions = [self._predict_single(x) for x in X]
        
        proba = np.zeros((X.shape[0], len(self.classes_)))
        for i, p in enumerate(predictions):
            proba[i, p] = 1
        
        return proba

    def _predict_single(self, x):
        node = self.tree_
        
        while isinstance(node, dict):
            feature = node['feature']
            value = x[feature]
            
            if value not in node['children']:
                node = list(node['children'].values())[0]
            else:
                node = node['children'][value]

                x = np.delete(x, feature)
        
        return node

    def score(self, X, y):
        predictions = self.predict(X)
        return np.mean(predictions == y)

In [106]:
clf = ID3(max_depth=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
predict_proba = clf.predict_proba(X_test)
cross_val = cross_val_score(clf, X, y, cv=5)
print(f"Score: {score}")
print(f"Cross Val: {cross_val}")

df = pd.DataFrame()
df['attack_cat'] = le.inverse_transform(y_test)
df['predicted'] = le.inverse_transform(clf.predict(X_test))
df.head()

Score: 0.6437076815316367
Cross Val: [0.64534205 0.63390147 0.64043894 0.64043894 0.63810413]


Unnamed: 0,attack_cat,predicted
0,Exploits,Normal
1,Exploits,Exploits
2,Exploits,Normal
3,Reconnaissance,Reconnaissance
4,Normal,Normal
