In [221]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Data loading

In [222]:
cols_path = "data/Field_Names.csv"   # update path if needed

# quick preview (text)
with open(cols_path, "r", encoding="utf-8", errors="replace") as f:
    for i, line in enumerate(f):
        print(i+1, line.strip())
        if i >= 9:
            break


1 duration,continuous
2 protocol_type,symbolic
3 service,symbolic
4 flag,symbolic
5 src_bytes,continuous
6 dst_bytes,continuous
7 land,continuous
8 wrong_fragment,continuous
9 urgent,continuous
10 hot,continuous


In [223]:
dir = "data"
cols_path = os.path.join(dir, "Field_Names.csv")
cols = pd.read_csv(cols_path)['duration'].tolist()
cols.append("label")

In [224]:
train_path = "data/KDDTrain.txt"
test_path = "data/KDDTest.txt"

df_train = pd.read_csv(train_path, names=cols)
df_test = pd.read_csv(test_path, names=cols)

In [225]:
categorical_columns = df_train.select_dtypes(include=['object']).columns.tolist()

In [226]:
categorical_columns

['protocol_type', 'service', 'dst_host_srv_rerror_rate']

In [227]:
df = pd.concat([df_train, df_test]).reset_index(drop=True)
df.head()

Unnamed: 0,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,ftp_data,SF,491,0,0,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,other,SF,146,0,0,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,private,S0,0,0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,http,SF,232,8153,0,0,0,0,0,1,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,http,SF,199,420,0,0,0,0,0,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [228]:
df = pd.get_dummies(df, columns=categorical_columns)

In [229]:
df.head()

Unnamed: 0,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_srv_rerror_rate_spy,dst_host_srv_rerror_rate_sqlattack,dst_host_srv_rerror_rate_teardrop,dst_host_srv_rerror_rate_udpstorm,dst_host_srv_rerror_rate_warezclient,dst_host_srv_rerror_rate_warezmaster,dst_host_srv_rerror_rate_worm,dst_host_srv_rerror_rate_xlock,dst_host_srv_rerror_rate_xsnoop,dst_host_srv_rerror_rate_xterm
0,491,0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
1,146,0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
2,0,0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
3,232,8153,0,0,0,0,0,1,0,0,...,False,False,False,False,False,False,False,False,False,False
4,199,420,0,0,0,0,0,1,0,0,...,False,False,False,False,False,False,False,False,False,False


In [230]:
categorical_columns = df_train.select_dtypes(include=['object']).columns.tolist()

In [231]:
categorical_columns

['protocol_type', 'service', 'dst_host_srv_rerror_rate']

In [232]:
df_train.head()

Unnamed: 0,Unnamed: 1,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,tcp,ftp_data,SF,491,0,0,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
0,udp,other,SF,146,0,0,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
0,tcp,private,S0,0,0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
0,tcp,http,SF,232,8153,0,0,0,0,0,1,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
0,tcp,http,SF,199,420,0,0,0,0,0,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [233]:
X = df.drop("label", axis=1).values
y = df["label"].values

In [234]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [235]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [236]:
y_val.shape

(44556,)

In [237]:
n_features = X_train.shape[1]
n_features

158

### GA Parameters

In [238]:
POP_SIZE = 10
GENERATIONS = 8
MUT_RATE = 0.1

### GA Functions

In [239]:
def init_population():
    return np.random.randint(0, 2, size=(POP_SIZE, n_features))

def fitness(chrom):
    
    if chrom.sum() == 0:
        return 0
    
    X_tr = X_train[:, chrom == 1]
    X_vl = X_val[:, chrom == 1]

    clf = DecisionTreeClassifier()
    clf.fit(X_tr, y_train)
    y_pred = clf.predict(X_vl)

    return accuracy_score(y_val, y_pred)

def crossover(p1, p2):
    cut = np.random.randint(1, n_features-1)
    c1 = np.concatenate([p1[:cut], p2[cut:]])
    c2 = np.concatenate([p2[:cut], p1[cut:]])
    return c1, c2

def mutate(chrom):    
    for i in range(n_features):
        if np.random.rand() < MUT_RATE:
            chrom[i] = 1 - chrom[i]
    return chrom    

### GA loop

In [240]:
pop = init_population()

In [241]:
fitnesses = np.zeros(POP_SIZE)

In [242]:
pop

array([[0, 0, 0, ..., 0, 1, 1],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 1, ..., 0, 1, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 1]], shape=(10, 158), dtype=int32)

In [243]:
for gen in range(GENERATIONS):
    
    for i in range(POP_SIZE):
        fitnesses[i] = fitness(pop[i])

    best_idx = np.argsort(fitnesses)[-2:]
    parent1, parent2 = pop[best_idx]

    new_pop = [parent1, parent2]

    while len(new_pop) < POP_SIZE:
        c1, c2 = crossover(parent1, parent2)
        c1 = mutate(c1)
        c2 = mutate(c2) 
        new_pop += [c1, c2]

    pop = np.array(new_pop[:POP_SIZE])
    print(f"Generation {gen} best accuracy = {fitnesses[best_idx[-1]]:.4f}")

Generation 0 best accuracy = 0.7703
Generation 1 best accuracy = 0.7893
Generation 2 best accuracy = 0.8454
Generation 3 best accuracy = 0.8523
Generation 4 best accuracy = 0.8520
Generation 5 best accuracy = 0.8605
Generation 6 best accuracy = 0.8600
Generation 7 best accuracy = 0.8595


### Final best chromosome

In [244]:
best_idx = np.argmax(fitnesses)
best_chrom = pop[best_idx]

print("\nBest feature mask:\n", best_chrom)
print("Selected features: ", best_chrom.sum())


Best feature mask:
 [1 0 0 0 1 1 1 1 1 0 0 1 0 1 0 1 1 0 1 1 0 1 1 1 0 0 0 1 1 0 1 1 0 0 1 1 1
 1 0 1 0 1 0 0 1 0 1 0 1 0 0 1 0 0 0 0 0 1 1 1 0 1 1 0 0 1 1 1 1 1 1 1 1 0
 1 0 1 0 1 1 1 0 0 1 1 1 0 1 0 1 1 1 0 0 0 1 1 1 0 0 1 1 1 1 0 0 1 0 0 0 0
 0 0 0 1 1 0 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 0 1 1 0 0 1 0 1 1 0 0 1 0 0 0 1
 1 1 1 0 1 1 1 0 0 1]
Selected features:  86


### Model accuracy

In [245]:
clf_final = DecisionTreeClassifier()
clf_final.fit(X_train[:, best_chrom==1], y_train)
pred = clf_final.predict(X_val[:, best_chrom==1])
acc = accuracy_score(y_val, pred)
print("GA_IDS accuracy: ", acc)

GA_IDS accuracy:  0.8125504982493941
