In [64]:
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

df_train = pd.read_csv('KDDTrain+.txt')
df_test = pd.read_csv('KDDTest+.txt')

columns = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land',
            'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
            'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
            'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count',
            'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
            'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
            'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
            'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
            'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'attack',
            'level']

df_train.columns = columns
df_test.columns = columns

df_train.drop('level', axis=1, inplace=True)
df_test.drop('level', axis=1, inplace=True)

df_train.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack
0,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
1,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
2,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
3,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal
4,0,tcp,private,REJ,0,0,0,0,0,0,...,19,0.07,0.07,0.0,0.0,0.0,0.0,1.0,1.0,neptune


In [65]:
print(df_train.shape)
print(df_test.shape)
print(len(columns))

(125972, 42)
(22543, 42)
43


In [66]:
ddos_types = ['apache2','back','land','neptune','mailbomb','pod','processtable',
              'smurf', 'teardrop','udpstorm','worm']

is_ddos_train = []

for attack in df_train['attack']:
    if attack in ddos_types:
        is_ddos_train.append(1)
    else:
        is_ddos_train.append(0)

df_train['is_ddos'] = is_ddos_train

is_ddos_test = []

for attack in df_test['attack']:
    if attack in ddos_types:
        is_ddos_test.append(1)
    else:
        is_ddos_test.append(0)

df_test['is_ddos'] = is_ddos_test

df_train.drop('attack', axis=1, inplace=True)
df_test.drop('attack', axis=1, inplace=True)

df_train.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,is_ddos
0,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,0
1,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,1
2,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,0
3,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0,tcp,private,REJ,0,0,0,0,0,0,...,19,0.07,0.07,0.0,0.0,0.0,0.0,1.0,1.0,1


In [67]:
max(df_train.isnull().sum())

0

In [68]:
max(df_test.isnull().sum())

0

In [69]:
df_train['is_ddos'].value_counts()

is_ddos
0    80045
1    45927
Name: count, dtype: int64

In [70]:
protocol_encoding_train = pd.get_dummies(df_train['protocol_type'], dtype=int)
service_encoding_train = pd.get_dummies(df_train['service'], dtype=int)
flag_encoding_train = pd.get_dummies(df_train['flag'], dtype=int)

df_train = df_train.join(protocol_encoding_train)
df_train = df_train.join(service_encoding_train)
df_train = df_train.join(flag_encoding_train)


protocol_encoding_test = pd.get_dummies(df_test['protocol_type'], dtype=int)
service_encoding_test = pd.get_dummies(df_test['service'], dtype=int)
flag_encoding_test = pd.get_dummies(df_test['flag'], dtype=int)

df_test = df_test.join(protocol_encoding_test)
df_test = df_test.join(service_encoding_test)
df_test = df_test.join(flag_encoding_test)

df_train.drop(['protocol_type', 'service', 'flag'], inplace=True, axis=1)
df_test.drop(['protocol_type', 'service', 'flag'], inplace=True, axis=1)

is_ddos_column_train = df_train.pop('is_ddos')
df_train['is_ddos'] = is_ddos_column_train

is_ddos_column_test = df_test.pop('is_ddos')
df_test['is_ddos'] = is_ddos_column_test

df_train.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,RSTO,RSTOS0,RSTR,S0,S1,S2,S3,SF,SH,is_ddos
0,0,146,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,0,232,8153,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,0,199,420,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [72]:
print(df_train.shape)
print(df_test.shape)

(125972, 123)
(22543, 117)


In [73]:
columns_train = list(df_train.columns)
columns_test = list(df_test.columns)

for column in columns_train:
    if column not in columns_test:
        df_train.pop(column)

In [74]:
print(df_train.shape)
print(df_test.shape)

(125972, 117)
(22543, 117)


In [75]:
scaler = StandardScaler()

Y_train = df_train['is_ddos']
df_train.drop('is_ddos', inplace=True, axis=1)
df_train_scaled = scaler.fit_transform(df_train)
X_train = df_train_scaled

Y_test = df_test['is_ddos']
df_test.drop('is_ddos', inplace=True, axis=1)
df_test_scaled = scaler.fit_transform(df_test)
X_test = df_test_scaled

In [79]:
print(X_train.shape)
print(X_test.shape)

(125972, 116)
(22543, 116)
(22543,)


In [82]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(64, activation='relu', input_dim = X_train.shape[1]))
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_9 (Dense)             (None, 64)                7488      
                                                                 
 dense_10 (Dense)            (None, 32)                2080      
                                                                 
 dense_11 (Dense)            (None, 1)                 33        
                                                                 
Total params: 9601 (37.50 KB)
Trainable params: 9601 (37.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [83]:
history = model.fit(X_train, Y_train, epochs=30, batch_size=32)

Epoch 1/30


2023-12-29 19:43:03.149368: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 58451008 exceeds 10% of free system memory.


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [84]:
model.evaluate(X_test, Y_test)



[2.2333197593688965, 0.9297786355018616]