In [6]:
import pandas as pd
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [7]:
# Fungsi untuk membersihkan teks (jika diperlukan)
def clean_text(text):
    # Hapus karakter non-alfanumerik dan ubah ke huruf kecil
    text = re.sub(r'\W', ' ', text)
    return text.lower()

In [8]:
# 1. Muat dataset
df = pd.read_csv('train_test_network.csv', on_bad_lines='skip')

In [9]:
# 2. Feature Selection
features = df[['src_ip', 'dst_ip', 'proto', 'src_bytes', 'dst_bytes', 'src_pkts', 'dst_pkts', 'duration']]
labels = df['label'].apply(lambda x: 1 if x == 'DDoS' else 0)

In [10]:
# 3. Handling Categorical Features
encoder = LabelEncoder()
for col in ['src_ip', 'dst_ip', 'proto']:
    features[col] = encoder.fit_transform(features[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features[col] = encoder.fit_transform(features[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features[col] = encoder.fit_transform(features[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features[col] = encoder.fit_transform(features[col])


In [11]:
# 4. Gabungkan fitur menjadi representasi teks
texts = features.astype(str).agg(' '.join, axis=1).tolist()

In [12]:
# 5. Membersihkan teks (opsional)
texts = [clean_text(text) for text in texts]

In [13]:
# 6. Tokenisasi teks menggunakan Tokenizer dari Keras
tokenizer = Tokenizer(num_words=5000)  # Batasi ke 5000 kata yang paling sering muncul
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [14]:
# Padding sequences agar memiliki panjang yang seragam
max_len = 128
X = pad_sequences(sequences, maxlen=max_len)

In [15]:
# 7. Bagi dataset menjadi data latih dan uji
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42)

In [16]:
# 8. Membangun model GRU
def build_gru_model(input_length):
    model = models.Sequential()
    model.add(layers.Embedding(input_dim=5000, output_dim=128, input_length=input_length))
    model.add(layers.GRU(units=64, return_sequences=False))
    model.add(layers.Dense(1, activation='sigmoid'))  # Untuk klasifikasi biner

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [17]:
# Inisialisasi model GRU
gru_model = build_gru_model(max_len)



In [19]:
# 9. Melatih model
gru_model.fit(X_train, y_train, epochs=3, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/3
[1m4617/4617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m543s[0m 118ms/step - accuracy: 1.0000 - loss: 9.0805e-09 - val_accuracy: 1.0000 - val_loss: 1.8380e-09
Epoch 2/3
[1m4617/4617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m544s[0m 118ms/step - accuracy: 1.0000 - loss: 1.2870e-09 - val_accuracy: 1.0000 - val_loss: 4.4152e-10
Epoch 3/3
[1m4617/4617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m562s[0m 118ms/step - accuracy: 1.0000 - loss: 3.5658e-10 - val_accuracy: 1.0000 - val_loss: 2.0132e-10


<keras.src.callbacks.history.History at 0x7c844a0eb790>

In [20]:
# 10. Evaluasi model
test_loss, test_acc = gru_model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_acc * 100:.2f}%')

[1m1979/1979[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 24ms/step - accuracy: 1.0000 - loss: 2.0130e-10
Test Accuracy: 100.00%


In [21]:
# 11. Prediksi dan laporan klasifikasi
y_pred = (gru_model.predict(X_test) > 0.5).astype("int32")  # Mengonversi probabilitas ke label biner
print(classification_report(y_test, y_pred))

[1m1979/1979[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 24ms/step
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     63313

    accuracy                           1.00     63313
   macro avg       1.00      1.00      1.00     63313
weighted avg       1.00      1.00      1.00     63313



In [22]:
# 12. Real-time DDoS Detection
def real_time_inference(model, traffic_data, tokenizer, max_len):
    # Tokenize and pad incoming traffic data
    sequence = tokenizer.texts_to_sequences([traffic_data])
    padded_sequence = pad_sequences(sequence, maxlen=max_len)

    # Predict the result
    prediction = model.predict(padded_sequence)
    if prediction > 0.5:
        return "DDoS Attack Detected!"
    else:
        return "Normal Traffic"

# Contoh penggunaan (misal dari input pengguna real-time)
sample_traffic = "192.168.0.1 192.168.0.2 TCP 500 1000 10 20 60"
print(real_time_inference(gru_model, sample_traffic, tokenizer, max_len))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
Normal Traffic
