In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore")

In [2]:
dataset=pd.read_csv("/content/sample_data/kdd20_with_columns.csv")

In [3]:
dataset.isnull().sum()

Unnamed: 0,0
duration,0
protocol_type,0
service,0
flag,0
src_bytes,0
dst_bytes,0
land,0
wrong_fragment,0
urgent,0
hot,0


In [4]:
encoder=OneHotEncoder(handle_unknown='ignore')
transformed_array1=encoder.fit_transform(dataset.iloc[:,1:2]).toarray()
categories1=encoder.categories_
transformed_array1,categories1

(array([[0., 1., 0.],
        [0., 0., 1.],
        [0., 1., 0.],
        ...,
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.]]),
 [array(['icmp', 'tcp', 'udp'], dtype=object)])

In [5]:
transformed_array2=encoder.fit_transform(dataset.iloc[:,2:3]).toarray()
categories2=encoder.categories_
transformed_array2,categories2

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 [array(['IRC', 'X11', 'Z39_50', 'auth', 'bgp', 'courier', 'csnet_ns',
         'ctf', 'daytime', 'discard', 'domain', 'domain_u', 'echo', 'eco_i',
         'ecr_i', 'efs', 'exec', 'finger', 'ftp', 'ftp_data', 'gopher',
         'hostnames', 'http', 'http_443', 'http_8001', 'imap4', 'iso_tsap',
         'klogin', 'kshell', 'ldap', 'link', 'login', 'mtp', 'name',
         'netbios_dgm', 'netbios_ns', 'netbios_ssn', 'netstat', 'nnsp',
         'nntp', 'ntp_u', 'other', 'pm_dump', 'pop_2', 'pop_3', 'printer',
         'private', 'red_i', 'remote_job', 'rje', 'shell', 'smtp',
         'sql_net', 'ssh', 'sunrpc', 'supdup', 'systat', 'telnet', 'tim_i',
         'time', 'urh_i', 'urp_i', 'uucp', 'uucp_path', 'vmnet', 'whois'],
        dtype=object)

In [6]:
transformed_array3=encoder.fit_transform(dataset.iloc[:,3:4]).toarray()
categories3=encoder.categories_
transformed_array3,categories3

(array([[0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 [array(['OTH', 'REJ', 'RSTO', 'RSTOS0', 'RSTR', 'S0', 'S1', 'S2', 'S3',
         'SF', 'SH'], dtype=object)])

In [7]:
transformed_array4=encoder.fit_transform(dataset.iloc[:,41:42]).toarray()
categories4=encoder.categories_
transformed_array4,categories4

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 [array(['back', 'buffer_overflow', 'ftp_write', 'guess_passwd', 'imap',
         'ipsweep', 'land', 'loadmodule', 'multihop', 'neptune', 'nmap',
         'normal', 'phf', 'pod', 'portsweep', 'rootkit', 'satan', 'smurf',
         'spy', 'teardrop', 'warezclient', 'warezmaster'], dtype=object)])

In [8]:
dataset[['label','difficulty_level']]=dataset[['difficulty_level','label']]

In [9]:
dataset.rename(columns={'label': 'difficulty_level', 'difficulty_level': 'label'}, inplace=True)

In [10]:
dataset.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,difficulty_level,label
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,20,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,15,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,19,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,21,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21,normal


In [11]:
features=dataset.drop(columns=['label'])
labels=dataset['label']
dataset_encoded=pd.get_dummies(dataset, columns=['protocol_type', 'service', 'flag', 'label'], drop_first=True)
features=dataset_encoded.drop(columns=['label_neptune', 'label_normal'])
labels=dataset_encoded[['label_neptune', 'label_normal']]
scaler=MinMaxScaler()
features_scaled=scaler.fit_transform(features)
dataset.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,difficulty_level,label
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,20,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,15,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,19,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,21,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21,normal


In [12]:
dataset['difficulty_level'].max()

21

Using LSTM

In [13]:
sequence_length=10
x,y=[],[]
for i in range(len(features_scaled) - sequence_length):
    x.append(features_scaled[i:i + sequence_length])
    y.append(labels.iloc[i + sequence_length].values)

x,y=np.array(x),np.array(y)
X_train, X_test, y_train, y_test=train_test_split(x, y, test_size=0.2, random_state=42)

In [14]:
model = Sequential()
model.add(LSTM(128, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(64))
model.add(Dropout(0.2))
model.add(Dense(y_train.shape[1], activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
early_stopping=EarlyStopping(monitor='val_loss', patience=5)
history=model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, callbacks=[early_stopping])
loss, accuracy=model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')
predictions=model.predict(X_test)
predicted_labels=(predictions > 0.5).astype(int)

Epoch 1/10
[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 29ms/step - accuracy: 0.5274 - loss: 0.6691 - val_accuracy: 0.5453 - val_loss: 0.6602
Epoch 2/10
[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 34ms/step - accuracy: 0.5175 - loss: 0.6668 - val_accuracy: 0.5453 - val_loss: 0.6589
Epoch 3/10
[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 29ms/step - accuracy: 0.5294 - loss: 0.6644 - val_accuracy: 0.5453 - val_loss: 0.6593
Epoch 4/10
[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 29ms/step - accuracy: 0.5229 - loss: 0.6656 - val_accuracy: 0.5453 - val_loss: 0.6590
Epoch 5/10
[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 29ms/step - accuracy: 0.5339 - loss: 0.6612 - val_accuracy: 0.5450 - val_loss: 0.6614
Epoch 6/10
[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 33ms/step - accuracy: 0.5265 - loss: 0.6624 - val_accuracy: 0.5453 - val_loss: 0.6598
Epoch 7/10
[1m5

In [15]:
print(predicted_labels)

[[0 1]
 [0 1]
 [0 1]
 ...
 [0 1]
 [0 1]
 [0 1]]


The Accuracy is too low for LSTM.....switching to supervised models

Using Supervised Model

In [16]:
X = dataset.drop(columns=['label'])
y = dataset['label']
numerical_cols=X.select_dtypes(include=['float64', 'int']).columns.tolist()
categorical_cols=X.select_dtypes(include=['object']).columns.tolist()
preprocessor=ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=42)
models = {
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'SVM': SVC(),
    'Gradient Boosting': GradientBoostingClassifier()
}
for model_name, model in models.items():
    pipeline=Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    pipeline.fit(X_train, y_train)
    y_pred=pipeline.predict(X_test)
    accuracy=accuracy_score(y_test, y_pred)
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy}")
    print(classification_report(y_test, y_pred))
    results_df=pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
    print(results_df.head())
    print('-' * 50)


Model: Random Forest
Accuracy: 0.9976185751141099
                 precision    recall  f1-score   support

           back       1.00      1.00      1.00        36
buffer_overflow       1.00      1.00      1.00         1
   guess_passwd       1.00      1.00      1.00         2
        ipsweep       1.00      0.99      1.00       147
     loadmodule       0.00      0.00      0.00         1
        neptune       1.00      1.00      1.00      1694
           nmap       0.98      1.00      0.99        51
         normal       1.00      1.00      1.00      2674
            pod       1.00      1.00      1.00        10
      portsweep       0.99      0.99      0.99       117
          satan       0.99      0.98      0.98       130
          smurf       1.00      1.00      1.00       114
            spy       0.00      0.00      0.00         1
       teardrop       1.00      1.00      1.00        31
    warezclient       0.93      0.96      0.95        28
    warezmaster       1.00      1.00 

In [17]:
from sklearn.model_selection import GridSearchCV
param_grid={
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20]
}
grid_search = GridSearchCV(Pipeline(steps=[('preprocessor', preprocessor),
                                            ('classifier', RandomForestClassifier())]),
                            param_grid,
                            cv=5)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_


In [18]:
from sklearn.metrics import roc_auc_score
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

                 precision    recall  f1-score   support

           back       1.00      1.00      1.00        36
buffer_overflow       1.00      1.00      1.00         1
   guess_passwd       1.00      1.00      1.00         2
        ipsweep       0.99      0.99      0.99       147
     loadmodule       0.00      0.00      0.00         1
        neptune       1.00      1.00      1.00      1694
           nmap       0.96      1.00      0.98        51
         normal       1.00      1.00      1.00      2674
            pod       1.00      1.00      1.00        10
      portsweep       0.99      0.99      0.99       117
          satan       0.99      0.98      0.99       130
          smurf       1.00      1.00      1.00       114
            spy       0.00      0.00      0.00         1
       teardrop       1.00      1.00      1.00        31
    warezclient       0.93      0.96      0.95        28
    warezmaster       1.00      1.00      1.00         2

       accuracy              

In [19]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, classification_report

dataset = pd.read_csv("/content/sample_data/kdd20_with_columns.csv")

X = dataset.drop(columns=['label', 'difficulty_level'])
y = dataset[['label', 'difficulty_level']]

y['label'] = y['label'].astype('category').cat.codes
y['difficulty_level'] = y['difficulty_level'].astype('category').cat.codes

numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', MultiOutputClassifier(SVC(random_state=42)))
])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [20]:
import pandas as pd
import math
data = {
    'duration': [1, 23, 0, 12, 7, 14],
    'protocol_type': ['udp', 'tcp', 'tcp', 'udp', 'tcp', 'tcp'],
    'service': ['http', 'private', 'http', 'ftp', 'other', 'private'],
    'flag': ['SF', 'REJ', 'SF', 'SF', 'S0', 'SF'],
    'src_bytes': [231, 56, 120, 454, 890, 345],
    'dst_bytes': [1200, 500, 0, 1050, 345, 678],
    'land': [0, 0, 0, 1, 0, 0],
    'wrong_fragment': [0, 1, 0, 0, 0, 1],
    'urgent': [1, 0, 0, 0, 0, 0],
    'hot': [0, 1, 0, 0, 0, 0],
    'num_failed_logins': [0, 2, 0, 0, 1, 0],
    'logged_in': [1, 0, 0, 1, 0, 0],
    'num_compromised': [0, 1, 0, 0, 0, 0],
    'root_shell': [0, 0, 1, 0, 0, 0],
    'su_attempted': [0, 1, 0, 0, 0, 0],
    'num_root': [1, 0, 0, 0, 0, 1],
    'num_file_creations': [0, 0, 1, 0, 0, 0],
    'num_shells': [0, 0, 0, 1, 0, 0],
    'num_access_files': [0, 1, 0, 0, 0, 0],
    'num_outbound_cmds': [0, 0, 0, 0, 1, 0],
    'is_host_login': [0, 0, 1, 0, 0, 0],
    'is_guest_login': [1, 0, 0, 0, 1, 0],
    'count': [5, 20, 30, 2, 45, 35],
    'srv_count': [2, 3, 7, 4, 1, 8],
    'serror_rate': [0.1, 0.3, 1, 0.2, 0.4, 0],
    'srv_serror_rate': [0.2, 0.1, 0.8, 0.2, 0, 0.5],
    'rerror_rate': [0.2, 0.5, 0.1, 0.3, 0.4, 0],
    'srv_rerror_rate': [0.3, 0.2, 0.4, 0.1, 0, 0.6],
    'same_srv_rate': [0.6, 0.5, 0.4, 0.3, 1, 0.2],
    'diff_srv_rate': [0.4, 0.1, 0.2, 0.3, 0, 0.5],
    'srv_diff_host_rate': [0, 0.2, 0.1, 0.3, 0.4, 0.5],
    'dst_host_count': [200, 255, 150, 190, 255, 100],
    'dst_host_srv_count': [100, 80, 90, 70, 50, 60],
    'dst_host_same_srv_rate': [0.8, 0.9, 0.7, 0.6, 1, 0.5],
    'dst_host_diff_srv_rate': [0.2, 0.1, 0.3, 0.4, 0, 0.6],
    'dst_host_same_src_port_rate': [0.5, 0.4, 0.6, 0.7, 0.3, 0.2],
    'dst_host_srv_diff_host_rate': [0.1, 0.2, 0.3, 0.4, 0.1, 0.2],
    'dst_host_serror_rate': [0, 0.2, 0.3, 0.4, 0, 0.1],
    'dst_host_srv_serror_rate': [0.2, 0.1, 0.4, 0.3, 0.5, 0],
    'dst_host_rerror_rate': [0.1, 0.3, 0.2, 0.4, 0.1, 0.5],
    'dst_host_srv_rerror_rate': [0.3, 0.1, 0.2, 0.3, 0.4, 0.5]
}
new_data = pd.DataFrame(data)
new_data_preprocessed = model.named_steps['preprocessor'].transform(new_data)
predictions = model.named_steps['classifier'].predict(new_data_preprocessed)
difficulty_level_pred = predictions[:, 1]
difficulty_level_pred_floor = [math.floor(dl / 2) for dl in difficulty_level_pred]
print("Predicted difficulty levels:", difficulty_level_pred_floor)


Predicted difficulty levels: [10, 9, 9, 9, 10, 9]
