# Import libraries and frameworks

In [1]:
!pip install tabtransformertf
!pip install tensorflow-addons
from IPython.display import clear_output
clear_output()

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow_addons as tfa
from sklearn.metrics import precision_score, recall_score, f1_score
from tabtransformertf.utils.preprocessing import df_to_dataset, build_categorical_prep
from tabtransformertf.models.fttransformer import FTTransformerEncoder, FTTransformer
from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
# Print the full path of every file found under the Kaggle input directory.
import os

for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

        

/kaggle/input/cicids2017/MachineLearningCSV.md5
/kaggle/input/cicids2017/MachineLearningCSV/MachineLearningCVE/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
/kaggle/input/cicids2017/MachineLearningCSV/MachineLearningCVE/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
/kaggle/input/cicids2017/MachineLearningCSV/MachineLearningCVE/Tuesday-WorkingHours.pcap_ISCX.csv
/kaggle/input/cicids2017/MachineLearningCSV/MachineLearningCVE/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
/kaggle/input/cicids2017/MachineLearningCSV/MachineLearningCVE/Monday-WorkingHours.pcap_ISCX.csv
/kaggle/input/cicids2017/MachineLearningCSV/MachineLearningCVE/Friday-WorkingHours-Morning.pcap_ISCX.csv
/kaggle/input/cicids2017/MachineLearningCSV/MachineLearningCVE/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
/kaggle/input/cicids2017/MachineLearningCSV/MachineLearningCVE/Wednesday-workingHours.pcap_ISCX.csv
/kaggle/input/network-intrusion-detection/Train_data.csv
/kaggle/input/network-intr

# Data Preprocessing 

In [4]:
# Load the labelled CSV, then binarise the target column:
# 'normal' -> 0, any other label -> 1.
data_df = pd.read_csv('/kaggle/input/network-intrusion-detection/Train_data.csv')
data_df['class'] = (data_df['class'] != 'normal').astype('int64')

In [5]:
# Columns handled as categorical features.
CATEGORICAL_FEATURES = ['protocol_type', 'service', 'flag']

# Columns handled as numeric features. The order here fixes the column order
# of FEATURES below.
NUMERIC_FEATURES = [
    'duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent',
    'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
    'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
    'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login',
    'count', 'srv_count',
    'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
]

# All model inputs (categorical columns first) and the label column.
FEATURES = [*CATEGORICAL_FEATURES, *NUMERIC_FEATURES]
TARGET_FEATURE = 'class'

# Standardise the numeric columns of data_df, overwriting them in place.
# NOTE(review): the scaler is fitted on the full frame, i.e. before the
# train/validation/test split made later in the notebook, so held-out rows
# contribute to the scaling statistics.
sc = StandardScaler()
data_df[NUMERIC_FEATURES] = sc.fit_transform(data_df[NUMERIC_FEATURES])

In [6]:
# Two-stage split: first hold out 33% of the rows, then divide that hold-out
# into validation (34% of it) and test (66% of it). Seeds are fixed so the
# split is reproducible.
train_data, holdout_data = train_test_split(
    data_df, test_size=0.33, shuffle=True, random_state=8
)
val_data, test_data = train_test_split(
    holdout_data, test_size=0.66, shuffle=True, random_state=12
)

for split_label, split_frame in (
    ("Train samples:", train_data),
    ("Valid samples:", val_data),
    ("Test samples:", test_data),
):
    print(split_label, len(split_frame))


Train samples: 16878
Valid samples: 2826
Test samples: 5488


In [7]:
# Convert each split into a TF dataset. Only the training split is shuffled.
MODEL_COLUMNS = FEATURES + [TARGET_FEATURE]
BATCH_SIZE = 256

train_dataset = df_to_dataset(
    train_data[MODEL_COLUMNS], TARGET_FEATURE, shuffle=True, batch_size=BATCH_SIZE
)
val_dataset = df_to_dataset(
    val_data[MODEL_COLUMNS], TARGET_FEATURE, shuffle=False, batch_size=BATCH_SIZE
)
test_dataset = df_to_dataset(
    test_data[MODEL_COLUMNS], TARGET_FEATURE, shuffle=False, batch_size=BATCH_SIZE
)

# Categorical preprocessing layers, built from the training split only.
category_prep_layers = build_categorical_prep(train_data, CATEGORICAL_FEATURES)

  dataset[key] = value[:, tf.newaxis]
100%|██████████| 3/3 [00:00<00:00, 54.65it/s]


In [8]:
def train_model(model, epochs=3, lr=0.001, weight_decay=0.0001, patience=10,
                train_ds=None, val_ds=None):
    """Compile and fit a binary classifier, then return it.

    Args:
        model: Keras model to train. It is (re)compiled here with AdamW,
            binary cross-entropy and a binary-accuracy metric.
        epochs: Maximum number of training epochs.
        lr: Learning rate passed to AdamW.
        weight_decay: Weight-decay coefficient passed to AdamW.
        patience: Number of epochs without improvement in validation loss
            before early stopping ends training.
        train_ds: Training dataset; defaults to the notebook-level
            ``train_dataset`` when omitted.
        val_ds: Validation dataset; defaults to the notebook-level
            ``val_dataset`` when omitted.

    Returns:
        The same ``model`` object after fitting.
    """
    # Fall back to the notebook-level datasets so existing
    # `train_model(model)` calls keep working unchanged.
    if train_ds is None:
        train_ds = train_dataset
    if val_ds is None:
        val_ds = val_dataset

    optimizer = tfa.optimizers.AdamW(learning_rate=lr, weight_decay=weight_decay)

    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=[tf.keras.metrics.BinaryAccuracy(name="binary_accuracy")],
    )

    # Monitor the validation loss: the only metric compiled above is named
    # "binary_accuracy", so a "val_accuracy" key is never logged and a
    # callback watching it would never be able to stop training.
    # "val_loss" is always logged when validation data is supplied.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor="val_loss", mode="min", patience=patience
    )

    model.fit(
        train_ds,
        epochs=epochs,
        validation_data=val_ds,
        callbacks=[early_stopping],
    )
    return model

def print_metrics(true_labels, predicted_labels):
    """Print precision, recall and F1 for a set of predictions.

    Args:
        true_labels: Ground-truth labels.
        predicted_labels: Predicted labels, in the same order as
            ``true_labels``.

    Returns:
        None. The three scores are written to stdout.
    """
    scorers = (
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    )
    # Compute every score before printing so nothing is printed if one fails.
    results = [
        (label, scorer(true_labels, predicted_labels)) for label, scorer in scorers
    ]
    for label, value in results:
        print(label, value)
    


# FTTransformer

In [9]:
# Encoder settings collected in one place, then unpacked into the constructor.
encoder_kwargs = dict(
    numerical_features=NUMERIC_FEATURES,        # names of the numeric columns
    categorical_features=CATEGORICAL_FEATURES,  # names of the categorical columns
    numerical_data=data_df[NUMERIC_FEATURES].values,
    categorical_data=data_df[CATEGORICAL_FEATURES].values,
    y=None,
    numerical_embedding_type='linear',
    embedding_dim=32,
    depth=3,
    heads=6,
    attn_dropout=0.3,
    ff_dropout=0.3,
    explainable=True,
)
ft_linear_encoder = FTTransformerEncoder(**encoder_kwargs)

# Binary classification head: one output unit with a sigmoid activation.
ft_model = FTTransformer(
    encoder=ft_linear_encoder,
    out_dim=1,
    out_activation='sigmoid',
)

In [10]:
# Fit the FT-Transformer: train_model compiles the model, fits it on the
# training dataset with the validation dataset, and returns the same object.
ft_model = train_model(ft_model)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [11]:
# Score the trained FT-Transformer on the test split.
predicted_probabilities = ft_model.predict(test_dataset)

# The prediction is a mapping; its 'output' entry holds the scores that are
# thresholded at 0.5 to obtain hard 0/1 labels.
output_scores = predicted_probabilities['output']
predicted_labels = np.where(output_scores >= 0.5, 1, 0)

# Ground-truth labels of the test split; later cells reuse this array.
true_labels = test_data['class'].to_numpy(copy=True)

print_metrics(true_labels, predicted_labels)

Precision: 0.988714228133817
Recall: 0.9804156674660272
F1 Score: 0.9845474613686533


# Data Preprocessing for Other Models

In [12]:
# Numeric-only frame for the scikit-learn baselines: the categorical columns
# are dropped here, not encoded.
data_encoded = data_df.drop(columns=CATEGORICAL_FEATURES)

# Derive the baseline splits from the splits already made for the
# FT-Transformer instead of re-running train_test_split with copied seeds.
# The rows are the same, and the test rows are guaranteed to line up with
# `true_labels`, which was taken from `test_data`.
train_data_encoded = train_data.drop(columns=CATEGORICAL_FEATURES)
val_data_encoded = val_data.drop(columns=CATEGORICAL_FEATURES)
test_data_encoded = test_data.drop(columns=CATEGORICAL_FEATURES)

print("Train samples:", len(train_data_encoded))
print("Valid samples:", len(val_data_encoded))
print("Test samples:", len(test_data_encoded))

Train samples: 16878
Valid samples: 2826
Test samples: 5488


# SVM Model

In [13]:
# Linear-kernel SVM baseline, fitted on the training split.
svm_train_X = train_data_encoded.drop(columns=['class'])
svm_train_y = train_data_encoded['class']
svm_model = SVC(kernel='linear')
svm_model.fit(svm_train_X, svm_train_y)

# Predict on the test split and report the scores.
svm_test_X = test_data_encoded.drop(columns=['class'])
test_predictions = svm_model.predict(svm_test_X)
print_metrics(true_labels, test_predictions)

Precision: 0.9569034317637669
Recall: 0.9584332533972821
F1 Score: 0.9576677316293929


# Logistic Regression Model

In [14]:
# Logistic-regression baseline, allowed up to 400 solver iterations.
lr_train_X = train_data_encoded.drop(columns=['class'])
lr_train_y = train_data_encoded['class']
lr_model = LogisticRegression(max_iter=400)
lr_model.fit(lr_train_X, lr_train_y)

# Predict on the test split and report the scores.
lr_test_X = test_data_encoded.drop(columns=['class'])
test_predictions = lr_model.predict(lr_test_X)
print_metrics(true_labels, test_predictions)

Precision: 0.9415532425940752
Recall: 0.9400479616306955
F1 Score: 0.9408


# MLP Model

In [15]:
# MLP baseline with hidden layers of 20 and 10 units, capped at 10
# iterations, with a fixed seed.
mlp_train_X = train_data_encoded.drop(columns=['class'])
mlp_train_y = train_data_encoded['class']
mlp_model = MLPClassifier(
    hidden_layer_sizes=(20, 10),
    max_iter=10,
    random_state=42,
)
mlp_model.fit(mlp_train_X, mlp_train_y)

# Predict on the test split and report the scores.
mlp_test_X = test_data_encoded.drop(columns=['class'])
test_predictions = mlp_model.predict(mlp_test_X)
print_metrics(true_labels, test_predictions)

Precision: 0.971266693646297
Recall: 0.9592326139088729
F1 Score: 0.9652121455861653




# Voting Model

In [16]:
# Hard-voting ensemble over the three baselines defined in earlier cells.
ensemble_members = [
    ("SVM", svm_model),
    ("Logistic Regression", lr_model),
    ("MLP", mlp_model),
]
v_clf = VotingClassifier(estimators=ensemble_members, voting="hard")

vote_train_X = train_data_encoded.drop(columns=['class'])
vote_train_y = train_data_encoded['class']
v_clf.fit(vote_train_X, vote_train_y)

# Predict on the test split and report the scores.
vote_test_X = test_data_encoded.drop(columns=['class'])
test_predictions = v_clf.predict(vote_test_X)

print_metrics(true_labels, test_predictions)



Precision: 0.9552960128876359
Recall: 0.9480415667466027
F1 Score: 0.951654964894684
