In [None]:
import os
from google.colab import files

# Set up Kaggle API credentials, then download the NSL-KDD dataset.
# Work from /content so the uploaded file and later relative paths resolve there.
os.chdir('/content')
files.upload()  # Opens the Colab upload prompt; choose your kaggle.json API token file

# Move kaggle.json to ~/.kaggle/ and restrict it to owner read/write (mode 600),
# since the file holds an API credential.
# NOTE(review): presumably ~/.kaggle/kaggle.json is where the kaggle CLI looks for it — confirm.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download the hassan06/nslkdd dataset into /content/dataset/ and unzip it there
!kaggle datasets download -d hassan06/nslkdd -p /content/dataset/ --unzip

Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/hassan06/nslkdd
License(s): unknown
Downloading nslkdd.zip to /content/dataset
 65% 9.00M/13.9M [00:00<00:00, 77.1MB/s]
100% 13.9M/13.9M [00:00<00:00, 90.7MB/s]


In [4]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

# Never truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# ======================
# Data Loading and Preprocessing
# ======================

# Field names applied to the raw dataset files, in file order (one per line
# so individual entries are easy to diff and count).
columns = [
    'duration',
    'protocol_type',
    'service',
    'flag',
    'src_bytes',
    'dst_bytes',
    'land',
    'wrong_fragment',
    'urgent',
    'hot',
    'num_failed_logins',
    'logged_in',
    'num_compromised',
    'root_shell',
    'su_attempted',
    'num_root',
    'num_file_creations',
    'num_shells',
    'num_access_files',
    'num_outbound_cmds',
    'is_host_login',
    'is_guest_login',
    'count',
    'srv_count',
    'serror_rate',
    'srv_serror_rate',
    'rerror_rate',
    'srv_rerror_rate',
    'same_srv_rate',
    'diff_srv_rate',
    'srv_diff_host_rate',
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    # The last two fields are not input features.
    'label',
    'difficulty_level',
]

# Read the train and test splits. `names=columns` supplies the header (the
# first row of each file is read as data) and `index_col=False` stops pandas
# from using the first field as the index.
train_df = pd.read_csv(
    'dataset/nsl-kdd/KDDTrain+.txt',
    names=columns,
    index_col=False,
)
test_df = pd.read_csv(
    'dataset/nsl-kdd/KDDTest+.txt',
    names=columns,
    index_col=False,
)

# Stack both splits into a single frame with a fresh 0..n-1 index so the
# preprocessing below is applied consistently to train and test rows.
data_df = pd.concat([train_df, test_df], ignore_index=True)

# The difficulty level is not used, so remove it.
data_df = data_df.drop(columns='difficulty_level')

# Find every column that holds a single distinct value, drop them together,
# then report each one in column order.
constant_cols = [name for name in data_df.columns if data_df[name].nunique() == 1]
data_df = data_df.drop(columns=constant_cols)
for col in constant_cols:
    print(f'dropped {col} column since it only has one unique value')


dropped num_outbound_cmds column since it only has one unique value


In [5]:
# Display the rows at positions 0, 1 and 2 as a quick sanity check of the frame.
data_df.iloc[list(range(3))]

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,150,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,1,0.0,0.0,0.0,0.0,0.08,0.15,0.0,255,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,123,6,1.0,1.0,0.0,0.0,0.05,0.07,0.0,255,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune


In [15]:
!rm transformer.py
!rm ft_transformer.py
!wget https://raw.githubusercontent.com/RealDanielWei/MLModels/master/transformer.py
!wget https://raw.githubusercontent.com/RealDanielWei/MLModels/master/ft_transformer.py


--2024-09-15 21:57:48--  https://raw.githubusercontent.com/RealDanielWei/MLModels/master/transformer.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7994 (7.8K) [text/plain]
Saving to: ‘transformer.py’


2024-09-15 21:57:48 (57.3 MB/s) - ‘transformer.py’ saved [7994/7994]

--2024-09-15 21:57:48--  https://raw.githubusercontent.com/RealDanielWei/MLModels/master/ft_transformer.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8525 (8.3K) [text/plain]
Saving to: ‘ft_transformer.py’


2024-09-15 21:57:48 (63.7 MB/s) - ‘ft_transfor

In [18]:
from ft_transformer import FTTransformer

# Smoke test: build a small FTTransformer on synthetic inputs and verify that a
# forward pass returns one row per sample with `dim_out` values.
categories = [10, 20, 15]  # 3 categorical features with different numbers of categories
num_continuous = 5         # 5 continuous features
dim = 32                   # Embedding dimension
depth = 4                  # Transformer depth
heads = 4                  # Number of attention heads
dim_out = 2                # Number of outputs
batch_size = 8             # Example batch size


model = FTTransformer(
    categories=categories,
    num_continuous=num_continuous,
    dim=dim,
    depth=depth,
    heads=heads,
    dim_out=dim_out,
)

# Sample each categorical column from its own range [0, categories[i]) instead of
# a single hard-coded bound, so every column's full index range is exercised and
# the test stays valid if `categories` is edited.
# NOTE(review): assumes FTTransformer expects per-column indices below the
# corresponding entry of `categories` — confirm against ft_transformer.py.
x_categ = torch.stack(
    [torch.randint(0, num_classes, (batch_size,)) for num_classes in categories],
    dim=1,
)  # shape: (batch_size, len(categories))
x_numer = torch.randn(batch_size, num_continuous)  # Continuous inputs

# Perform a forward pass through the model
output = model(x_categ, x_numer)

# Fail loudly, with both shapes in the message, rather than printing a bare
# "Error!" and letting later cells run against a broken model.
expected_shape = (batch_size, dim_out)
assert tuple(output.shape) == expected_shape, (
    f"Unexpected output shape {tuple(output.shape)}; expected {expected_shape}"
)
print("Output shape is correct.")


Output shape is correct.
