In [None]:
import numpy as np 
import pandas as pd 
import ipaddress # For preprocesing ip adresses
import re # regular expression library will be used to extract values within brackets
from sklearn.preprocessing import MinMaxScaler # used to normalize continuous data in physical dataset

In [7]:
df = pd.read_csv('/kaggle/input/net-data/attack_1.csv')

# I / Preprocessing 

### 1- Network Data

In [6]:
# Display features of out dataset
print(df.columns)

Index(['Time', ' mac_s', ' mac_d', ' ip_s', ' ip_d', ' sport', ' dport',
       ' proto', ' flags', ' size', ' modbus_fn', ' n_pkt_src', ' n_pkt_dst',
       ' modbus_response', ' label_n', ' label'],
      dtype='object')


In [4]:
# Display all nan values per feature
print(df.isna().sum())

Time                      0
 mac_s                    0
 mac_d                    0
 ip_s                   475
 ip_d                   475
 sport                  515
 dport                  515
 proto                    0
 flags                  515
 size                     0
 modbus_fn           153123
 n_pkt_src              475
 n_pkt_dst              475
 modbus_response    2840182
 label_n                  0
 label                    0
dtype: int64


In [6]:
# Iterate over each feature to print the number of unique values and the unique values themselves
for column in df.columns:
    unique_count = df[column].nunique(dropna=False)  # Set dropna=False to include NaN in the count if present
    unique_values = df[column].unique()
    
    print(f"Feature: {column}")
    print(f"Number of Unique Values: {unique_count}")
    print(f"Unique Values: {unique_values}\n")

Feature: Time
Number of Unique Values: 5242099
Unique Values: ['2021-04-09 18:23:28.385003' '2021-04-09 18:23:28.385005'
 '2021-04-09 18:23:28.385006' ... '2021-04-09 19:03:47.661288'
 '2021-04-09 19:03:47.661290' '2021-04-09 19:03:47.661291']

Feature:  mac_s
Number of Unique Values: 8
Unique Values: ['74:46:a0:bd:a7:1b' '0a:fe:ec:47:74:fb' 'fa:00:bc:90:d7:fa'
 'e6:3f:ac:c9:a8:8c' '00:80:f4:03:fb:12' 'fe:bb:16:7b:c3:27'
 '4a:35:83:e0:3d:a4' '00:0c:29:47:8c:22']

Feature:  mac_d
Number of Unique Values: 9
Unique Values: ['0a:fe:ec:47:74:fb' 'e6:3f:ac:c9:a8:8c' 'fa:00:bc:90:d7:fa'
 '74:46:a0:bd:a7:1b' '00:80:f4:03:fb:12' 'fe:bb:16:7b:c3:27'
 '4a:35:83:e0:3d:a4' 'ff:ff:ff:ff:ff:ff' '00:0c:29:47:8c:22']

Feature:  ip_s
Number of Unique Values: 8
Unique Values: ['84.3.251.20' '84.3.251.102' '84.3.251.103' '84.3.251.101' '84.3.251.18'
 '84.3.251.105' '84.3.251.104' nan]

Feature:  ip_d
Number of Unique Values: 8
Unique Values: ['84.3.251.102' '84.3.251.101' '84.3.251.103' '84.3.251.20' '84.

In [5]:
# Check for any -1 values across all features because later will be used to replace nan values and to reinforce the missing_pattern
print((df == -1).any().any())

# As you can see there is no single -1 value which will allow us to proceed as planned

False


In [7]:
# Clean column names by removing spaces that will later create errors in calls
df.columns = df.columns.str.replace(' ', '')

In [8]:
# set function using the ipadress library to transform ips to original decimal representation
def ip_to_int(ip):
    if pd.isna(ip):
        return -1  # in order to keep the nan values intact we replace it by -1 as we will do the same with all other nans
    return int(ipaddress.ip_address(ip))


# Apply the function to the IP address columns
df['ip_s'] = df['ip_s'].apply(ip_to_int)
df['ip_d'] = df['ip_d'].apply(ip_to_int)


In [9]:
# Factorize excluding -1 to keep our nan values remarkable as -1
def factorize_exclude_neg1(series):
    # Identify the values to be factorized (excluding -1)
    mask = series != -1
    
    # Factorize the masked values
    factorized_values, unique_values = pd.factorize(series[mask])
    
    # Create a full series with -1 preserved
    result = pd.Series(-1, index=series.index)
    result[mask] = factorized_values
    return result

# Apply the factorization while preserving -1
df['ip_s'] = factorize_exclude_neg1(df['ip_s'])
df['ip_d'] = factorize_exclude_neg1(df['ip_d'])

In [11]:
# as you can see nan values disappeared in ip_s and ip_d
print(df.isna().sum())

Time                     0
mac_s                    0
mac_d                    0
ip_s                     0
ip_d                     0
sport                  515
proto                    0
flags                  515
size                     0
modbus_fn           153123
n_pkt_src              475
n_pkt_dst              475
modbus_response    2840182
label_n                  0
label                    0
dtype: int64


In [35]:
# and here we ve got existence of -1  True since they replaced nan values in previous columns ip
print((df == -1).any().any())

True


In [10]:
# Drop the 'dport' column and keep 'sport' because of negative correlation and specialist advice
# The 'dport' and 'sport' columns are negatively correlated and often alternate values, introducing redundancy.
# retaining 'sport' because it is more valuable for detecting certain attacks
# where 'sport' might not receive a corresponding 'dport' response, making 'sport' crucial for identifying anomalies.
df = df.drop(columns=['dport'])

In [11]:
# Replace all nan values in the 'sport' column with -1
df['sport'] = df['sport'].fillna(-1)

In [15]:
# Replace all nan values in the 'flags' column with -1
df['flags'] = df['flags'].fillna(-1)

In [14]:
# Same goes for 'n_pkt_src' & 'n_pkt_dst' 
df['n_pkt_src'] = df['n_pkt_src'].fillna(-1)
df['n_pkt_dst'] = df['n_pkt_dst'].fillna(-1)

In [16]:
# Function to extract integers from brackets and also replace nan with -1 in modbus_response
def extract_int(value):
    if pd.isna(value):
        return -1  # Replace NaNs with -1
    match = re.search(r'\d+', str(value))
    if match:
        return int(match.group(0))
    return -1

# Apply the function to the 'modbus_response' column
df['modbus_response'] = df['modbus_response'].apply(extract_int)

In [17]:
# Function to factorize the mac_s and mac_d address columns and get unique values id's 
def factorize_mac_column(col):
    factorized_values, unique_values = pd.factorize(col)
    return factorized_values

# Apply the function to the MAC address columns
df['mac_s'] = factorize_mac_column(df['mac_s'])
df['mac_d'] = factorize_mac_column(df['mac_d'])

In [56]:
# as you can see now the only one remaining with nan values is modbus_fn which we will proceed next
print(df.isna().sum())

Time                    0
mac_s                   0
mac_d                   0
ip_s                    0
ip_d                    0
sport                   0
proto                   0
flags                   0
size                    0
modbus_fn          153123
n_pkt_src               0
n_pkt_dst               0
modbus_response         0
label_n                 0
label                   0
dtype: int64


In [18]:
# Replace NaNs with a placeholder value for factorization
modbus_fn_temp = df['modbus_fn'].fillna('NaN_Placeholder')

# Factorize the column
factorized_values, unique_values = pd.factorize(modbus_fn_temp)

# Replace the placeholder with -1
factorized_values = np.where(modbus_fn_temp == 'NaN_Placeholder', -1, factorized_values)

# Add the processed column back to the DataFrame
df['modbus_fn'] = factorized_values

In [19]:
# Factorize the proto column
proto_factorized, proto_unique = pd.factorize(df['proto'])

# Overwrite column like we did in all previous ones
df['proto'] = proto_factorized

In [20]:
# Factorize the size column
size_factorized, size_unique = pd.factorize(df['size'])

# Overwrite column like we did in all previous ones
df['size'] = size_factorized

In [21]:
# Factorize the flags column that contains binary representation of flags, we transform them to simple binary representation
def factorize_flags(series):
    # Identify the values to be factorized (excluding -1)
    mask = series != -1
    
    # Factorize the masked values
    factorized_values, unique_values = pd.factorize(series[mask])
    
    # Create a full series with -1 preserved
    result = pd.Series(-1, index=series.index)
    result[mask] = factorized_values
    return result
df['flags'] = factorize_flags(df['flags'])

In [22]:
# Drop Time feature since the packets are already organized by time corresponding their index 
df = df.drop(['Time'], axis=1)

In [22]:
df['sport'] = df['sport'].astype(int)

df['n_pkt_src'] = df['n_pkt_src'].astype(int)

df['n_pkt_dst'] = df['n_pkt_dst'].astype(int)

In [23]:
# Let's again check uniqueness of each feature
for column in df.columns:
    unique_count = df[column].nunique(dropna=False)  
    unique_values = df[column].unique()
    
    print(f"Feature: {column}")
    print(f"Number of Unique Values: {unique_count}")
    print(f"Unique Values: {unique_values}\n")

Feature: Time
Number of Unique Values: 5242099
Unique Values: ['2021-04-09 18:23:28.385003' '2021-04-09 18:23:28.385005'
 '2021-04-09 18:23:28.385006' ... '2021-04-09 19:03:47.661288'
 '2021-04-09 19:03:47.661290' '2021-04-09 19:03:47.661291']

Feature: mac_s
Number of Unique Values: 8
Unique Values: [0 1 2 3 4 5 6 7]

Feature: mac_d
Number of Unique Values: 9
Unique Values: [0 1 2 3 4 5 6 7 8]

Feature: ip_s
Number of Unique Values: 8
Unique Values: [ 0  1  2  3  4  5  6 -1]

Feature: ip_d
Number of Unique Values: 8
Unique Values: [ 0  1  2  3  4  5  6 -1]

Feature: sport
Number of Unique Values: 8446
Unique Values: [56667 56666 56668 ... 39589 58653 60007]

Feature: proto
Number of Unique Values: 4
Unique Values: [0 1 2 3]

Feature: flags
Number of Unique Values: 8
Unique Values: [ 0  1  2  3  4  5  6 -1]

Feature: size
Number of Unique Values: 7
Unique Values: [0 1 2 3 4 5 6]

Feature: modbus_fn
Number of Unique Values: 5
Unique Values: [ 0  1  2  3 -1]

Feature: n_pkt_src
Number of

In [24]:
# Features that contain missing values and that will be used to construct missing_pattern feature
print([col for col in df.columns if (df[col] == -1).any()])

['ip_s', 'ip_d', 'sport', 'flags', 'modbus_fn', 'n_pkt_src', 'n_pkt_dst', 'modbus_response']


In [25]:
# Function to create a binary string for the missingness pattern
def create_missing_pattern(row, columns):
    binary_string = ''.join(['1' if row[col] == -1 else '0' for col in columns])
    return int(binary_string, 2)

# list of columns with nan values
list_nan_cols = ['ip_s', 'ip_d', 'sport', 'flags', 'modbus_fn', 'n_pkt_src', 'n_pkt_dst', 'modbus_response']

# Apply the function to create the missing_pattern column
df['missing_pattern'] = df.apply(create_missing_pattern, columns=list_nan_cols, axis=1)

In [40]:
# Let's again check uniqueness of each feature
for column in df.columns:
    unique_count = df[column].nunique(dropna=False)  
    unique_values = df[column].unique()
    
    print(f"Feature: {column}")
    print(f"Number of Unique Values: {unique_count}")
    print(f"Unique Values: {unique_values}\n")
    

# Here we notice that the uniqueness of missing_pattern is only 5 which is pretty fascinating :

# Pattern: 0, Binary: 00000000, Missing Columns: (No missing values)

# Pattern: 1, Binary: 00000001, Missing Columns: (1 column missing, modbus_response)

# Pattern: 9, Binary: 00001001, Missing Columns: (2 columns missing, ip_s and modbus_response)

# Pattern: 57, Binary: 00111001, Missing Columns: (5 columns missing, ip_s, flags, modbus_fn, n_pkt_src, modbus_response)

# Pattern: 255, Binary: 11111111, Missing Columns: (All 8 columns missing)


Feature: ip_s
Number of Unique Values: 8
Unique Values: [ 0  1  2  3  4  5  6 -1]

Feature: ip_d
Number of Unique Values: 8
Unique Values: [ 0  1  2  3  4  5  6 -1]

Feature: sport
Number of Unique Values: 8446
Unique Values: [56667 56666 56668 ... 39589 58653 60007]

Feature: flags
Number of Unique Values: 8
Unique Values: [ 0  1  2  3  4  5  6 -1]

Feature: modbus_fn
Number of Unique Values: 5
Unique Values: [ 0  1  2  3 -1]

Feature: n_pkt_src
Number of Unique Values: 55
Unique Values: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 -1 52 53]

Feature: n_pkt_dst
Number of Unique Values: 54
Unique Values: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 -1]

Feature: modbus_response
Number of Unique Values: 2948
Unique Values: [  -1    0    7 ... 2176 2081 1863]



In [26]:
# We rearranged the order of features in order to give meaning to the new engineered feature 'missing_pattern'.
# This feature was created to capture patterns of NaNs in the dataset.
# By arranging the columns in a specific order, the 'missing_pattern' feature generates a binary value following that order,
# which is then converted to a decimal number, giving each pattern a unique identifier.

new_col_order= ['ip_s', 'ip_d', 'sport', 'flags', 'modbus_fn', 'n_pkt_src', 'n_pkt_dst', 'modbus_response', 'missing_pattern', 'mac_s', 'mac_d', 'proto', 'size', 'label_n', 'label']

df = df[new_col_order]

In [27]:
print(df.head())

   ip_s  ip_d  sport  flags  modbus_fn  n_pkt_src  n_pkt_dst  modbus_response  \
0     0     0  56667      0          0          0          0               -1   
1     0     1  56666      0          0          1          0               -1   
2     0     2  56668      0          0          2          0               -1   
3     1     3    502      0          1          0          0                0   
4     2     3    502      0          1          0          1                0   

   missing_pattern  mac_s  mac_d  proto  size  label_n   label  
0                1      0      0      0     0        0  normal  
1                1      0      1      0     0        0  normal  
2                1      0      2      0     0        0  normal  
3                0      1      3      0     1        0  normal  
4                0      2      3      0     1        0  normal  


In [28]:
# Save the DataFrame to a CSV file in the Kaggle working directory
df.to_csv('/kaggle/working/processed_data.csv', index=False)

### 2- Physical Data

In [32]:
# we specify utf-16 encoding and tab delimiter for correct formating of the table
df = pd.read_csv('/kaggle/input/phy-data/phy_att_1.csv', encoding='utf-16', delimiter='\t')

In [33]:
# Display features of out dataset
print(df.columns)

Index(['Time', 'Tank_1', 'Tank_2', 'Tank_3', 'Tank_4', 'Tank_5', 'Tank_6',
       'Tank_7', 'Tank_8', 'Pump_1', 'Pump_2', 'Pump_3', 'Pump_4', 'Pump_5',
       'Pump_6', 'Flow_sensor_1', 'Flow_sensor_2', 'Flow_sensor_3',
       'Flow_sensor_4', 'Valv_1', 'Valv_2', 'Valv_3', 'Valv_4', 'Valv_5',
       'Valv_6', 'Valv_7', 'Valv_8', 'Valv_9', 'Valv_10', 'Valv_11', 'Valv_12',
       'Valv_13', 'Valv_14', 'Valv_15', 'Valv_16', 'Valv_17', 'Valv_18',
       'Valv_19', 'Valv_20', 'Valv_21', 'Valv_22', 'Label_n', 'Label'],
      dtype='object')


In [13]:
# Display all nan values per feature
print(df.isna().sum())

# We notice no nan values from the plcs data

Time             0
Tank_1           0
Tank_2           0
Tank_3           0
Tank_4           0
Tank_5           0
Tank_6           0
Tank_7           0
Tank_8           0
Pump_1           0
Pump_2           0
Pump_3           0
Pump_4           0
Pump_5           0
Pump_6           0
Flow_sensor_1    0
Flow_sensor_2    0
Flow_sensor_3    0
Flow_sensor_4    0
Valv_1           0
Valv_2           0
Valv_3           0
Valv_4           0
Valv_5           0
Valv_6           0
Valv_7           0
Valv_8           0
Valv_9           0
Valv_10          0
Valv_11          0
Valv_12          0
Valv_13          0
Valv_14          0
Valv_15          0
Valv_16          0
Valv_17          0
Valv_18          0
Valv_19          0
Valv_20          0
Valv_21          0
Valv_22          0
Label_n          0
Label            0
dtype: int64


In [34]:
# Clean column names by removing spaces that will later create errors in calls
df.columns = df.columns.str.replace(' ', '')

In [35]:
# Convert boolean features to binary 
boolean_features = df.select_dtypes(include=['bool']).columns.tolist()
df[boolean_features] = df[boolean_features].astype(int)

In [45]:
print(boolean_features)
print(continuous_features)

['Pump_1', 'Pump_2', 'Pump_3', 'Pump_4', 'Pump_5', 'Pump_6', 'Valv_1', 'Valv_2', 'Valv_3', 'Valv_4', 'Valv_5', 'Valv_6', 'Valv_7', 'Valv_8', 'Valv_9', 'Valv_10', 'Valv_11', 'Valv_12', 'Valv_13', 'Valv_14', 'Valv_15', 'Valv_16', 'Valv_17', 'Valv_18', 'Valv_19', 'Valv_20', 'Valv_21', 'Valv_22']
['Tank_1', 'Tank_2', 'Tank_3', 'Tank_4', 'Tank_5', 'Tank_6', 'Tank_7', 'Tank_8', 'Flow_sensor_1', 'Flow_sensor_2', 'Flow_sensor_3', 'Flow_sensor_4']


In [36]:
# Now all our features are int except for label multiclass
print(df.dtypes)

Time             object
Tank_1            int64
Tank_2            int64
Tank_3            int64
Tank_4            int64
Tank_5            int64
Tank_6            int64
Tank_7            int64
Tank_8            int64
Pump_1            int64
Pump_2            int64
Pump_3            int64
Pump_4            int64
Pump_5            int64
Pump_6            int64
Flow_sensor_1     int64
Flow_sensor_2     int64
Flow_sensor_3     int64
Flow_sensor_4     int64
Valv_1            int64
Valv_2            int64
Valv_3            int64
Valv_4            int64
Valv_5            int64
Valv_6            int64
Valv_7            int64
Valv_8            int64
Valv_9            int64
Valv_10           int64
Valv_11           int64
Valv_12           int64
Valv_13           int64
Valv_14           int64
Valv_15           int64
Valv_16           int64
Valv_17           int64
Valv_18           int64
Valv_19           int64
Valv_20           int64
Valv_21           int64
Valv_22           int64
Label_n         

In [37]:
# Now we extract the continious features to perform normalize them

# List of all features
all_features = df.columns.tolist()

# Continuous features by excluding boolean features
continuous_features = [feature for feature in all_features if feature not in boolean_features]

# Get rid of Time and Label_n and Label
continuous_features = continuous_features[1:-2]

In [41]:
# Initialize the MinMaxScaler to normalize cont data
scaler = MinMaxScaler()

# Normalize the continuous features
df[continuous_features] = scaler.fit_transform(df[continuous_features])

In [43]:
# Let's again check uniqueness of each feature
for column in df.columns:
    unique_count = df[column].nunique(dropna=False)  
    unique_values = df[column].unique()
    
    print(f"Feature: {column}")
    print(f"Number of Unique Values: {unique_count}")
    print(f"Unique Values: {unique_values}\n")

Feature: Time
Number of Unique Values: 2420
Unique Values: ['09/04/2021 18:23:28' '09/04/2021 18:23:29' '09/04/2021 18:23:30' ...
 '09/04/2021 19:03:45' '09/04/2021 19:03:46' '09/04/2021 19:03:47']

Feature: Tank_1
Number of Unique Values: 1049
Unique Values: [0.         0.02727273 0.08181818 ... 0.09494949 0.11969697 0.16767677]

Feature: Tank_2
Number of Unique Values: 1120
Unique Values: [0.         0.00512295 0.04866803 ... 0.99282787 0.97848361 0.96465164]

Feature: Tank_3
Number of Unique Values: 1234
Unique Values: [0.         0.00525241 0.02742924 ... 0.05369128 0.07966151 0.08637292]

Feature: Tank_4
Number of Unique Values: 807
Unique Values: [0.         0.02204586 0.02821869 0.07407407 0.11111111 0.14550265
 0.17107584 0.19488536 0.22486772 0.24603175 0.27601411 0.29805996
 0.32539683 0.34567901 0.36419753 0.37477954 0.38447972 0.3994709
 0.41005291 0.42416226 0.43386243 0.44708995 0.45855379 0.47178131
 0.48148148 0.4973545  0.50705467 0.51675485 0.5308642  0.54232804
 0.55

In [46]:
# Save the DataFrame to a CSV file in the Kaggle working directory
df.to_csv('/kaggle/working/processed_phy_data.csv', index=False)

### 3- Embedding Network Categorical Data

In [50]:
import torch
import torch.nn as nn

In [47]:
df = pd.read_csv('/kaggle/input/test-0/processed_data.csv')

In [137]:
# Separate features and labels and drop multi classification column
features = df.drop(columns=['label_n','label'])
labels = df['label_n']

In [159]:
# Convert features to tensor
features_tensor = torch.tensor(features.values, dtype=torch.long)

In [139]:
# Checking if dim has been altered due to error in dimension during embedding
print(np.shape(features))
print(np.shape(features.values))
print(np.shape(features_tensor))

(5527409, 13)
(5527409, 13)
torch.Size([5527409, 13])


In [145]:
# Add 1 to columns that contain -1 to move away from negative because embedding processes positive only
list_nan_cols = ['ip_s', 'ip_d', 'sport', 'flags', 'modbus_fn', 'n_pkt_src', 'n_pkt_dst', 'modbus_response']
for column in list_nan_cols:
    if (features[column] == -1).any():
        features[column] += 1

In [147]:
# Let's again check uniqueness of each feature
for column in features.columns:
    unique_count = features[column].nunique(dropna=False)  
    unique_values = features[column].unique()
    
    print(f"Feature: {column}")
    print(f"Number of Unique Values: {unique_count}")
    print(f"Unique Values: {unique_values}\n")
    
# as we notice  -1s disappeared lettign place to 0

Feature: ip_s
Number of Unique Values: 8
Unique Values: [1 2 3 4 5 6 7 0]

Feature: ip_d
Number of Unique Values: 8
Unique Values: [1 2 3 4 5 6 7 0]

Feature: sport
Number of Unique Values: 8446
Unique Values: [56668 56667 56669 ... 39590 58654 60008]

Feature: flags
Number of Unique Values: 8
Unique Values: [1 2 3 4 5 6 7 0]

Feature: modbus_fn
Number of Unique Values: 5
Unique Values: [1 2 3 4 0]

Feature: n_pkt_src
Number of Unique Values: 55
Unique Values: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
 49 50 51 52  0 53 54]

Feature: n_pkt_dst
Number of Unique Values: 54
Unique Values: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
 49 50 51 52 53  0]

Feature: modbus_response
Number of Unique Values: 2948
Unique Values: [   0    1    8 ... 2177 2082 1864]

Feature: missing_pattern
Numb

In [149]:
# Determine the number of unique values for each feature
unique_counts = features.nunique()

# Define embedding sizes based on uniqueness or what we rather call rule of thumb
embedding_sizes = [(unique, min(50, unique // 2)) for unique in unique_counts]

In [150]:
# make sure that uniqueness list is correct and involves features only
print(unique_counts)

ip_s                  8
ip_d                  8
sport              8446
flags                 8
modbus_fn             5
n_pkt_src            55
n_pkt_dst            54
modbus_response    2948
missing_pattern       5
mac_s                 8
mac_d                 9
proto                 4
size                  7
dtype: int64


In [151]:
# Make sure the embedding mapping is correct
print(embedding_sizes)
print(len(embedding_sizes))

features['sport'].max()

[(8, 4), (8, 4), (8446, 50), (8, 4), (5, 2), (55, 27), (54, 27), (2948, 50), (5, 2), (8, 4), (9, 4), (4, 2), (7, 3)]
13


61000

In [155]:
# DONT RUN !! THIS BLOCK TRANSFORMS VALUES TO MATCH MAX TO UNIQUENESS
def factorize_sport(series):
    # Identify the values to be factorized (excluding -1)
    mask = series != -1
    
    # Factorize the masked values
    factorized_values, unique_values = pd.factorize(series[mask])
    
    # Create a full series with -1 preserved
    result = pd.Series(-1, index=series.index)
    result[mask] = factorized_values
    return result
features['sport'] = factorize_sport(features['sport'])

In [156]:
# DONT RUN !! THIS BLOCK TRANSFORMS VALUES TO MATCH MAX TO UNIQUENESS
def factorize_modbus_response(series):
    # Identify the values to be factorized (excluding -1)
    mask = series != -1
    
    # Factorize the masked values
    factorized_values, unique_values = pd.factorize(series[mask])
    
    # Create a full series with -1 preserved
    result = pd.Series(-1, index=series.index)
    result[mask] = factorized_values
    return result
features['modbus_response'] = factorize_modbus_response(features['modbus_response'])

In [157]:
# DONT RUN !! THIS BLOCK TRANSFORMS VALUES TO MATCH MAX TO UNIQUENESS
# Factorize the missing_pattern column
missing_pattern_factorized, missing_pattern_unique = pd.factorize(features['missing_pattern'])

# Overwrite column like we did in all previous ones
features['missing_pattern'] = missing_pattern_factorized

In [158]:
# DONT RUN !! THIS BLOCK TRANSFORMS VALUES TO MATCH MAX TO UNIQUENESS
print(features['sport'].max())
print(features['modbus_response'].max())
print(features['missing_pattern'].max())

# as we can see those 3 features gave us problems during embedding due to their max value surpassing

8445
2947
4


In [161]:
# Create an embedding layer for each feature and concatenate the embeddings to conserve the same order on the dataset
class EmbeddingNet(nn.Module):
    def __init__(self, embedding_sizes):
        super(EmbeddingNet, self).__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(input_dim, output_dim) for input_dim, output_dim in embedding_sizes])
    
    def forward(self, x):
        embedded_features = []
        for i, embedding in enumerate(self.embeddings):
            # Check the shape and values of x[:, i]
            print(f"Processing feature {i} with shape {x[:, i].shape} and values: {x[:, i]}")
            
            # Ensure that no value in x[:, i] exceeds the embedding's input dimension
            max_index = torch.max(x[:, i]).item()
            min_index = torch.min(x[:, i]).item()
            input_dim = embedding.num_embeddings
            print(f"Feature {i}: min index {min_index}, max index {max_index}, input_dim {input_dim}")
            if max_index >= input_dim or min_index < 0:
                raise ValueError(f"Feature {i} has an index {min_index}-{max_index} out of range for the embedding size {input_dim}")
            
            # Embed the feature
            embedded_feature = embedding(x[:, i])
            print(f"Feature {i} embedded shape: {embedded_feature.shape}")
            embedded_features.append(embedded_feature)
        
        # Concatenate all embedded features
        x = torch.cat(embedded_features, dim=1)
        return x

In [153]:
# Instantiate the model
model = EmbeddingNet(embedding_sizes)

In [162]:
# Forward pass through the model to get embeddings
with torch.no_grad():  # No need to compute gradients because we re not intersted in training
    transformed_features = model(features_tensor)

Processing feature 0 with shape torch.Size([5527409]) and values: tensor([1, 1, 1,  ..., 3, 4, 2])
Feature 0: min index 0, max index 7, input_dim 8
Feature 0 embedded shape: torch.Size([5527409, 4])
Processing feature 1 with shape torch.Size([5527409]) and values: tensor([1, 2, 3,  ..., 4, 4, 4])
Feature 1: min index 0, max index 7, input_dim 8
Feature 1 embedded shape: torch.Size([5527409, 4])
Processing feature 2 with shape torch.Size([5527409]) and values: tensor([0, 1, 2,  ..., 3, 3, 3])
Feature 2: min index 0, max index 8445, input_dim 8446
Feature 2 embedded shape: torch.Size([5527409, 50])
Processing feature 3 with shape torch.Size([5527409]) and values: tensor([1, 1, 1,  ..., 1, 1, 1])
Feature 3: min index 0, max index 7, input_dim 8
Feature 3 embedded shape: torch.Size([5527409, 4])
Processing feature 4 with shape torch.Size([5527409]) and values: tensor([1, 1, 1,  ..., 4, 2, 4])
Feature 4: min index 0, max index 4, input_dim 5
Feature 4 embedded shape: torch.Size([5527409, 2]

In [163]:
# Convert embeddings to DataFrame
transformed_df = pd.DataFrame(transformed_features.numpy())

# Add labels back to the DataFrame
transformed_df['label'] = labels.values

In [164]:
transformed_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,174,175,176,177,178,179,180,181,182,label
0,0.725355,-1.016759,1.039347,-0.047093,-1.031781,0.040913,-0.850731,0.493892,-0.517549,-0.425060,...,0.037053,-0.494992,1.473259,-0.054479,0.708037,0.815787,0.213088,-0.099283,-0.127439,0
1,0.725355,-1.016759,1.039347,-0.047093,-0.322684,-0.081810,1.107241,-1.030923,0.042941,0.252985,...,0.882487,-0.316276,0.321108,-0.759546,0.708037,0.815787,0.213088,-0.099283,-0.127439,0
2,0.725355,-1.016759,1.039347,-0.047093,0.488971,-0.382662,-0.411880,0.824436,-0.636861,-1.391213,...,-0.952098,-0.728571,-1.601683,-0.145448,0.708037,0.815787,0.213088,-0.099283,-0.127439,0
3,-0.427510,0.054935,0.412839,-0.421642,-0.041692,-0.868394,-1.143136,-0.780338,1.278776,1.601848,...,1.716619,1.076514,0.892357,0.464460,0.708037,0.815787,1.335077,0.655413,-1.171849,0
4,-0.853171,-0.009763,0.892433,0.490809,-0.041692,-0.868394,-1.143136,-0.780338,1.278776,1.601848,...,1.716619,1.076514,0.892357,0.464460,0.708037,0.815787,1.335077,0.655413,-1.171849,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5527404,0.725355,-1.016759,1.039347,-0.047093,-1.031781,0.040913,-0.850731,0.493892,-0.517549,-0.425060,...,0.037053,-0.494992,1.473259,-0.054479,0.708037,0.815787,0.213088,-0.099283,-0.127439,0
5527405,0.725355,-1.016759,1.039347,-0.047093,-1.533541,-0.062126,1.258752,0.637521,-0.132000,0.339439,...,-0.189279,-0.216524,0.350586,-0.417805,0.708037,0.815787,0.213088,-0.099283,-0.127439,0
5527406,-0.853171,-0.009763,0.892433,0.490809,-0.041692,-0.868394,-1.143136,-0.780338,1.278776,1.601848,...,1.716619,1.076514,0.892357,0.464460,0.708037,0.815787,-1.301964,0.540172,-0.531406,0
5527407,-0.788161,-0.873066,-1.887963,-0.011117,-0.041692,-0.868394,-1.143136,-0.780338,1.278776,1.601848,...,1.716619,1.076514,0.892357,0.464460,0.708037,0.815787,1.335077,0.655413,-1.171849,0


In [165]:
# Save the DataFrame to a CSV file in the Kaggle working directory
transformed_df.to_csv('/kaggle/working/embedded_net.csv', index=False)

### 4- Checking Time compatibility for merging

#### 1 st catch

In [None]:
# Convert all entries to datetime, parsing them flexibly
df['Time'] = pd.to_datetime(df['Time'], errors='coerce')

# Format all datetime objects to a uniform string format, including microseconds
df['Formatted_Time'] = df['Time'].dt.strftime('%Y-%m-%d %H:%M:%S.%f')

# If you want to truncate the microseconds where they are not originally present:
df['Formatted_Time'] = df['Formatted_Time'].apply(lambda x: x.rstrip('0').rstrip('.') if '.' in x else x)

print(df['Formatted_Time'])

In [18]:
df['Time'] = pd.to_datetime(df['Time'])  # Convert to datetime if not already
df['Time_seconds'] = df['Time'].dt.floor('S')  # Floor to nearest second

# Count unique seconds
unique_seconds_count = df['Time_seconds'].nunique()

print("There are", unique_seconds_count, "unique seconds in the Time column.")


There are 2420 unique seconds in the Time column.


  df['Time_seconds'] = df['Time'].dt.floor('S')  # Floor to nearest second


In [20]:
len(df_phy)

2420

In [23]:
print(df['Time'].iloc[0])
print(df['Time'].iloc[-1])

2021-04-09 18:23:28.385003
2021-04-09 19:03:47.661291


In [24]:
print(df_phy['Time'].iloc[0])
print(df_phy['Time'].iloc[-1])

09/04/2021 18:23:28
09/04/2021 19:03:47


#### 2 nd catch

In [60]:
df = pd.read_csv('/kaggle/input/remainder-net/attack_2.csv')

In [None]:
# Convert all entries to datetime, parsing them flexibly
df['Time'] = pd.to_datetime(df['Time'], errors='coerce')

# Format all datetime objects to a uniform string format, including microseconds
df['Formatted_Time'] = df['Time'].dt.strftime('%Y-%m-%d %H:%M:%S.%f')

# If you want to truncate the microseconds where they are not originally present:
df['Formatted_Time'] = df['Formatted_Time'].apply(lambda x: x.rstrip('0').rstrip('.') if '.' in x else x)

print(df['Formatted_Time'])

In [63]:
df['Time'] = pd.to_datetime(df['Time'])  # Convert to datetime if not already
df['Time_seconds'] = df['Time'].dt.floor('S')  # Floor to nearest second

# Count unique seconds
unique_seconds_count = df['Time_seconds'].nunique()

print("There are", unique_seconds_count, "unique seconds in the Time column.")


There are 2096 unique seconds in the Time column.


  df['Time_seconds'] = df['Time'].dt.floor('S')  # Floor to nearest second


In [71]:
df_phy = pd.read_csv('/kaggle/input/remainder-phy/phy_att_2.csv', encoding='utf-16', delimiter='\t')

In [65]:
len(df_phy)

# we notice here a  delay of 8, !!!!!! not explainable because there is only a delay of 7 rows which is explainable under because phy_df has 7 seconds before 12 and 2 second after 16 
# compared to df which gives us 7+2 = 9   then once second must be missing inside the net df find it and delete it from phy
# and thats what we will be doing al this time deleting and adjusting rows in phy to suit net before any replication

2104

In [66]:
print(df['Time'].iloc[0])
print(df['Time'].iloc[-1])

2021-04-19 15:37:19.989214
2021-04-19 16:12:14.167723


In [72]:
print(df_phy['Time'].iloc[0])
print(df_phy['Time'].iloc[-1])

19/04/2021 15:37:12
19/04/2021 16:12:16


In [73]:
df_phy = df_phy.iloc[6:-2]

In [None]:
len(df_phy) # problem solved DONE

In [74]:
print(df_phy['Time'].iloc[0])
print(df_phy['Time'].iloc[-1])

19/04/2021 15:37:19
19/04/2021 16:12:14


#### 3 rd catch

In [35]:
df = pd.read_csv('/kaggle/input/remainder-net/attack_3.csv')

In [None]:
# Convert all entries to datetime, parsing them flexibly
df['Time'] = pd.to_datetime(df['Time'], errors='coerce')

# Format all datetime objects to a uniform string format, including microseconds
df['Formatted_Time'] = df['Time'].dt.strftime('%Y-%m-%d %H:%M:%S.%f')

# If you want to truncate the microseconds where they are not originally present:
df['Formatted_Time'] = df['Formatted_Time'].apply(lambda x: x.rstrip('0').rstrip('.') if '.' in x else x)

print(df['Formatted_Time'])

In [38]:
df['Time'] = pd.to_datetime(df['Time'])  # Convert to datetime if not already
df['Time_seconds'] = df['Time'].dt.floor('S')  # Floor to nearest second

# Count unique seconds
unique_seconds_count = df['Time_seconds'].nunique()

print("There are", unique_seconds_count, "unique seconds in the Time column.")

There are 1252 unique seconds in the Time column.


  df['Time_seconds'] = df['Time'].dt.floor('S')  # Floor to nearest second


In [40]:
df_phy = pd.read_csv('/kaggle/input/remainder-phy/phy_att_3.csv', encoding='utf-16', delimiter='\t')

In [43]:
len(df_phy)

# we notice 2 delay, easy one to fix because there is only a delay of 2 rows which is explainable under because phy_df has one second before 12 and one second after 05 
# compared to df

1254

In [44]:
print(df['Time'].iloc[0])
print(df['Time'].iloc[-1])

2021-04-09 19:42:13.484804
2021-04-09 20:03:04.790765


In [45]:
print(df_phy['Time'].iloc[0])
print(df_phy['Time'].iloc[-1])

09/04/2021 19:42:12
09/04/2021 20:03:05


#### 4 th catch

In [46]:
df = pd.read_csv('/kaggle/input/remainder-net/attack_4.csv')

In [None]:
# Convert all entries to datetime, parsing them flexibly
df['Time'] = pd.to_datetime(df['Time'], errors='coerce')

# Format all datetime objects to a uniform string format, including microseconds
df['Formatted_Time'] = df['Time'].dt.strftime('%Y-%m-%d %H:%M:%S.%f')

# If you want to truncate the microseconds where they are not originally present:
df['Formatted_Time'] = df['Formatted_Time'].apply(lambda x: x.rstrip('0').rstrip('.') if '.' in x else x)

print(df['Formatted_Time'])

In [49]:
df['Time'] = pd.to_datetime(df['Time'])  # Convert to datetime if not already
df['Time_seconds'] = df['Time'].dt.floor('S')  # Floor to nearest second

# Count unique seconds
unique_seconds_count = df['Time_seconds'].nunique()

print("There are", unique_seconds_count, "unique seconds in the Time column.")

There are 1711 unique seconds in the Time column.


  df['Time_seconds'] = df['Time'].dt.floor('S')  # Floor to nearest second


In [52]:
df_phy = pd.read_csv('/kaggle/input/remainder-phy/phy_att_4.csv')

In [54]:
len(df_phy)

# we notice here a  delay of 6, explainable because there is only a delay of 6 rows which is explainable under because phy_df has 7 seconds before 18 and 1 second before 54 
# compared to df which gives us 7-1 = 6

1717

In [55]:
print(df['Time'].iloc[0])
print(df['Time'].iloc[-1])

2022-02-21 14:45:25.454111
2022-02-21 15:13:55.070978


In [56]:
print(df_phy['Time'].iloc[0])
print(df_phy['Time'].iloc[-1])

21/02/2022 14:45:18
21/02/2022 15:13:54


# II / Model

### 1- First Test Vanilla NN

In [2]:
df = transformed_df

NameError: name 'transformed_df' is not defined

In [167]:
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [168]:
# Function to split
def preprocess_data(df):
    # Separate features and labels
    X = df.drop(columns=['label']).values
    y = df['label'].values
    
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    return X_train, X_test, y_train, y_test


In [169]:
# Create tensors to feed model
def create_tensors(X_train, X_test, y_train, y_test):
    # Convert to PyTorch tensors
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.long)
    y_test_tensor = torch.tensor(y_test, dtype=torch.long)
    
    return X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor

In [170]:
# Basic vanilla Neural Network
class DeeperNN(nn.Module):
    def __init__(self, input_dim):
        super(DeeperNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)
        self.fc4 = nn.Linear(128, 64)
        self.bn4 = nn.BatchNorm1d(64)
        self.fc5 = nn.Linear(64, 2)  # for binary classification
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        x = torch.relu(self.bn1(self.fc1(x)))
        x = self.dropout(x)
        x = torch.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        x = torch.relu(self.bn3(self.fc3(x)))
        x = self.dropout(x)
        x = torch.relu(self.bn4(self.fc4(x)))
        x = self.fc5(x)
        return x

In [171]:
# Classical training function using Cross entropy loss and adam
def train_model(model, X_train_tensor, y_train_tensor, num_epochs=20, learning_rate=0.001):
    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    # Training loop
    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

In [172]:
# Eval function
def evaluate_model(model, X_test_tensor, y_test_tensor):
    # Evaluate the model
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor)
        _, predicted = torch.max(outputs, 1)
        accuracy = (predicted == y_test_tensor).sum().item() / y_test_tensor.size(0)
    return accuracy

In [1]:
# Preprocess data
X_train, X_test, y_train, y_test = preprocess_data(df)

# Create tensors
X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor = create_tensors(X_train, X_test, y_train, y_test)

# Define and train the model
input_dim = X_train_tensor.shape[1]
model = DeeperNN(input_dim)
train_model(model, X_train_tensor, y_train_tensor)

# Evaluate the model
accuracy = evaluate_model(model, X_test_tensor, y_test_tensor)
print(f'Accuracy on test set: {accuracy * 100:.2f}%')

NameError: name 'preprocess_data' is not defined