In [1]:
import pandas as pd
import numpy as np


In [2]:
# Merging Train and Test dataset for data pre-processing
train = pd.read_csv("UNSW_NB15_training-set.csv")
test=pd.read_csv("UNSW_NB15_testing-set.csv")
data = pd.concat([train, test], axis=0)

##### Sample Merged Dataset

In [3]:
data.head

<bound method NDFrame.head of             id       dur proto service state  spkts  dpkts  sbytes  dbytes  \
0            1  0.000011   udp       -   INT      2      0     496       0   
1            2  0.000008   udp       -   INT      2      0    1762       0   
2            3  0.000005   udp       -   INT      2      0    1068       0   
3            4  0.000006   udp       -   INT      2      0     900       0   
4            5  0.000010   udp       -   INT      2      0    2126       0   
...        ...       ...   ...     ...   ...    ...    ...     ...     ...   
175336  175337  0.000009   udp     dns   INT      2      0     114       0   
175337  175338  0.505762   tcp       -   FIN     10      8     620     354   
175338  175339  0.000009   udp     dns   INT      2      0     114       0   
175339  175340  0.000009   udp     dns   INT      2      0     114       0   
175340  175341  0.000009   udp     dns   INT      2      0     114       0   

                 rate  ...  ct_ds

##### Number of Rows and Columns of Merged Dataset

In [4]:

data.shape

(257673, 45)

##### Numerical Features of UNSW-NB15 

In [5]:
numerical_features = data.select_dtypes(include='number').columns.tolist()
numerical_features

['id',
 'dur',
 'spkts',
 'dpkts',
 'sbytes',
 'dbytes',
 'rate',
 'sttl',
 'dttl',
 'sload',
 'dload',
 'sloss',
 'dloss',
 'sinpkt',
 'dinpkt',
 'sjit',
 'djit',
 'swin',
 'stcpb',
 'dtcpb',
 'dwin',
 'tcprtt',
 'synack',
 'ackdat',
 'smean',
 'dmean',
 'trans_depth',
 'response_body_len',
 'ct_srv_src',
 'ct_state_ttl',
 'ct_dst_ltm',
 'ct_src_dport_ltm',
 'ct_dst_sport_ltm',
 'ct_dst_src_ltm',
 'is_ftp_login',
 'ct_ftp_cmd',
 'ct_flw_http_mthd',
 'ct_src_ltm',
 'ct_srv_dst',
 'is_sm_ips_ports',
 'label']

In [6]:
print("Total number of numerical features", len(numerical_features))

Total number of numerical features 41


##### Categorical Features of UNSW-NB15

In [7]:
categorical_features = data.select_dtypes(exclude='number').columns.tolist()
categorical_features

['proto', 'service', 'state', 'attack_cat']

In [8]:
print("Total number of categorical features",len(categorical_features))

Total number of categorical features 4


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 257673 entries, 0 to 175340
Data columns (total 45 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 257673 non-null  int64  
 1   dur                257673 non-null  float64
 2   proto              257673 non-null  object 
 3   service            257673 non-null  object 
 4   state              257673 non-null  object 
 5   spkts              257673 non-null  int64  
 6   dpkts              257673 non-null  int64  
 7   sbytes             257673 non-null  int64  
 8   dbytes             257673 non-null  int64  
 9   rate               257673 non-null  float64
 10  sttl               257673 non-null  int64  
 11  dttl               257673 non-null  int64  
 12  sload              257673 non-null  float64
 13  dload              257673 non-null  float64
 14  sloss              257673 non-null  int64  
 15  dloss              257673 non-null  int64  
 16  si

##### It is clear that we dont need the id, so we may drop it

In [10]:
data = data.drop(["id"], axis=1)

##### Check for any missing values

In [11]:
data.isnull().values.any()

False

##### We can't deal with categorical columns having value '-'.
##### So Replace it with 'None'

In [12]:
categorical_cols = train.select_dtypes(include=["object"]).columns
data[categorical_cols] = data[categorical_cols].replace('-', "None")

##### Different services from which the network traffic has been measured

In [13]:
data['service'].value_counts()

None        141321
dns          68661
http         27011
smtp          6909
ftp-data      5391
ftp           4980
pop3          1528
ssh           1506
dhcp           120
snmp           109
ssl             86
irc             30
radius          21
Name: service, dtype: int64

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 257673 entries, 0 to 175340
Data columns (total 44 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   dur                257673 non-null  float64
 1   proto              257673 non-null  object 
 2   service            257673 non-null  object 
 3   state              257673 non-null  object 
 4   spkts              257673 non-null  int64  
 5   dpkts              257673 non-null  int64  
 6   sbytes             257673 non-null  int64  
 7   dbytes             257673 non-null  int64  
 8   rate               257673 non-null  float64
 9   sttl               257673 non-null  int64  
 10  dttl               257673 non-null  int64  
 11  sload              257673 non-null  float64
 12  dload              257673 non-null  float64
 13  sloss              257673 non-null  int64  
 14  dloss              257673 non-null  int64  
 15  sinpkt             257673 non-null  float64
 16  di

In [15]:
data.shape

(257673, 44)

In [16]:
label_counts = data['attack_cat'].value_counts()
print("Distinct label names and their counts:")
print(label_counts)

Distinct label names and their counts:
Normal            93000
Generic           58871
Exploits          44525
Fuzzers           24246
DoS               16353
Reconnaissance    13987
Analysis           2677
Backdoor           2329
Shellcode          1511
Worms               174
Name: attack_cat, dtype: int64


##### Encode the Categorical features

In [17]:

from sklearn.preprocessing import LabelEncoder

# Initialize the label encoder
label_encoder = LabelEncoder()

# Apply label encoding to categorical columns
for col in ['proto', 'service', 'state', 'attack_cat']:
    data[col] = label_encoder.fit_transform(data[col])

# Display the first few rows of the encoded DataFrame
print(data.head())

        dur  proto  service  state  spkts  dpkts  sbytes  dbytes         rate  \
0  0.000011    119        0      5      2      0     496       0   90909.0902   
1  0.000008    119        0      5      2      0    1762       0  125000.0003   
2  0.000005    119        0      5      2      0    1068       0  200000.0051   
3  0.000006    119        0      5      2      0     900       0  166666.6608   
4  0.000010    119        0      5      2      0    2126       0  100000.0025   

   sttl  ...  ct_dst_sport_ltm  ct_dst_src_ltm  is_ftp_login  ct_ftp_cmd  \
0   254  ...                 1               2             0           0   
1   254  ...                 1               2             0           0   
2   254  ...                 1               3             0           0   
3   254  ...                 1               3             0           0   
4   254  ...                 1               3             0           0   

   ct_flw_http_mthd  ct_src_ltm  ct_srv_dst  is_sm_ips_p

##### Display the data frame

In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 257673 entries, 0 to 175340
Data columns (total 44 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   dur                257673 non-null  float64
 1   proto              257673 non-null  int32  
 2   service            257673 non-null  int32  
 3   state              257673 non-null  int32  
 4   spkts              257673 non-null  int64  
 5   dpkts              257673 non-null  int64  
 6   sbytes             257673 non-null  int64  
 7   dbytes             257673 non-null  int64  
 8   rate               257673 non-null  float64
 9   sttl               257673 non-null  int64  
 10  dttl               257673 non-null  int64  
 11  sload              257673 non-null  float64
 12  dload              257673 non-null  float64
 13  sloss              257673 non-null  int64  
 14  dloss              257673 non-null  int64  
 15  sinpkt             257673 non-null  float64
 16  di

In [19]:
data.shape

(257673, 44)

##### Check if there is any NaN in the dataframe

In [20]:
data= data.dropna()

In [21]:
data.shape

(257673, 44)

In [22]:
data.head()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1.1e-05,119,0,5,2,0,496,0,90909.0902,254,...,1,2,0,0,0,1,2,0,6,0
1,8e-06,119,0,5,2,0,1762,0,125000.0003,254,...,1,2,0,0,0,1,2,0,6,0
2,5e-06,119,0,5,2,0,1068,0,200000.0051,254,...,1,3,0,0,0,1,3,0,6,0
3,6e-06,119,0,5,2,0,900,0,166666.6608,254,...,1,3,0,0,0,2,3,0,6,0
4,1e-05,119,0,5,2,0,2126,0,100000.0025,254,...,1,3,0,0,0,2,3,0,6,0


In [23]:
label_counts = data['attack_cat'].value_counts()
print("Distinct label names and their counts:")
print(label_counts)

Distinct label names and their counts:
6    93000
5    58871
3    44525
4    24246
2    16353
7    13987
0     2677
1     2329
8     1511
9      174
Name: attack_cat, dtype: int64


##### Normalize the data

In [24]:
from sklearn.preprocessing import MinMaxScaler
# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Normalize the dataframe
normalized_df = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)

print("Normalized DataFrame using Min-Max Scaling:")
print(normalized_df)

Normalized DataFrame using Min-Max Scaling:
                 dur     proto   service  state     spkts     dpkts    sbytes  \
0       1.833334e-07  0.901515  0.000000    0.5  0.000094  0.000000  0.000033   
1       1.333334e-07  0.901515  0.000000    0.5  0.000094  0.000000  0.000121   
2       8.333335e-08  0.901515  0.000000    0.5  0.000094  0.000000  0.000073   
3       1.000000e-07  0.901515  0.000000    0.5  0.000094  0.000000  0.000061   
4       1.666667e-07  0.901515  0.000000    0.5  0.000094  0.000000  0.000146   
...              ...       ...       ...    ...       ...       ...       ...   
257668  1.500000e-07  0.901515  0.166667    0.5  0.000094  0.000000  0.000006   
257669  8.429368e-03  0.856061  0.000000    0.4  0.000845  0.000726  0.000042   
257670  1.500000e-07  0.901515  0.166667    0.5  0.000094  0.000000  0.000006   
257671  1.500000e-07  0.901515  0.166667    0.5  0.000094  0.000000  0.000006   
257672  1.500000e-07  0.901515  0.166667    0.5  0.000094  0.0000

In [25]:
label_counts = normalized_df['attack_cat'].value_counts()
print("Distinct label names and their counts:")
print(label_counts)

Distinct label names and their counts:
0.666667    93000
0.555556    58871
0.333333    44525
0.444444    24246
0.222222    16353
0.777778    13987
0.000000     2677
0.111111     2329
0.888889     1511
1.000000      174
Name: attack_cat, dtype: int64


In [48]:
# Selecting rows with specified labels
selected_labels = [1.000000]

# Check the precision of the labels in the DataFrame

selected_rows = normalized_df[normalized_df['attack_cat'].isin(selected_labels)]

In [49]:
print(selected_rows.shape)

(174, 44)


In [50]:
X = selected_rows.iloc[:, :-1].values  # Features (excluding the label column)
y = selected_rows.iloc[:, -1].values   # Labels

In [51]:
import warnings
warnings.filterwarnings("ignore")

In [52]:
import tensorflow as tf
from tensorflow.keras import layers, Model
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split

In [63]:
import tensorflow as tf
from tensorflow.keras import layers, Model
import numpy as np

# Define WGAN-GP components
def make_generator_model(input_dim, output_dim):
    model = tf.keras.Sequential()
    model.add(layers.Dense(128, use_bias=False, input_shape=(input_dim,)))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    model.add(layers.Dense(256, use_bias=False))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    model.add(layers.Dense(output_dim, activation='tanh'))
    return model

def make_discriminator_model(input_dim):
    model = tf.keras.Sequential()
    model.add(layers.Dense(256, input_shape=(input_dim,)))
    model.add(layers.LeakyReLU())
    model.add(layers.Dense(128))
    model.add(layers.LeakyReLU())
    model.add(layers.Dense(1))
    return model

def discriminator_loss(real_output, fake_output):
    real_loss = tf.reduce_mean(real_output)
    fake_loss = tf.reduce_mean(fake_output)
    return fake_loss - real_loss

def generator_loss(fake_output):
    return -tf.reduce_mean(fake_output)

def gradient_penalty(discriminator, real_data, fake_data):
    batch_size = real_data.shape[0]
    alpha = tf.random.uniform([batch_size, 1], 0.0, 1.0)
    real_data = tf.cast(real_data, tf.float32)
    fake_data = tf.cast(fake_data, tf.float32)
    diff = fake_data - real_data
    interpolated = real_data + alpha * diff
    with tf.GradientTape() as gp_tape:
        gp_tape.watch(interpolated)
        pred = discriminator(interpolated)
    grads = gp_tape.gradient(pred, [interpolated])[0]
    norm = tf.sqrt(tf.reduce_sum(tf.square(grads), axis=1))
    gp = tf.reduce_mean((norm - 1.0) ** 2)
    return gp

@tf.function
def train_step(real_data, generator, discriminator, generator_optimizer, discriminator_optimizer, batch_size, z_dim, gp_weight):
    noise = tf.random.normal([batch_size, z_dim])
    real_data = tf.cast(real_data, tf.float32)

    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        fake_data = generator(noise, training=True)
        real_output = discriminator(real_data, training=True)
        fake_output = discriminator(fake_data, training=True)

        gen_loss = generator_loss(fake_output)
        disc_loss = discriminator_loss(real_output, fake_output)
        gp = gradient_penalty(discriminator, real_data, fake_data)
        total_disc_loss = disc_loss + gp * gp_weight

    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(total_disc_loss, discriminator.trainable_variables)

    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))

# WGAN-GP training function
def train_wgan_gp(X, epochs, batch_size, z_dim, gp_weight, generator, discriminator, generator_optimizer, discriminator_optimizer):
    dataset = tf.data.Dataset.from_tensor_slices(tf.cast(X, tf.float32)).shuffle(buffer_size=1024).batch(batch_size)

    for epoch in range(epochs):
        for real_data in dataset:
            if real_data.shape[0] != batch_size:
                continue  # Skip the batch if it does not match the required batch size
            train_step(real_data, generator, discriminator, generator_optimizer, discriminator_optimizer, batch_size, z_dim, gp_weight)

    noise = tf.random.normal([X.shape[0], z_dim])
    synthetic_data = generator(noise, training=False)
    return synthetic_data

# Set WGAN-GP parameters
z_dim = 10
gp_weight = 10.0
batch_size = 1
epochs = 20  # Adjust as needed

# Create WGAN-GP models and optimizers
generator = make_generator_model(z_dim, X.shape[1])
discriminator = make_discriminator_model(X.shape[1])
generator_optimizer = tf.keras.optimizers.Adam(1e-4)
discriminator_optimizer = tf.keras.optimizers.Adam(1e-4)

# Train WGAN-GP and generate synthetic data
synthetic_X = train_wgan_gp(X, epochs, batch_size, z_dim, gp_weight, generator, discriminator, generator_optimizer, discriminator_optimizer)
synthetic_y = np.random.choice(y, size=synthetic_X.shape[0])  # Assuming same distribution for labels

# Combine real and synthetic data
X_combined = np.vstack((X, synthetic_X))
y_combined = np.hstack((y, synthetic_y))


In [64]:
X_combined.shape

(2784, 43)

In [65]:
# Combine into a single DataFrame
df_combined = pd.DataFrame(X_combined)
df_combined['Label'] = y_combined

In [77]:
df_combined.columns=normalized_df.columns

In [81]:
df_filtered = normalized_df[normalized_df['attack_cat'] != 1.000000]

In [82]:
df_filtered.shape

(257499, 44)

In [83]:
df_new=pd.concat([df_filtered,df_combined],axis=0)

In [84]:
df_new.shape

(260283, 44)

In [85]:
X = df_new.iloc[:, :-1].values  # Features (excluding the label column)
y = df_new.iloc[:, -1].values   # Labels

In [88]:
# Indian Millipede Optimization Algorithm(IMOA) based feature selection
# Define the fitness function for feature selection based on mutual information
def fitness_function(mask):
    selected_features = np.where(mask == 1)[0]
    if len(selected_features) == 0:
        return 0  # No features selected, so fitness is zero
    
    X_selected = X_combined[:, selected_features]
    mi = mutual_info_classif(X_selected, y_combined, discrete_features='auto')
    return np.sum(mi)  # Maximize the sum of mutual information of selected features

# Define the constraint violation function (if any)
def constraint_violation(mask):
    return 0  # No constraints in this case

# Initialize parameters
N = 20  # Population size
d = X.shape[1]  # Dimensionality of the problem (number of features)
T = 100  # Maximum iterations
T_th = 0.5  # Temperature threshold
α = 0.1  # Seasonal activity factor
β = 0.5  # Reversal factor
γ = 0.1  # Learning rate
δ = 0.01  # Step size
ε = 0.1  # Social factor
λ = 10  # Penalty coefficient
η = 0.5  # Crossover coefficient
convergence_threshold = 1e-6  # Convergence threshold for fitness improvement
no_improvement_limit = 10  # Number of iterations to wait for improvement before stopping

# Initialize population
P = np.random.randint(0, 2, (N, d))  # Binary mask for feature selection

# Evaluate initial fitness
fitness = np.array([fitness_function(mask) for mask in P])
penalized_fitness = fitness - λ * np.array([constraint_violation(mask) for mask in P])

# Main IMOA loop
t = 0
no_improvement_count = 0
best_fitness = np.max(penalized_fitness)
best_solution = P[np.argmax(penalized_fitness)]

while t < T and no_improvement_count < no_improvement_limit:
    for i in range(N):
        # Seasonal Abundance
        α_t = α * np.sin(2 * np.pi * t / T)
        P[i] = np.clip(P[i] + α_t * np.random.uniform(-1, 1, d), 0, 1).astype(int)
        
        # Obstacle Avoidance
        if penalized_fitness[i] < np.mean(penalized_fitness):  # Assuming poor fitness as below mean fitness
            P[i] = np.clip(P[i] - β * np.random.uniform(-1, 1, d), 0, 1).astype(int)
        
        # Temperature Response
        if np.random.uniform(0, 1) > T_th:
            best_idx = np.argmax(penalized_fitness)
            P[i] = np.clip(P[i] + γ * (P[best_idx] - P[i]), 0, 1).astype(int)
        
        # Resource Utilization
        gradient = (fitness_function(P[i] + δ) - fitness_function(P[i])) / δ
        P[i] = np.clip(P[i] + δ * gradient, 0, 1).astype(int)
        
        # Group Movement
        P[i] = np.clip(P[i] + ε * (np.mean(P, axis=0) - P[i]), 0, 1).astype(int)
        
        # Defensive Behavior
        if penalized_fitness[i] < np.mean(penalized_fitness):  # Assuming poor fitness as below mean fitness
            penalized_fitness[i] -= λ * constraint_violation(P[i])
        
        # Mating Behavior
        mate_idx = np.random.randint(N)
        P[i] = np.clip(η * P[i] + (1 - η) * P[mate_idx], 0, 1).astype(int)
        
        # Predator Avoidance
        if np.std(P) < 1e-6:  # Assuming low diversity as low standard deviation
            P[i] = np.random.randint(0, 2, d)
    
    # Evaluate fitness of new positions
    fitness = np.array([fitness_function(mask) for mask in P])
    penalized_fitness = fitness - λ * np.array([constraint_violation(mask) for mask in P])
    
    # Check for convergence
    current_best_fitness = np.max(penalized_fitness)
    if current_best_fitness > best_fitness + convergence_threshold:
        best_fitness = current_best_fitness
        best_solution = P[np.argmax(penalized_fitness)]
        no_improvement_count = 0
    else:
        no_improvement_count += 1
    
    # Increment iteration counter
    t += 1
selected_features = np.where(best_solution == 1 )[0]

In [89]:
# Print the selected features

print("Selected Features:", selected_features)

Selected Features: [ 0  1  2  3  6  7 12 13 15 17 19 20 21 22 24 25 27 28 31 35 37 39]


In [90]:
num_entries = len( selected_features)
num_entries

22

In [91]:
for feature_index in selected_features:
    print(data.columns[feature_index])


dur
proto
service
state
sbytes
dbytes
dload
sloss
sinpkt
sjit
swin
stcpb
dtcpb
dwin
synack
ackdat
dmean
trans_depth
ct_state_ttl
ct_dst_src_ltm
ct_ftp_cmd
ct_src_ltm


In [93]:
# Specify the filename
filename = 'UNSW_merged_filtered_data.csv'
filename1 = 'UNSW_augmented_filtered_data.csv'
filename2='UNSW_filtered_data.csv'
# Save the DataFrame to a CSV file
df_new.to_csv(filename, index=False)
normalized_df.to_csv(filename1, index=False)
df_combined.to_csv(filename2, index=False)
print(f"DataFrame saved to {filename}")

DataFrame saved to UNSW_merged_filtered_data.csv
