In [2]:
import pandas as pd
import hashlib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder


def hash_ip(ip_address):
    """
    Hashes an IP address using SHA-256 and returns a hashed integer.

    Args:
        ip_address: The IP address string.

    Returns:
        Hashed integer value.
    """
    return int(hashlib.sha256(ip_address.encode('utf-8')).hexdigest(), 16) % 1000

def hash_protocol(protocol):
    """
    Hashes a protocol value using SHA-256 and returns an integer hash.
    
    Args:
        protocol (int or str): Protocol value to hash.
        
    Returns:
        int: Hashed integer value.
    """
    # Convert the protocol to a string if it's not already
    protocol_str = str(protocol)
    return int(hashlib.sha256(protocol_str.encode('utf-8')).hexdigest(), 16) % 100

In [3]:

# Load the dataset
df = pd.read_csv("../data/dataset/NF-UQ-NIDS.csv") 


In [4]:

# Hash IP addresses
df['src_ip_hash'] = df['IPV4_SRC_ADDR'].apply(hash_ip)
df['dst_ip_hash'] = df['IPV4_DST_ADDR'].apply(hash_ip)

# Hash the protocol
df['protocol_hash'] = df['PROTOCOL'].apply(hash_protocol) 

# One-hot encode the 'PROTOCOL' column (optional, for comparison)
# df = pd.get_dummies(df, columns=['PROTOCOL'])

df = df.drop(columns=['IPV4_SRC_ADDR', 'IPV4_DST_ADDR','PROTOCOL','Dataset','Label'], axis=1) 


In [5]:


# Select numerical features for scaling
numerical_cols = ['L4_SRC_PORT', 'L4_DST_PORT', 'IN_BYTES', 'OUT_BYTES', 'IN_PKTS', 'OUT_PKTS', 'FLOW_DURATION_MILLISECONDS', 
                  'src_ip_hash', 'dst_ip_hash', 'protocol_hash'] 

# Standardize numerical features
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])


# Encode attack labels
label_encoder = LabelEncoder()

df['Attack_Encoded'] = label_encoder.fit_transform(df['Attack']) 




In [7]:

# Split data into features (X) and labels (y)
X = df[['src_ip_hash', 'dst_ip_hash', 'protocol_hash','L4_SRC_PORT', 'L4_DST_PORT', 'IN_BYTES', 'OUT_BYTES', 'IN_PKTS', 'OUT_PKTS', 'FLOW_DURATION_MILLISECONDS']]  # Features
y = df['Attack_Encoded']  # Use the encoded 'Attack' column as the label 


# Split data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42) 

# Print the shapes of the resulting sets
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (2398978, 10)
y_train shape: (2398978,)
X_test shape: (9595915, 10)
y_test shape: (9595915,)


In [8]:
X_train.head()

Unnamed: 0,src_ip_hash,dst_ip_hash,protocol_hash,L4_SRC_PORT,L4_DST_PORT,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,FLOW_DURATION_MILLISECONDS
9182808,1.726911,-1.158015,1.775631,0.711677,-0.502898,-0.023539,-0.032052,-0.018205,-0.022178,0.99488
6085778,0.782005,-0.441437,-0.551084,0.129505,-0.314644,-0.012686,-0.025561,-0.011896,-0.004846,0.992958
6944984,-1.143066,-1.158015,1.775631,0.655717,-0.502898,-0.023581,-0.032075,-0.018205,-0.022178,0.99488
61983,0.457634,-1.211342,-0.551084,0.729366,1.171065,-0.021415,-0.002268,-0.014525,-0.000995,-1.009661
9165398,-0.166428,-1.158015,1.775631,0.713953,-0.502898,-0.023527,-0.032045,-0.018205,-0.022178,0.99488


In [12]:
# Save the preprocessed data to a new CSV file (optional)
df.to_csv("preprocessed_data_with_hashed_protocol_.csv", index=False) 

In [14]:
df.head()

Unnamed: 0,L4_SRC_PORT,L4_DST_PORT,L7_PROTO,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Attack,src_ip_hash,dst_ip_hash,protocol_hash,Attack_Encoded
0,0.98773,2.658863,0.0,0.03422,-0.03093,-0.012947,-0.008698,25,-1.009758,Benign,0.306026,1.928271,-0.551084,2
1,-0.424972,-0.419775,0.0,-0.013282,-0.031994,-0.015576,-0.020252,25,-1.009765,Benign,0.150892,1.928271,-0.551084,2
2,-1.955001,0.733952,1.0,-0.012884,-0.028135,-0.005061,0.018263,25,-1.009246,Benign,0.306026,0.688424,-0.551084,2
3,-0.827314,2.140321,0.0,-0.020789,-0.002268,-0.013473,-0.000995,27,-1.009707,Benign,0.309551,0.685091,-0.551084,2
4,1.034632,-0.504703,1.0,-0.013221,-0.024371,-0.001906,0.041372,25,-1.009084,Benign,1.621138,0.538442,-0.551084,2


In [9]:

# Encode attack labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Get the mapping of attack labels to encoded values
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mapping:", label_mapping)

Label Mapping: {np.int64(0): np.int64(0), np.int64(1): np.int64(1), np.int64(2): np.int64(2), np.int64(3): np.int64(3), np.int64(4): np.int64(4), np.int64(5): np.int64(5), np.int64(6): np.int64(6), np.int64(7): np.int64(7), np.int64(8): np.int64(8), np.int64(9): np.int64(9), np.int64(10): np.int64(10), np.int64(11): np.int64(11), np.int64(12): np.int64(12), np.int64(13): np.int64(13), np.int64(14): np.int64(14), np.int64(15): np.int64(15), np.int64(16): np.int64(16), np.int64(17): np.int64(17), np.int64(18): np.int64(18), np.int64(19): np.int64(19), np.int64(20): np.int64(20)}


random forest ids 

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# Create a Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42) 

# Train the model
rf_model.fit(X_train, y_train)


KeyboardInterrupt: 

In [None]:

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

# Print classification report
print(classification_report(y_test, y_pred))

# Print confusion matrix
print(confusion_matrix(y_test, y_pred))

starting from here gan will be trained 

In [1]:
import pandas as pd
import hashlib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder


def hash_ip(ip_address):
    """
    Hashes an IP address using SHA-256 and returns a hashed integer.

    Args:
        ip_address: The IP address string.

    Returns:
        Hashed integer value.
    """
    return int(hashlib.sha256(ip_address.encode('utf-8')).hexdigest(), 16) % 1000

def hash_protocol(protocol):
    """
    Hashes a protocol value using SHA-256 and returns an integer hash.
    
    Args:
        protocol (int or str): Protocol value to hash.
        
    Returns:
        int: Hashed integer value.
    """
    # Convert the protocol to a string if it's not already
    protocol_str = str(protocol)
    return int(hashlib.sha256(protocol_str.encode('utf-8')).hexdigest(), 16) % 100

In [3]:

# Load the dataset
df = pd.read_csv("data.csv") 


In [4]:

# Hash IP addresses
df['src_ip_hash'] = df['IPV4_SRC_ADDR'].apply(hash_ip)
df['dst_ip_hash'] = df['IPV4_DST_ADDR'].apply(hash_ip)

# Hash the protocol
df['protocol_hash'] = df['PROTOCOL'].apply(hash_protocol) 

# One-hot encode the 'PROTOCOL' column (optional, for comparison)
# df = pd.get_dummies(df, columns=['PROTOCOL'])

df = df.drop(columns=['IPV4_SRC_ADDR', 'IPV4_DST_ADDR','PROTOCOL','Label'], axis=1) 


In [5]:


# Select numerical features for scaling
numerical_cols = ['L4_SRC_PORT', 'L4_DST_PORT', 'IN_BYTES', 'OUT_BYTES', 'IN_PKTS', 'OUT_PKTS', 'FLOW_DURATION_MILLISECONDS', 
                  'src_ip_hash', 'dst_ip_hash', 'protocol_hash'] 

# Standardize numerical features
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])


# Encode attack labels
label_encoder = LabelEncoder()

df['Attack_Encoded'] = label_encoder.fit_transform(df['Attack']) 




In [6]:

# Split data into features (X) and labels (y)
X = df[['src_ip_hash', 'dst_ip_hash', 'protocol_hash','L4_SRC_PORT', 'L4_DST_PORT', 'IN_BYTES', 'OUT_BYTES', 'IN_PKTS', 'OUT_PKTS', 'FLOW_DURATION_MILLISECONDS']]  # Features
y = df['Attack_Encoded']  # Use the encoded 'Attack' column as the label 


# Split data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 

# Print the shapes of the resulting sets
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (480080, 10)
y_train shape: (480080,)
X_test shape: (120020, 10)
y_test shape: (120020,)


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# Create a Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42) 

# Train the model
rf_model.fit(X_train, y_train)


In [8]:

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

# Print classification report
print(classification_report(y_test, y_pred))

# Print confusion matrix
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.7591484752541243
Precision: 0.7588288652064555
Recall: 0.7591484752541243
F1-score: 0.758985404452706
              precision    recall  f1-score   support

           0       0.83      0.83      0.83      2725
           1       0.05      0.05      0.05     11381
           2       0.05      0.05      0.05     11365
           3       0.93      0.93      0.93     94179
           4       0.90      0.85      0.87       370

    accuracy                           0.76    120020
   macro avg       0.55      0.54      0.55    120020
weighted avg       0.76      0.76      0.76    120020

[[ 2274     1     2   448     0]
 [    2   568  7645  3155    11]
 [    2  7522   591  3238    12]
 [  463  3144  3194 87367    11]
 [    2    13    21    21   313]]


In [9]:
# prompt: save the trained model 

import joblib

# Save the trained model to a file
joblib.dump(rf_model, 'random_forest_ids.joblib')


['random_forest_ids.joblib']

In [10]:
from sklearn.preprocessing import OneHotEncoder

# Create One-Hot Encoder
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')  

# Fit and transform training labels
y_train_one_hot = one_hot_encoder.fit_transform(y_train.values.reshape(-1, 1)) 

# Transform test labels
y_test_one_hot = one_hot_encoder.transform(y_test.values.reshape(-1, 1))

In [12]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Concatenate
from tensorflow.keras.models import Model

input_dim = X_train.shape[1] 
latent_dim = 100 
num_classes = y_train_one_hot.shape[1] 

def build_generator(latent_dim, num_classes):
    noise = Input(shape=(latent_dim,))
    labels = Input(shape=(num_classes,)) 
    x = Concatenate()([noise, labels])
    x = Dense(256, activation='relu')(x)
    x = Dense(512, activation='relu')(x)
    x = Dense(1024, activation='relu')(x)
    output = Dense(input_dim, activation='tanh')(x) 
    return Model([noise, labels], output)

def build_discriminator():
    model = tf.keras.Sequential()
    model.add(Dense(512, activation='relu', input_shape=(input_dim + num_classes,)))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    return model

generator = build_generator(latent_dim, num_classes)
discriminator = build_discriminator()

In [13]:
# Compile the discriminator
discriminator.compile(loss='binary_crossentropy', optimizer='adam')
generator.compile(loss='binary_crossentropy', optimizer='adam') 


In [14]:
# Create the combined generator-discriminator model
noise = Input(shape=(latent_dim,))
labels = Input(shape=(num_classes,)) 
fake = generator([noise, labels]) 
discriminator.trainable = False # Freeze discriminator weights during generator training

# Concatenate fake data and labels before feeding to discriminator
merged_data = Concatenate()([fake, labels])  # Concatenate here
valid = discriminator(merged_data)  # Pass the concatenated data

combined = Model([noise, labels], valid)
combined.compile(loss='binary_crossentropy', optimizer='adam')

In [18]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Concatenate
from tensorflow.keras.models import Model, Sequential

# ... (your previous code for generator, discriminator, etc.)

# Training loop
epochs = 100
batch_size = 128

# Wrap the discriminator in a Model to use train_on_batch
discriminator_input = Input(shape=(input_dim + num_classes,))  # Input layer
discriminator_output = discriminator(discriminator_input)
discriminator_model = Model(discriminator_input, discriminator_output)
discriminator_model.compile(loss='binary_crossentropy', optimizer='adam')

# Create the combined generator-discriminator model
noise = Input(shape=(latent_dim,))
labels = Input(shape=(num_classes,))
fake = generator([noise, labels])
discriminator.trainable = False  # Freeze discriminator weights during generator training

# Concatenate fake data and labels before feeding to discriminator
merged_data = Concatenate()([fake, labels])  # Concatenate here
valid = discriminator(merged_data)  # Pass the concatenated data

combined = Model([noise, labels], valid)
combined.compile(loss='binary_crossentropy', optimizer='adam')

# Ensure that the combined model is trainable
for layer in combined.layers:
    if layer != discriminator:  # Allow generator layers to be trained
        layer.trainable = True

for epoch in range(epochs):
    for i in range(X_train.shape[0] // batch_size):
        # Get a random batch of real samples
        idx = np.random.randint(0, X_train.shape[0], batch_size)
        
        # Use .iloc to select rows by their integer location
        real_data = X_train.iloc[idx].values  # Get values as NumPy array
        real_labels = y_train_one_hot[idx]

        # Generate fake samples
        noise = np.random.normal(0, 1, (batch_size, latent_dim))
        fake_samples = generator.predict([noise, real_labels])

        # Train discriminator on real and fake samples
        real_data_with_labels = np.concatenate([real_data, real_labels], axis=1)
        fake_data_with_labels = np.concatenate([fake_samples, real_labels], axis=1)

        # Use discriminator_model for train_on_batch
        d_loss_real = discriminator_model.train_on_batch(real_data_with_labels, np.ones((batch_size, 1)))
        d_loss_fake = discriminator_model.train_on_batch(fake_data_with_labels, np.zeros((batch_size, 1)))
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

        # Train generator (through the combined model)
        noise = np.random.normal(0, 1, (batch_size, latent_dim))
        g_loss = combined.train_on_batch([noise, real_labels], np.ones((batch_size, 1)))

        # Print progress
        # Access the first element of g_loss (assuming it's the loss value)
        print(f"{epoch + 1}, {i + 1}/{X_train.shape[0] // batch_size}, d={d_loss:.4f}, g={g_loss:.4f}")

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
1, 1/3750, d=0.7431, g=0.5724
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
1, 2/3750, d=0.7645, g=0.5748
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
1, 3/3750, d=0.7665, g=0.5749
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
1, 4/3750, d=0.7686, g=0.5746
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
1, 5/3750, d=0.7687, g=0.5742
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
1, 6/3750, d=0.7688, g=0.5739
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
1, 7/3750, d=0.7691, g=0.5736
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
1, 8/3750, d=0.7690, g=0.5735
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
1, 9/3750, d=0.7695, g=0.5734
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
1, 10/3750, d=0

KeyboardInterrupt: 

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Concatenate
from tensorflow.keras.models import Model, Sequential

# ... (your previous code for generator, discriminator, etc.)

# Training loop
epochs = 100
batch_size = 128

# Wrap the discriminator in a Model to use train_on_batch
discriminator_input = Input(shape=(input_dim + num_classes,))  # Input layer
discriminator_output = discriminator(discriminator_input)
discriminator_model = Model(discriminator_input, discriminator_output)
discriminator_model.compile(loss='binary_crossentropy', optimizer='adam')

# Create the combined generator-discriminator model
noise = Input(shape=(latent_dim,))
labels = Input(shape=(num_classes,))
fake = generator([noise, labels])
discriminator.trainable = False  # Freeze discriminator weights during generator training

# Concatenate fake data and labels before feeding to discriminator
merged_data = Concatenate()([fake, labels])  # Concatenate here
valid = discriminator(merged_data)  # Pass the concatenated data

combined = Model([noise, labels], valid)
combined.compile(loss='binary_crossentropy', optimizer='adam')

# Ensure that the combined model is trainable
for layer in combined.layers:
    if layer != discriminator:  # Allow generator layers to be trained
        layer.trainable = True

for epoch in range(epochs):
    for i in range(X_train.shape[0] // batch_size):
        # ... (rest of your training loop)
        
        # Print progress
        # Access the g_loss directly, as it's a scalar
        print(f"{epoch + 1}, {i + 1}/{X_train.shape[0] // batch_size}, d={d_loss:.4f}, g={g_loss:.4f}")

In [2]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Step 1: Load Dataset
csv_path = "data.csv"  # Replace with your dataset file
data = pd.read_csv(csv_path)
# Step 2: Feature Engineering
def hash_ip(ip):
    # Simple hash function for IP addresses
    return hash(ip) % (10**9)

def hash_protocol(protocol):
    # Convert protocol to hashable integer
    return hash(protocol)

# Apply transformations to relevant columns
data['src_ip_hash'] = data['IPV4_SRC_ADDR'].apply(hash_ip)
data['dst_ip_hash'] = data['IPV4_DST_ADDR'].apply(hash_ip)
data['protocol_hash'] = data['PROTOCOL'].apply(hash_protocol)

# Step 3: Select Relevant Features
selected_features = [
    "src_ip_hash", "dst_ip_hash", "protocol_hash",
    "IN_BYTES", "OUT_BYTES", "IN_PKTS", "OUT_PKTS",
    "TCP_FLAGS", "FLOW_DURATION_MILLISECONDS", "Label"
]
data = data[selected_features]

# Step 4: Handle Missing Values
data = data.dropna()  # Drop rows with missing values

# Step 5: Normalize Numerical Features to [0, 1]
numerical_features = [
    "src_ip_hash", "dst_ip_hash", "protocol_hash",
    "IN_BYTES", "OUT_BYTES", "IN_PKTS", "OUT_PKTS",
    "TCP_FLAGS", "FLOW_DURATION_MILLISECONDS"
]
scaler = MinMaxScaler(feature_range=(0, 1))
data[numerical_features] = scaler.fit_transform(data[numerical_features])

# Step 6: Encode Labels
label_encoder = LabelEncoder()
data["Label"] = label_encoder.fit_transform(data["Label"])  # Convert labels to integers


In [3]:
data.head()


Unnamed: 0,src_ip_hash,dst_ip_hash,protocol_hash,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Label
0,1.0,0.038831,1.0,1.884118e-07,5.180502e-07,0.0,1.6e-05,0.0,1.0,0
1,1.0,0.71107,0.3125,0.9541218,0.0008186015,0.119526,0.066258,0.11215,0.972359,1
2,0.789448,0.471248,1.0,0.03727922,0.03666797,0.240242,0.148683,0.0,0.972281,0
3,0.799714,0.820576,0.3125,0.03699055,0.0370587,0.240242,0.148683,0.0,0.972281,0
4,0.789448,0.471248,0.3125,0.03669508,0.0,0.240242,0.0,0.0,0.972281,0
