In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
import numpy as np

# Load the NSL-KDD training and testing datasets.
# header=None: the files are read without a header row, so pandas labels the
# columns 0..N-1 until the names below are assigned.
train_data = pd.read_csv('KDDTrain+.txt', header=None)
test_data = pd.read_csv('KDDTest+.txt', header=None)

# Print the first few rows of the datasets to inspect the structure
print(train_data.head())
print(test_data.head())

# Define column names for the NSL-KDD dataset (the feature columns followed by 'label')
columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
    'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate',
    'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label'
]

# Accept either layout per file: exactly len(columns) columns, or one extra
# trailing column (the printed sample shows a numeric column after the label).
# NOTE(review): a file with exactly len(columns) columns is assumed to use the
# same column order with 'label' last - confirm for any non-standard download.
expected_width = len(columns)
accepted_widths = {expected_width, expected_width + 1}

if train_data.shape[1] in accepted_widths and test_data.shape[1] in accepted_widths:
    # Positional slice instead of an in-place drop: keeps the named columns,
    # discards the optional trailing one, and yields fresh frames so the
    # raw read is never mutated behind the reader's back.
    train_data = train_data.iloc[:, :expected_width].copy()
    test_data = test_data.iloc[:, :expected_width].copy()
    train_data.columns = columns
    test_data.columns = columns
else:
    # Best-effort: report and carry on; later code checks for the 'label' column.
    print("Column length mismatch persists. Further inspection needed.")

# Proceed only if column names are correctly assigned
if 'label' in train_data.columns and 'label' in test_data.columns:
    # Separate features and labels
    X_train = train_data.drop(columns=['label'])
    X_test = test_data.drop(columns=['label'])

    # Boolean mask of benign training rows. Assumes benign traffic carries the
    # label 'normal' (as in the printed sample rows); strip/lower guards against
    # stray whitespace or case differences in the file.
    train_labels = train_data['label'].astype(str).str.strip().str.lower()
    normal_mask = (train_labels == 'normal').to_numpy()

    # Preprocess categorical features
    categorical_features = ['protocol_type', 'service', 'flag']
    numeric_features = X_train.columns.difference(categorical_features)

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            # handle_unknown='ignore': a category that appears only in the test
            # file is encoded as all zeros instead of raising during transform.
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        # Always return a dense array; the arithmetic below (subtraction,
        # np.power, np.mean over axis=1) expects plain ndarrays.
        sparse_threshold=0)

    # Create a preprocessing pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

    # Fit on the training data, then apply the same transform to the test data
    X_train_scaled = pipeline.fit_transform(X_train)
    X_test_scaled = pipeline.transform(X_test)

    # The autoencoder should learn to reconstruct benign traffic only, so that
    # attack rows stand out through a larger reconstruction error.
    if normal_mask.any():
        X_train_normal = X_train_scaled[normal_mask]
    else:
        print("No 'normal' rows found in the training labels; training on all rows instead.")
        X_train_normal = X_train_scaled

    # Define the autoencoder model
    input_dim = X_train_scaled.shape[1]
    encoding_dim = 14  # Dimension of the latent space

    input_layer = Input(shape=(input_dim,))
    encoder = Dense(encoding_dim, activation='relu')(input_layer)
    # Linear output: the targets are standardized (zero-mean, unbounded), which a
    # sigmoid restricted to (0, 1) cannot reproduce.
    decoder = Dense(input_dim, activation='linear')(encoder)

    autoencoder = Model(inputs=input_layer, outputs=decoder)
    autoencoder.compile(optimizer='adam', loss='mean_squared_error')

    # Train the model. The test set is passed only so its loss is reported each
    # epoch; no callback or model selection in this cell depends on it.
    history = autoencoder.fit(X_train_normal, X_train_normal,
                              epochs=50,
                              batch_size=256,
                              shuffle=True,
                              validation_data=(X_test_scaled, X_test_scaled))

    # Reconstruction error on the rows the model was trained on; this defines
    # what "typical" error looks like.
    X_train_pred = autoencoder.predict(X_train_normal)
    train_mse = np.mean(np.power(X_train_normal - X_train_pred, 2), axis=1)

    # Predict on the test set
    X_test_pred = autoencoder.predict(X_test_scaled)

    # Calculate the per-row reconstruction error
    mse = np.mean(np.power(X_test_scaled - X_test_pred, 2), axis=1)

    # Set the threshold at the 95th percentile of the TRAINING errors. Taking
    # the percentile of the test errors themselves would flag ~5% of the test
    # rows by construction, regardless of what the model learned.
    threshold = np.percentile(train_mse, 95)

    # Flag anomalies
    anomalies = mse > threshold

    # Print the results
    print(f"Number of anomalies detected: {np.sum(anomalies)}")
else:
    print("Column assignment failed. Please check the dataset and column names.")


   0    1         2   3    4     5   6   7   8   9   ...    33    34    35  \
0   0  tcp  ftp_data  SF  491     0   0   0   0   0  ...  0.17  0.03  0.17   
1   0  udp     other  SF  146     0   0   0   0   0  ...  0.00  0.60  0.88   
2   0  tcp   private  S0    0     0   0   0   0   0  ...  0.10  0.05  0.00   
3   0  tcp      http  SF  232  8153   0   0   0   0  ...  1.00  0.00  0.03   
4   0  tcp      http  SF  199   420   0   0   0   0  ...  1.00  0.00  0.00   

     36    37    38    39    40       41  42  
0  0.00  0.00  0.00  0.05  0.00   normal  20  
1  0.00  0.00  0.00  0.00  0.00   normal  15  
2  0.00  1.00  1.00  0.00  0.00  neptune  19  
3  0.04  0.03  0.01  0.00  0.01   normal  21  
4  0.00  0.00  0.00  0.00  0.00   normal  21  

[5 rows x 43 columns]
   0     1         2     3      4   5   6   7   8   9   ...    33    34    35  \
0   0   tcp   private   REJ      0   0   0   0   0   0  ...  0.04  0.06  0.00   
1   0   tcp   private   REJ      0   0   0   0   0   0  ...  0.0

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

# Define the autoencoder model.
# This cell relies on kernel state from the preprocessing cell: X_train_scaled,
# X_test_scaled and train_data (with its 'label' column) must already exist.
input_dim = X_train_scaled.shape[1]
encoding_dim = 14  # Dimension of the latent space

input_layer = Input(shape=(input_dim,))
encoder = Dense(encoding_dim, activation='relu')(input_layer)
# Linear output: the inputs were standardized with StandardScaler, so targets
# include negative values and values above 1 that a sigmoid cannot produce.
decoder = Dense(input_dim, activation='linear')(encoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Fit on benign rows only so attacks show up as poorly reconstructed inputs.
# Assumes benign traffic is labelled 'normal' and that X_train_scaled keeps the
# row order of train_data (it is produced from train_data without reordering).
normal_rows = (train_data['label'].astype(str).str.strip().str.lower() == 'normal').to_numpy()
X_fit = X_train_scaled[normal_rows] if normal_rows.any() else X_train_scaled

# Train the model; the test set is supplied only to report its loss per epoch.
history = autoencoder.fit(X_fit, X_fit,
                          epochs=50,
                          batch_size=256,
                          shuffle=True,
                          validation_data=(X_test_scaled, X_test_scaled))
