In [2]:
import pandas as pd

# Load NSL-KDD. The files are read with header=None, so column names are
# assigned explicitly from nsl_columns below.
nsl_train = pd.read_csv("/content/KDDTrain+.txt", header=None)
nsl_test = pd.read_csv("/content/KDDTest+.txt", header=None)

nsl_columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
    'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate',
    'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
    'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    'attack_category', 'label'
]
# NOTE(review): both datasets end up with a column called 'label'. Whether the
# two columns mean the same thing is not visible here -- confirm before using
# the merged 'label' column as a target.
nsl_train.columns = nsl_columns
nsl_test.columns = nsl_columns

# Load UNSW-NB15 (these CSVs are read with their own header row).
unsw_train = pd.read_csv("/content/UNSW_NB15_training-set.csv")
unsw_test = pd.read_csv("/content/UNSW_NB15_testing-set.csv")

unsw_columns = ['id', 'dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss',
       'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
       'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
       'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat', 'label']

# Rename the UNSW columns that have an NSL-KDD counterpart so they line up when
# the two datasets are concatenated, and drop the row identifier.
# Non-inplace calls are used on purpose: unsw_test is a column selection of the
# loaded frame, and mutating such a selection in place can raise
# SettingWithCopyWarning.
unsw_renames = {'dur': 'duration', 'proto': 'protocol_type', 'attack_cat': 'attack_category'}
unsw_train = unsw_train.rename(columns=unsw_renames).drop(columns=['id'])
unsw_test = unsw_test[unsw_columns].rename(columns=unsw_renames).drop(columns=['id'])

# Merge all data: first within each dataset, then across datasets.
nsl_data = pd.concat([nsl_train, nsl_test], axis=0).reset_index(drop=True)
unsw_data = pd.concat([unsw_train, unsw_test], axis=0).reset_index(drop=True)

# join="outer" keeps the union of columns; a column that exists in only one
# dataset is NaN for the rows coming from the other dataset.
df = pd.concat([nsl_data, unsw_data], axis=0, join="outer").reset_index(drop=True)

# Backup attack_category if it exists. .copy() makes this an independent
# snapshot, so later in-place edits to df cannot change it.
if 'attack_category' in df.columns:
    labels = df['attack_category'].copy()
else:
    labels = None


In [None]:
# Fill NaNs in categorical columns with an explicit 'unknown' token.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df[col] operates on an intermediate object, which pandas 2.x warns about and
# which does not update df when Copy-on-Write is enabled.
for col in ['protocol_type', 'service', 'flag', 'state', 'attack_category']:
    if col in df.columns:
        df[col] = df[col].fillna('unknown')

# Fill numerical NaNs
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[num_cols] = df[num_cols].fillna(0)

# Drop columns with >50% missing values
# NOTE(review): this runs after the fills above, so it can only remove columns
# that neither fill touched -- confirm whether the drop was meant to come first.
df = df.loc[:, df.isnull().mean() < 0.5]

# List the non-numeric columns that still need encoding
non_numeric_cols = df.select_dtypes(exclude=['number']).columns.tolist()
print(f"Non-numeric columns: {non_numeric_cols}")


In [None]:
# Categorical columns: replace missing entries with the 'unknown' token.
# Only columns that are actually present in df are touched.
present_categoricals = [
    name
    for name in ('protocol_type', 'service', 'flag', 'attack_category', 'state')
    if name in df.columns
]
for col in present_categoricals:
    filled = df[col].fillna('unknown')
    df[col] = filled

# Numeric columns (float64 / int64): missing entries become 0.
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[num_cols] = df[num_cols].fillna(0)

# Keep only the columns whose share of missing values is below 50%.
missing_share = df.isnull().mean()
df = df.loc[:, missing_share < 0.5]

# Report which columns are still non-numeric and will need encoding.
non_numeric_cols = df.select_dtypes(exclude=['number']).columns.tolist()
print(f"🧾 Non-numeric columns: {non_numeric_cols}")


In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# ➤ Step 1: Handle missing values
# Categorical columns get an explicit 'unknown' token.
for col in ['protocol_type', 'service', 'flag', 'attack_category', 'state']:
    if col in df.columns:
        df.loc[:, col] = df[col].fillna('unknown')

# Fill numerical NaNs
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
df.loc[:, num_cols] = df[num_cols].fillna(0)

# Drop columns with >50% missing values
df = df.loc[:, df.isnull().mean() < 0.5]

# ➤ Step 2: Preserve attack_category
if 'attack_category' in df.columns:
    attack_category = df['attack_category'].copy()
    print("✅ attack_category preserved separately")

# ➤ Step 3: Copy features for transformation
# Both target-like columns are kept out of the feature matrix. Previously only
# 'attack_category' was removed, so the numeric 'label' column passed the
# numeric check in Step 6 and was scaled and fed to the autoencoder as a feature.
# NOTE(review): 'label' is presumably a target/annotation column -- confirm.
target_cols = [col for col in ('attack_category', 'label') if col in df.columns]
X = df.drop(columns=target_cols)

# ➤ Step 4: One-hot encode 'protocol_type', 'flag', 'state'
categorical_cols = [col for col in ['protocol_type', 'flag', 'state'] if col in X.columns]
if categorical_cols:
    X = pd.get_dummies(X, columns=categorical_cols, dtype=int)
    print(f"✅ One-hot encoded: {categorical_cols}")

# ➤ Step 5: Frequency encode 'service'
# Each service value is replaced by the number of rows in which it occurs.
if 'service' in X.columns:
    service_counts = X['service'].value_counts().to_dict()
    X['service'] = X['service'].map(service_counts)
    print("✅ Frequency encoded: service")

# ➤ Step 6: Double-check for non-numeric leftovers
# Anything still non-numeric cannot be scaled, so it is reported and dropped.
non_numeric_cols = X.select_dtypes(exclude=['number']).columns.tolist()
if non_numeric_cols:
    print(f"⚠️ Dropping unexpected non-numeric cols: {non_numeric_cols}")
    X.drop(columns=non_numeric_cols, inplace=True)

# ➤ Step 7: Min-Max Scaling
# The scaler is fitted on every row of X.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

print("✅ Final Scaling Done!")
print(f"🔢 Final shape for Autoencoder: {X_scaled.shape}")


In [None]:
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

import numpy as np

input_dim = X_scaled.shape[1]
encoding_dim = 64  # You can tune this

# Input layer
input_layer = layers.Input(shape=(input_dim,))

# Encoder: 256 -> 128 -> encoding_dim
encoded = layers.Dense(256, activation='relu')(input_layer)
encoded = layers.Dense(128, activation='relu')(encoded)
bottleneck = layers.Dense(encoding_dim, activation='relu', name='bottleneck')(encoded)

# Decoder: mirrors the encoder. The sigmoid output matches the [0, 1] range
# produced by the Min-Max scaling of the inputs.
decoded = layers.Dense(128, activation='relu')(bottleneck)
decoded = layers.Dense(256, activation='relu')(decoded)
output_layer = layers.Dense(input_dim, activation='sigmoid')(decoded)

# Autoencoder model
autoencoder = Model(inputs=input_layer, outputs=output_layer)
autoencoder.compile(optimizer=optimizers.Adam(learning_rate=0.001), loss='mse')

# Encoder model for feature extraction (shares its layers with the autoencoder)
encoder = Model(inputs=input_layer, outputs=bottleneck)

# Keras carves the validation_split fraction off the END of the arrays, before
# any shuffling. The rows of X_scaled are ordered NSL-KDD first, UNSW-NB15
# last, so the unshuffled tail would make validation (and early stopping) see
# only UNSW-NB15 rows. Training on a seeded permutation makes the held-out 20%
# a random sample of both datasets.
shuffle_idx = np.random.default_rng(42).permutation(X_scaled.shape[0])
X_fit = X_scaled[shuffle_idx]

# Train autoencoder (input reconstructs itself)
history = autoencoder.fit(
    X_fit, X_fit,
    epochs=50,
    batch_size=1024,
    shuffle=True,
    validation_split=0.2,
    callbacks=[EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)],
    verbose=1
)
del X_fit  # the shuffled copy is only needed for training

# Encode in the ORIGINAL row order so X_compressed stays aligned with the labels.
X_compressed = encoder.predict(X_scaled)

In [None]:
from sklearn.model_selection import train_test_split

# `y` was not defined anywhere before this cell, so the split raised NameError
# on a fresh kernel. Build it from the attack_category Series preserved during
# preprocessing: each distinct category is mapped to an integer id
# 0..n_classes-1, and class_names[i] gives the name back for id i.
class_names = sorted(attack_category.astype(str).unique())
class_to_id = {name: idx for idx, name in enumerate(class_names)}
y = attack_category.astype(str).map(class_to_id).to_numpy()

# Features and labels must describe the same rows in the same order.
assert len(y) == len(X_compressed), "label / feature row count mismatch"

X_train, X_test, y_train, y_test = train_test_split(X_compressed, y, test_size=0.2, random_state=42)

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Gradient-boosted tree classifier trained on the compressed features.
xgb_params = {
    'n_estimators': 200,
    'max_depth': 6,
    'learning_rate': 0.05,
}
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

# Score the held-out split.
preds = xgb.predict(X_test)
test_accuracy = accuracy_score(y_test, preds)
print("Accuracy:", test_accuracy)