In [1]:
# Notebook: contrastive_tabular_simclr.ipynb
# Requirements: torch, torchvision, sklearn, pandas, numpy, tqdm, matplotlib

import os, random
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Create folder
!mkdir -p /content/data/nsl_kdd

# Download NSL-KDD train/test text files
!wget -q -O /content/data/nsl_kdd/KDDTrain+.txt https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain+.txt
!wget -q -O /content/data/nsl_kdd/KDDTest+.txt  https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTest+.txt

In [None]:
# 1. Load dataset (same assumption as Notebook 1)
DATA_PATH = "/content/data/nsl_kdd/KDDTrain+.txt"  # replace
# Load the dataset without a header
df = pd.read_csv(DATA_PATH, header=None)
# The second to last column is the attack type label (index -2)
label_col_index = df.columns[-2]

# Print unique values in the original label column to help diagnose
print("Unique original labels:", df[label_col_index].unique())

# Map 'normal' and 'normal.' to 0, and all other labels to 1
# Strip whitespace and convert to lower case for more robust matching
df['binary_label'] = df[label_col_index].apply(lambda x: 0 if str(x).strip().lower() in ['normal', 'normal.'] else 1)

# Print the count of each binary label to confirm both classes are present
print("Binary label counts:\n", df['binary_label'].value_counts())

Unique original labels: ['normal' 'neptune' 'warezclient' 'ipsweep' 'portsweep' 'teardrop' 'nmap'
 'satan' 'smurf' 'pod' 'back' 'guess_passwd' 'ftp_write' 'multihop'
 'rootkit' 'buffer_overflow' 'imap' 'warezmaster' 'phf' 'land'
 'loadmodule' 'spy' 'perl']
Binary label counts:
 binary_label
0    67343
1    58630
Name: count, dtype: int64


In [None]:
# 2. Preprocess (scale numerics, one-hot category)
# Explicitly define known categorical columns for NSL-KDD
cat_cols = [1, 2, 3] # protocol_type, service, flag

# Assuming all other feature columns (0-37 excluding the label and the explicit cat cols) are numeric
all_features_indices = list(range(38))
num_cols = [col for col in all_features_indices if col not in cat_cols]

# Get the actual column names from the indices - this step is not needed if we use indices directly in ColumnTransformer
# num_col_names = df.columns[num_cols].tolist()
# cat_col_names = df.columns[cat_cols].tolist()

transformer = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
], remainder='drop')

# Select only the feature columns (0-37) for transformation
X_all = transformer.fit_transform(df[all_features_indices])
y_all = df['binary_label'].values
print("Feature dim:", X_all.shape)

Feature dim: (125973, 119)


In [None]:
# 3. Create PyTorch Dataset with simple augmentations for contrastive learning
class TabularContrastiveDataset(Dataset):
    def __init__(self, X):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.n = self.X.shape[0]
    def __len__(self):
        return self.n
    def augment(self, x):
        # Two simple augmentations: gaussian noise and feature dropout (mask)
        x = x.clone()
        # gaussian noise
        noise = torch.randn_like(x) * 0.01
        x = x + noise
        # random feature mask
        mask = (torch.rand_like(x) > 0.1).float()  # 10% drop
        x = x * mask
        return x
    def __getitem__(self, idx):
        x = self.X[idx]
        xi = self.augment(x)
        xj = self.augment(x)
        return xi, xj

In [None]:
# 4. Tiny MLP encoder and projection head
class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim=256, proj_dim=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU()
        )
        self.projector = nn.Sequential(
            nn.Linear(hidden_dim, proj_dim),
            nn.ReLU(),
            nn.Linear(proj_dim, proj_dim)
        )
    def forward(self, x):
        h = self.net(x)
        z = self.projector(h)
        return h, z

In [None]:
# 5. NT-Xent contrastive loss implementation
def nt_xent_loss(z_i, z_j, temperature=0.5):
    # z_i, z_j: (batch_size, proj_dim)
    batch_size = z_i.shape[0]
    z = torch.cat([z_i, z_j], dim=0)  # 2N x d
    z = nn.functional.normalize(z, dim=1)
    sim = torch.matmul(z, z.T)  # 2N x 2N
    # create mask to remove similarity with itself
    mask = (~torch.eye(2*batch_size, dtype=bool)).to(z.device)
    positives = torch.cat([torch.diag(sim, batch_size), torch.diag(sim, -batch_size)], dim=0)
    nom = torch.exp(positives / temperature)
    denom = torch.sum(torch.exp(sim / temperature) * mask.float(), dim=1)
    loss = -torch.log(nom / denom)
    return loss.mean()

In [None]:
# 6. Prepare dataset loaders (use only training split for SSL)
X_train_ssl, X_rest, y_train_ssl, y_rest = train_test_split(X_all, y_all, test_size=0.5, random_state=42, stratify=y_all)

# Check if X_train_ssl is a sparse matrix and convert to dense if necessary
if hasattr(X_train_ssl, 'toarray'):
    X_train_ssl_dense = X_train_ssl.toarray()
else:
    X_train_ssl_dense = X_train_ssl

ssl_dataset = TabularContrastiveDataset(X_train_ssl_dense)

ssl_loader = DataLoader(ssl_dataset, batch_size=256, shuffle=True, drop_last=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder = Encoder(input_dim=X_all.shape[1], hidden_dim=256, proj_dim=64).to(device)
optimizer = optim.Adam(encoder.parameters(), lr=1e-3, weight_decay=1e-6)

In [None]:
# 7. Train encoder with contrastive loss
epochs = 30
for epoch in range(epochs):
    encoder.train()
    total_loss = 0.0
    for xi, xj in ssl_loader:
        xi = xi.to(device); xj = xj.to(device)
        _, zi = encoder(xi)
        _, zj = encoder(xj)
        loss = nt_xent_loss(zi, zj, temperature=0.5)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg = total_loss / len(ssl_loader)
    print(f"Epoch {epoch+1}/{epochs} - contrastive loss: {avg:.4f}")

Epoch 1/30 - contrastive loss: 4.6214
Epoch 2/30 - contrastive loss: 4.5303
Epoch 3/30 - contrastive loss: 4.5100
Epoch 4/30 - contrastive loss: 4.4958
Epoch 5/30 - contrastive loss: 4.4890
Epoch 6/30 - contrastive loss: 4.4781
Epoch 7/30 - contrastive loss: 4.4728
Epoch 8/30 - contrastive loss: 4.4690
Epoch 9/30 - contrastive loss: 4.4650
Epoch 10/30 - contrastive loss: 4.4635
Epoch 11/30 - contrastive loss: 4.4615
Epoch 12/30 - contrastive loss: 4.4549
Epoch 13/30 - contrastive loss: 4.4543
Epoch 14/30 - contrastive loss: 4.4509
Epoch 15/30 - contrastive loss: 4.4490
Epoch 16/30 - contrastive loss: 4.4479
Epoch 17/30 - contrastive loss: 4.4425
Epoch 18/30 - contrastive loss: 4.4423
Epoch 19/30 - contrastive loss: 4.4390
Epoch 20/30 - contrastive loss: 4.4360
Epoch 21/30 - contrastive loss: 4.4357
Epoch 22/30 - contrastive loss: 4.4333
Epoch 23/30 - contrastive loss: 4.4309
Epoch 24/30 - contrastive loss: 4.4289
Epoch 25/30 - contrastive loss: 4.4287
Epoch 26/30 - contrastive loss: 4.

In [None]:
# 8. Build representation for downstream classification (freeze encoder)
encoder.eval()
with torch.no_grad():
    X_repr_list = [] # Use a different name to avoid confusion with the final X_repr
    batch_size = 512
    # Check if X_all is sparse
    is_sparse = hasattr(X_all, 'todense')

    for i in range(0, X_all.shape[0], batch_size):
        xb_slice = X_all[i:i+batch_size]
        if is_sparse:
            # Convert the sparse slice to a dense NumPy array before creating the tensor
            xb_dense = xb_slice.todense()
        else:
            # If X_all is already dense, use the slice directly
            xb_dense = xb_slice

        xb = torch.tensor(xb_dense, dtype=torch.float32).to(device)
        h, z = encoder(xb)
        # use normalized z as representation
        X_repr_list.append(nn.functional.normalize(z, dim=1).cpu().numpy())

# Stack the list of arrays into a single NumPy array
if X_repr_list: # Check if the list is not empty
    X_repr = np.vstack(X_repr_list)
else:
    X_repr = np.array([]) # Create an empty numpy array if the list is empty
    print("Warning: X_repr_list is empty. No representations were generated.")

# Optional: Print the shape of X_repr to verify
print(f"Shape of generated X_repr: {X_repr.shape}")

Shape of generated X_repr: (125973, 64)


In [None]:
# 9. Train/evaluate a classifier (LogisticRegression and RandomForest for comparison)

# Check shapes before splitting
print(f"Shape of X_repr: {X_repr.shape}")
print(f"Shape of y_all: {y_all.shape}")

if X_repr.shape[0] != y_all.shape[0]:
    print("Error: Number of samples in X_repr and y_all do not match.")
    print("Please check the representation building step (cell 8) to ensure X_repr is generated correctly.")
elif len(np.unique(y_all)) < 2:
     print("Error: The target variable y_all contains only one class. Cannot train a binary classifier.")
     print("Please check your data loading and preprocessing steps to ensure both classes are present in the original data.")
else:
    X_train, X_test, y_train, y_test = train_test_split(X_repr, y_all, test_size=0.3, random_state=42, stratify=y_all)

    # Check if there are at least two classes in y_train after splitting
    if len(np.unique(y_train)) < 2:
        print("Error: The training data (y_train) contains only one class after splitting. Cannot train a binary classifier.")
        print("Please check the stratification in the train_test_split or the original y_all.")
    else:
        clf = LogisticRegression(max_iter=1000)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print("Logistic Regression on contrastive reps")
        print("Accuracy:", accuracy_score(y_test, y_pred))
        print(classification_report(y_test, y_pred, digits=4))
        print("ROC AUC:", roc_auc_score(y_test, clf.predict_proba(X_test)[:,1]))

        # You can add the RandomForestClassifier evaluation here as well if needed
        # clf_rf = RandomForestClassifier(n_estimators=100, random_state=42)
        # clf_rf.fit(X_train, y_train)
        # y_pred_rf = clf_rf.predict(X_test)
        # print("\nRandom Forest on contrastive reps")
        # print("Accuracy:", accuracy_score(y_test, y_pred_rf))
        # print(classification_report(y_test, y_pred_rf, digits=4))
        # print("ROC AUC:", roc_auc_score(y_test, clf_rf.predict_proba(X_test)[:,1]))

Shape of X_repr: (125973, 64)
Shape of y_all: (125973,)
Logistic Regression on contrastive reps
Accuracy: 0.9739627434377646
              precision    recall  f1-score   support

           0     0.9704    0.9812    0.9758     20203
           1     0.9782    0.9656    0.9718     17589

    accuracy                         0.9740     37792
   macro avg     0.9743    0.9734    0.9738     37792
weighted avg     0.9740    0.9740    0.9740     37792

ROC AUC: 0.9967891060097844
