In [5]:
import torch 
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import DataLoader,TensorDataset
import torch.nn.functional as F
import torch.nn as nn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import random as rand
from sklearn.preprocessing import LabelEncoder


## data preparation

In [6]:
data=pd.read_csv("/kaggle/input/ctu-13/CTU_13.csv")
le = LabelEncoder()
lst = []
for i in data['Label']:
        if 'Botnet' in i:
            lst.append(1)
        elif 'Normal' in i:
            lst.append(2)
        else:
            lst.append(0)
data['Label']=lst


  data=pd.read_csv("/kaggle/input/ctu-13/CTU_13.csv")


In [26]:

data=data[data['Label']!=0]
direction_number = []
for i in data['Dir']:
        if i == '  <->':
            direction_number.append(1)
        elif i == '   ->':
            direction_number.append(2)
        else:
            direction_number.append(0)
data['Dir'] = direction_number
dataset=data.drop(columns=['Unnamed: 0'],inplace=False)
dataset.dropna(inplace=True)
dt = pd.to_datetime(dataset['StartTime'], format='%Y/%m/%d %H:%M:%S.%f')
df = pd.DataFrame({'datetime': dt})
dataset['year'] = df['datetime'].dt.year
dataset['month'] = df['datetime'].dt.month
dataset['day'] = df['datetime'].dt.day
dataset['hour'] = df['datetime'].dt.hour
dataset['minute'] = df['datetime'].dt.minute
dataset['second'] = df['datetime'].dt.second
dataset['dayofweek'] = df['datetime'].dt.dayofweek  # Monday=0, Sunday=6
dataset['is_weekend'] = dataset['dayofweek'].isin([5,6]).astype(int)
dataset.drop(columns=['StartTime'],inplace=True)
#fixed sprint2 added the packetlength column
dataset['packet_length'] = dataset['TotBytes'] / dataset['TotPkts']
cat_cols=['Proto','SrcAddr','DstAddr','Dport','State']
for col in cat_cols:
        numeric_labels = le.fit_transform(dataset[col])
        dataset[col]=numeric_labels
numeric_labels = le.fit_transform(dataset['Sport'].astype('str'))
dataset['Sport']=numeric_labels
total_len = len(dataset)
datasetlabeled=dataset.copy()
dataset.drop(columns=['Label'],inplace=True)
# 80% for encoder (unlabeled)
encoder_data = dataset[:int(total_len * 0.8)]
# Next 10% for linear classifier training (labeled)
lineardata = datasetlabeled[int(total_len * 0.8):int(total_len * 0.9)]
# Final 10% for evaluation (labeled)
datasetevaluation = datasetlabeled[int(total_len * 0.9):]



## architecture of byol

In [58]:
#inline network
class encoder(nn.Module):
    def __init__(self, input_size=22, hidden1=256, hidden2=128, output_size=64):
        super().__init__()
        #aka a simple mlp
        self.fc1=nn.Linear(input_size,hidden1)
        self.fc2=nn.Linear(hidden1,hidden2)
        self.fc3=nn.Linear(hidden2,output_size)
        self.act=nn.ReLU()
    def forward(self,x):
        x=self.fc1(x)
        x=self.fc2(self.act(x))
        return self.fc3(x)
# Projector: small MLP
def projector_fn(input_dim=64, proj_dim=128, hidden_dim=256):
    return nn.Sequential(
        nn.Linear(input_dim, hidden_dim),
        nn.BatchNorm1d(hidden_dim),
        nn.ReLU(),
        nn.Linear(hidden_dim, proj_dim)
    )

# Predictor: small MLP (only online network has this)
def predictor_fn(proj_dim=128, hidden_dim=256):
    return nn.Sequential(
        nn.Linear(proj_dim, hidden_dim),
        nn.BatchNorm1d(hidden_dim),
        nn.ReLU(),
        nn.Linear(hidden_dim, proj_dim)
    )
class Online(nn.Module):
    def __init__(self,input_size=22,hidden_size=256,output_size=64,proj_dim=128):
        super().__init__()
        self.encoder=encoder(input_size=input_size, hidden1=hidden_size, hidden2=128, output_size=output_size)
        self.projector = projector_fn(output_size, proj_dim)
        self.predictor = predictor_fn(proj_dim)
    def forward(self,x,return_features=False):
        h=self.encoder(x)
        z=self.projector(h)
        p=self.predictor(z)
        if return_features:
            return h
        return z,p
class Target(nn.Module):
    def __init__(self,input_size=22,hidden_size=256,output_size=64,proj_dim=128):
        super().__init__()
        self.encoder=encoder(input_size=input_size, hidden1=hidden_size, hidden2=128, output_size=output_size)
        self.projector = projector_fn(output_size, proj_dim)
    def forward(self,x):
        x=self.encoder(x)
        x=self.projector(x)
        return x

## noise function

In [50]:
def noisify(data):
    data = data.copy()
    np.random.seed(42)

    # Random packet loss or dropped features
    nb_features = len(data.columns)
    num_drops = rand.randint(1, 3)

    # the indexes to the random drops
    drop_indices = np.random.randint(0, nb_features, size=(len(data), num_drops))

    # cool way to do it
    rows = np.repeat(np.arange(len(data)), num_drops)
    cols = drop_indices.flatten()
    data.values[rows, cols] = np.nan  # when i do nan it gets me erro later , until i figure out how
                                # not have to deal with that and fix the masked loss function IT IS STAYING 0000

    # Swapping Source/Destination IPs
    # fixed sprint2 no changing column places
    # Randomly pick 10% of rows to swap
    mask = np.random.rand(len(data)) < 0.1
    data.loc[mask, ['SrcAddr', 'DstAddr']] = data.loc[mask, ['DstAddr', 'SrcAddr']].values

    # Changing Protocol Numbers (TCP → UDP)
    protocol_map = {6: 17, 17: 6}  # TCP <-> UDP
    # Apply noise to 5-10% of rows
    mask = np.random.rand(len(data)) < 0.05
    data.loc[mask, 'Proto'] = data.loc[mask, 'Proto'].map(lambda x: protocol_map.get(x, x))

    # Slightly Modifying Packet Lengths or Time Intervals
    # fixed sprint2 those tow lignes are useless
    # data['packet_length'] = data['TotBytes'] / data['TotPkts']
    # data['packet_length'].replace(np.nan,0)
    noise_fraction = 0.05
    mask = np.random.rand(len(data)) < 0.1  # apply to 10% of rows
    # data['packet_length'] = data['TotBytes'] / data['TotPkts']
    data.loc[mask, 'packet_length'] = data.loc[mask, 'packet_length'] * (
        1 + np.random.uniform(-noise_fraction, noise_fraction, mask.sum())
    )

    # Modify timestamps by a small jitter
    # to fix see if keep adjusted time per second only or more
    # just in seconds cause minute +- is too much
    time_jitter = 2  # seconds
    mask = np.random.rand(len(data)) < 0.1
    data.loc[mask, 'second'] = data.loc[mask, 'second'] + np.random.uniform(
        -time_jitter, time_jitter, mask.sum()
    )

    # feature level noise
    row_noise_fraction = 0.1  # 10%
    # Standard deviation as a fraction of the original value
    std_fraction = 0.05  # ±5%
    for col in data.columns:
        mask = np.random.rand(len(data)) < row_noise_fraction
        noise = data.loc[mask, col] * np.random.normal(0, std_fraction, mask.sum())
        data.loc[mask, col] = data.loc[mask, col] + noise

    return data


In [51]:
train_encoder=noisify(encoder_data)

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [52]:
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

encoder_data = scaler.fit_transform(encoder_data)
train_encoder = scaler.fit_transform(train_encoder)


In [47]:
print(encoder_data.shape,train_encoder.shape)

(451032, 22) (451032, 22)


In [38]:
X_tensor = torch.tensor(encoder_data, dtype=torch.float32) 
X_tensor2 = torch.tensor(train_encoder, dtype=torch.float32) 


In [39]:
flow_dataset = TensorDataset(X_tensor,X_tensor2) 


In [40]:
dataloader = DataLoader(flow_dataset, batch_size=256, shuffle=True)


In [41]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [59]:
onlinenn=Online().to(device)
targetnn=Target().to(device)

In [60]:
optimizeronline = torch.optim.Adam(list(onlinenn.parameters()) , lr=1e-3)


In [61]:
@torch.no_grad()
def update_target(online, target, m=0.99):  # momentum = 0.99
    for param_o, param_t in zip(online.parameters(), target.parameters()):
        param_t.data = m * param_t.data + (1 - m) * param_o.data

In [62]:
import torch.nn.functional as F


In [64]:
def byol_loss(p, z): #negative cosine similarity (official BYOL paper)
    p = F.normalize(p, dim=-1)
    z = F.normalize(z, dim=-1)
    return - (p * z).sum(dim=-1).mean()

In [65]:
epochs=20
for epoch in range(epochs):
    totalloss=0
    for batch,batchnoise in dataloader:
        batch = batch.to(device)
        batchnoise = batchnoise.to(device)
        x1, x2 = batch,batchnoise

        
        z1, p1 = onlinenn(x1)
        z2, p2 = onlinenn(x2)
        t1 = targetnn(x1)
        t2 = targetnn(x2)
        
        loss = byol_loss(p1, t2) + byol_loss(p2, t1)
        
        optimizeronline.zero_grad()
        loss.backward()
        optimizeronline.step()
        update_target(onlinenn, targetnn)
        totalloss+=loss
    avgloss=totalloss/(len(dataloader))
    print(f"Epoch {epoch+1} loss {avgloss}")

Epoch 1 loss -1.9731650352478027
Epoch 2 loss -1.989047646522522
Epoch 3 loss -1.9889308214187622
Epoch 4 loss -1.9879230260849
Epoch 5 loss -1.9885845184326172
Epoch 6 loss -1.9894994497299194
Epoch 7 loss -1.9886245727539062
Epoch 8 loss -1.9891859292984009
Epoch 9 loss -1.989040732383728
Epoch 10 loss -1.988132119178772
Epoch 11 loss -1.9878476858139038
Epoch 12 loss -1.9878345727920532
Epoch 13 loss -1.9876519441604614
Epoch 14 loss -1.9874306917190552
Epoch 15 loss -1.986860752105713
Epoch 16 loss -1.986513376235962
Epoch 17 loss -1.985713005065918
Epoch 18 loss -1.9843149185180664
Epoch 19 loss -1.982336163520813
Epoch 20 loss -1.9798684120178223


### eval

## random forest

In [75]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=42)


In [77]:
X_labeled=lineardata.drop(columns=['Label'],inplace=False)
y_labels=lineardata['Label']
y_labels = torch.tensor(y_labels.values, dtype=torch.long)

In [78]:
unique_labels = torch.unique(y_labels)
label_map = {old.item(): new for new, old in enumerate(unique_labels)}
y_labels = torch.tensor([label_map[int(l)] for l in y_labels], dtype=torch.long)
X_tensor = torch.tensor(X_labeled.values, dtype=torch.float32).to(device)
      

In [79]:
with torch.no_grad():
    embeddings = onlinenn(X_tensor,True)

In [80]:
rf.fit(embeddings, y_labels)

## eval

In [82]:
X_eval = datasetevaluation.drop(columns=['Label']).select_dtypes(include=[np.number])
y_eval = datasetevaluation['Label'].map(label_map)

X_eval_tensor = torch.tensor(X_eval.values, dtype=torch.float32).to(device)

with torch.no_grad():
    eval_embeddings = onlinenn.encoder(X_eval_tensor).cpu().numpy()
from sklearn.metrics import accuracy_score, classification_report

y_pred = rf.predict(eval_embeddings)

print("Accuracy:", accuracy_score(y_eval, y_pred))
print("Classification Report:\n", classification_report(y_eval, y_pred))


Accuracy: 0.9586902924847904
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.93      0.96     27352
           1       0.94      0.98      0.96     29027

    accuracy                           0.96     56379
   macro avg       0.96      0.96      0.96     56379
weighted avg       0.96      0.96      0.96     56379

