Time series classification of anomaly detections
====================================================

In [16]:
import torch

import copy
import numpy as np
import pandas as pd

from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split

from torch import nn, optim

import torch.nn.functional as F
from sklearn.preprocessing import OneHotEncoder



In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x20716b9b9f0>

Data Mining
==============

In [18]:
data = pd.read_csv('benign.csv')
df = pd.DataFrame(data)
print(df.head())

   No.                                      Info      Time  \
0    1  Who has 192.168.1.207? Tell 192.168.1.81  0.000000   
1    2                      M-SEARCH * HTTP/1.1   0.614779   
2    3    Who has 192.168.1.6? Tell 192.168.1.81  0.614786   
3    4  Who has 192.168.1.207? Tell 192.168.1.81  1.228787   
4    5                      M-SEARCH * HTTP/1.1   1.536441   

                  Source      Destination Protocol  Length  
0  PCSSystemtec_45:5c:08        Broadcast      ARP      60  
1            192.168.1.9  239.255.255.250     SSDP     218  
2  PCSSystemtec_45:5c:08        Broadcast      ARP      60  
3  PCSSystemtec_45:5c:08        Broadcast      ARP      60  
4            192.168.1.9  239.255.255.250     SSDP     218  


In [19]:
unique_protocol_count = df['Protocol'].unique()
print(unique_protocol_count)

['ARP' 'SSDP' 'UDP' 'SSL' 'TCP' 'ICMPv6' 'TLSv1.2' 'STP' 'SIGCOMP' 'DNS'
 'DHCP' 'NTP' 'CLASSIC-STUN' 'LLDP' 'TLSv1.3' 'MDNS' 'HTTP' 'ICMP'
 'DHCPv6' 'CDP' 'LLMNR' 'LLC' 'HTTP/XML' 'QUIC' 'IGMPv3' 'NBNS' 'BROWSER'
 'GQUIC' 'IGMPv2' 'TPLINK-SMARTHOME/JSON' 'OCSP']


In [20]:
df['Source'] = df['Source'].str.replace('.', '')
df['Destination'] = df['Destination'].str.replace('.', '')

# Mostrar las primeras filas del DataFrame con la columna 'Source' modificada
print(df.head())

   No.                                      Info      Time  \
0    1  Who has 192.168.1.207? Tell 192.168.1.81  0.000000   
1    2                      M-SEARCH * HTTP/1.1   0.614779   
2    3    Who has 192.168.1.6? Tell 192.168.1.81  0.614786   
3    4  Who has 192.168.1.207? Tell 192.168.1.81  1.228787   
4    5                      M-SEARCH * HTTP/1.1   1.536441   

                  Source   Destination Protocol  Length  
0  PCSSystemtec_45:5c:08     Broadcast      ARP      60  
1               19216819  239255255250     SSDP     218  
2  PCSSystemtec_45:5c:08     Broadcast      ARP      60  
3  PCSSystemtec_45:5c:08     Broadcast      ARP      60  
4               19216819  239255255250     SSDP     218  


  df['Source'] = df['Source'].str.replace('.', '')
  df['Destination'] = df['Destination'].str.replace('.', '')


In [21]:
protocol_mapping = {
    'TCP': 1,
    'UDP': 2,
    'HTTP': 3,
    'HTTPS': 4,
    'FTP': 5,
    'SMTP': 6,
    'POP3': 7,
    'IMAP': 8,
    'SNMP': 9,
    'DHCP': 10,
    'DNS': 11,
    'ICMP':12
}

df['Protocol'] = df['Protocol'].replace(protocol_mapping)
df = df.drop(columns=['Info','No.'])

# Mostrar las primeras filas del DataFrame con la columna 'Protocol' modificada
print(df.head())

       Time                 Source   Destination Protocol  Length
0  0.000000  PCSSystemtec_45:5c:08     Broadcast      ARP      60
1  0.614779               19216819  239255255250     SSDP     218
2  0.614786  PCSSystemtec_45:5c:08     Broadcast      ARP      60
3  1.228787  PCSSystemtec_45:5c:08     Broadcast      ARP      60
4  1.536441               19216819  239255255250     SSDP     218


We normalize our dataset
=====================

In [22]:
from sklearn.preprocessing import MinMaxScaler

# Suponiendo que 'df' es tu DataFrame y 'columns_to_normalize' es una lista de columnas a normalizar
columns_to_normalize = ['Time', 'Source', 'Destination','Length']  # Lista de columnas a normalizar

scaler = MinMaxScaler()  # Inicializar el MinMaxScaler

# Normalizar las columnas seleccionadas
df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])

# Verificar el resultado
print(df.head())

ValueError: could not convert string to float: 'PCSSystemtec_45:5c:08'

We save the final dataset
============================

In [None]:
df.to_csv('clean_dataset.csv', index=False)

we split our dataset into training and validation data
=========================================================

In [None]:
train_df, val_df = train_test_split(
  df,
  test_size=0.15,
  random_state=RANDOM_SEED
)

val_df, test_df = train_test_split(
  df,
  test_size=0.33, 
  random_state=RANDOM_SEED
)

In [None]:
def create_dataset(df):

  sequences = df.astype(np.float32).to_numpy().tolist()

  dataset = [torch.tensor(s).unsqueeze(1).float() for s in sequences]

  n_features = torch.stack(dataset).shape[2]
  seq_len=25

  return dataset, seq_len, n_features

In [None]:
train_dataset, seq_len, n_features = create_dataset(train_df)
val_dataset, _, _ = create_dataset(val_df)
test_normal_dataset, _, _ = create_dataset(test_df)
#test_anomaly_dataset, _, _ = create_dataset(anomaly_df)

NameError: name 'train_df' is not defined

We create the class of the Autoencoder with an Lstm
======================================================

First the encoder

In [None]:
class Encoder(nn.Module):

  def __init__(self, seq_len, n_features, embedding_dim=64):
    super(Encoder, self).__init__()

    self.seq_len, self.n_features = seq_len, n_features
    self.embedding_dim, self.hidden_dim = embedding_dim, 2 * embedding_dim

    self.rnn1 = nn.LSTM(
      input_size=n_features,
      hidden_size=self.hidden_dim,
      num_layers=1,
      batch_first=True
    )
    
    self.rnn2 = nn.LSTM(
      input_size=self.hidden_dim,
      hidden_size=embedding_dim,
      num_layers=1,
      batch_first=True
    )

  def forward(self, x):
    x = x.reshape((1, self.seq_len, self.n_features))

    x, (_, _) = self.rnn1(x)
    x, (hidden_n, _) = self.rnn2(x)

    return hidden_n.reshape((self.n_features, self.embedding_dim))

Now the decoder

In [None]:
class Decoder(nn.Module):

  def __init__(self, seq_len, input_dim=64, n_features=1):
    super(Decoder, self).__init__()

    self.seq_len, self.input_dim = seq_len, input_dim
    self.hidden_dim, self.n_features = 2 * input_dim, n_features

    self.rnn1 = nn.LSTM(
      input_size=input_dim,
      hidden_size=input_dim,
      num_layers=1,
      batch_first=True
    )

    self.rnn2 = nn.LSTM(
      input_size=input_dim,
      hidden_size=self.hidden_dim,
      num_layers=1,
      batch_first=True
    )

    self.output_layer = nn.Linear(self.hidden_dim, n_features)

  def forward(self, x):
    x = x.repeat(self.seq_len, self.n_features)
    x = x.reshape((self.n_features, self.seq_len, self.input_dim))

    x, (hidden_n, cell_n) = self.rnn1(x)
    x, (hidden_n, cell_n) = self.rnn2(x)
    x = x.reshape((self.seq_len, self.hidden_dim))

    return self.output_layer(x)

And finally the Encoder-Decoder
================================

In [None]:
class RecurrentAutoencoder(nn.Module):

  def __init__(self, seq_len, n_features, embedding_dim=14):
    super(RecurrentAutoencoder, self).__init__()

    self.encoder = Encoder(seq_len, n_features, embedding_dim).to(device)
    self.decoder = Decoder(seq_len, embedding_dim, n_features).to(device)

  def forward(self, x):
    x = self.encoder(x)
    x = self.decoder(x)

    return x

In [None]:
model = RecurrentAutoencoder(seq_len, n_features)
model = model.to(device)

Training
===========
We train our model to minimize the loss function at the validation stagee

In [None]:
def train_model(model, train_dataset, val_dataset, n_epochs):
  optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
  criterion = nn.L1Loss(reduction='sum').to(device)
  history = dict(train=[], val=[])

  best_model_wts = copy.deepcopy(model.state_dict())
  best_loss = 10000.0
  
  for epoch in range(1, n_epochs + 1):
    model = model.train()

    train_losses = []
    for seq_true in train_dataset:
      optimizer.zero_grad()

      seq_true = seq_true.to(device)
      seq_pred = model(seq_true)

      loss = criterion(seq_pred, seq_true)

      loss.backward()
      optimizer.step()

      train_losses.append(loss.item())

    val_losses = []
    model = model.eval()
    with torch.no_grad():
      for seq_true in val_dataset:

        seq_true = seq_true.to(device)
        seq_pred = model(seq_true)

        loss = criterion(seq_pred, seq_true)
        val_losses.append(loss.item())

    train_loss = np.mean(train_losses)
    val_loss = np.mean(val_losses)

    history['train'].append(train_loss)
    history['val'].append(val_loss)

    if val_loss < best_loss:
      best_loss = val_loss
      best_model_wts = copy.deepcopy(model.state_dict())

    print(f'Epoch {epoch}: train loss {train_loss} val loss {val_loss}')

  model.load_state_dict(best_model_wts)
  return model.eval(), history

In [None]:
model, history = train_model(
  model, 
  train_dataset, 
  val_dataset, 
  n_epochs=15
)

Epoch 1: train loss 73706423.82356195 val loss 74206237.67134832
Epoch 2: train loss 73706389.64767699 val loss 74206209.70119382
Epoch 3: train loss 73706363.18611726 val loss 74206182.99964887
Epoch 4: train loss 73706338.0721792 val loss 74206157.52844101
Epoch 5: train loss 73706314.51659292 val loss 74206132.2082163
Epoch 6: train loss 73706291.22400442 val loss 74206111.15660113
Epoch 7: train loss 73706266.55945796 val loss 74206089.45189607
Epoch 8: train loss 73706243.46266593 val loss 74206065.00842696
Epoch 9: train loss 73706220.61144912 val loss 74206040.85147472
Epoch 10: train loss 73706198.35702434 val loss 74206016.62886237
Epoch 11: train loss 73706177.53207965 val loss 74205999.86200842
Epoch 12: train loss 73706159.20353982 val loss 74205985.18820225
Epoch 13: train loss 73706144.49363938 val loss 74205968.25983146
Epoch 14: train loss 73706131.20271018 val loss 74205956.47155899
Epoch 15: train loss 73706118.69800885 val loss 74205945.33602528


In [None]:
MODEL_PATH = 'model.pth'

torch.save(model, MODEL_PATH)