Time series classification of anomaly detections
====================================================

In [48]:
import torch

import copy
import numpy as np
import pandas as pd

from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split

from torch import nn, optim

import torch.nn.functional as F
from sklearn.preprocessing import OneHotEncoder



In [49]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x1930948ba10>

Data Mining
==============

In [50]:
data = pd.read_csv('test2.csv')
df = pd.DataFrame(data)
print(df.head())

   No.      Time      Source Destination Protocol  Length  \
0  135  1.183043  10.10.10.9     9.9.9.9      DNS      89   
1  138  1.192994     9.9.9.9  10.10.10.9      DNS     224   
2  429  2.984566  10.10.10.9     9.9.9.9      DNS      84   
3  430  2.995903     9.9.9.9  10.10.10.9      DNS     231   
4  601  3.632943  10.10.10.9     9.9.9.9      DNS      77   

                                                Info  
0  Standard query 0xe181 A nav.smartscreen.micros...  
1  Standard query response 0xe181 A nav.smartscre...  
2   Standard query 0x0ac6 A afdxtest.z01.azurefd.net  
3  Standard query response 0x0ac6 A afdxtest.z01....  
4          Standard query 0x2151 A k-ring.msedge.net  


In [51]:
df['Source'] = df['Source'].str.replace('.', '')
df['Destination'] = df['Destination'].str.replace('.', '')

# Mostrar las primeras filas del DataFrame con la columna 'Source' modificada
print(df.head())

   No.      Time   Source Destination Protocol  Length  \
0  135  1.183043  1010109        9999      DNS      89   
1  138  1.192994     9999     1010109      DNS     224   
2  429  2.984566  1010109        9999      DNS      84   
3  430  2.995903     9999     1010109      DNS     231   
4  601  3.632943  1010109        9999      DNS      77   

                                                Info  
0  Standard query 0xe181 A nav.smartscreen.micros...  
1  Standard query response 0xe181 A nav.smartscre...  
2   Standard query 0x0ac6 A afdxtest.z01.azurefd.net  
3  Standard query response 0x0ac6 A afdxtest.z01....  
4          Standard query 0x2151 A k-ring.msedge.net  


  df['Source'] = df['Source'].str.replace('.', '')
  df['Destination'] = df['Destination'].str.replace('.', '')


In [52]:
protocol_mapping = {
    'TCP': 1,
    'UDP': 2,
    'HTTP': 3,
    'HTTPS': 4,
    'FTP': 5,
    'SMTP': 6,
    'POP3': 7,
    'IMAP': 8,
    'SNMP': 9,
    'DHCP': 10,
    'DNS': 11,
    'ICMP':12
}

df['Protocol'] = df['Protocol'].replace(protocol_mapping)
df = df.drop(columns=['Info','No.'])

# Mostrar las primeras filas del DataFrame con la columna 'Protocol' modificada
print(df.head())

       Time   Source Destination  Length
0  1.183043  1010109        9999      89
1  1.192994     9999     1010109     224
2  2.984566  1010109        9999      84
3  2.995903     9999     1010109     231
4  3.632943  1010109        9999      77


we split our dataset into training and validation data
=========================================================

In [53]:
train_df, val_df = train_test_split(
  df,
  test_size=0.15,
  random_state=RANDOM_SEED
)

val_df, test_df = train_test_split(
  df,
  test_size=0.33, 
  random_state=RANDOM_SEED
)

In [54]:
def create_dataset(df):

  sequences = df.astype(np.float32).to_numpy().tolist()

  dataset = [torch.tensor(s).unsqueeze(1).float() for s in sequences]

  n_seq, seq_len, n_features = torch.stack(dataset).shape

  return dataset, seq_len, n_features

In [55]:
train_dataset, seq_len, n_features = create_dataset(train_df)
val_dataset, _, _ = create_dataset(val_df)
test_normal_dataset, _, _ = create_dataset(test_df)
#test_anomaly_dataset, _, _ = create_dataset(anomaly_df)

We create the class of the Autoencoder with an Lstm
======================================================

First the encoder

In [56]:
class Encoder(nn.Module):

  def __init__(self, seq_len, n_features, embedding_dim=64):
    super(Encoder, self).__init__()

    self.seq_len, self.n_features = seq_len, n_features
    self.embedding_dim, self.hidden_dim = embedding_dim, 2 * embedding_dim

    self.rnn1 = nn.LSTM(
      input_size=n_features,
      hidden_size=self.hidden_dim,
      num_layers=1,
      batch_first=True
    )
    
    self.rnn2 = nn.LSTM(
      input_size=self.hidden_dim,
      hidden_size=embedding_dim,
      num_layers=1,
      batch_first=True
    )

  def forward(self, x):
    x = x.reshape((1, self.seq_len, self.n_features))

    x, (_, _) = self.rnn1(x)
    x, (hidden_n, _) = self.rnn2(x)

    return hidden_n.reshape((self.n_features, self.embedding_dim))

Now the decoder

In [57]:
class Decoder(nn.Module):

  def __init__(self, seq_len, input_dim=64, n_features=1):
    super(Decoder, self).__init__()

    self.seq_len, self.input_dim = seq_len, input_dim
    self.hidden_dim, self.n_features = 2 * input_dim, n_features

    self.rnn1 = nn.LSTM(
      input_size=input_dim,
      hidden_size=input_dim,
      num_layers=1,
      batch_first=True
    )

    self.rnn2 = nn.LSTM(
      input_size=input_dim,
      hidden_size=self.hidden_dim,
      num_layers=1,
      batch_first=True
    )

    self.output_layer = nn.Linear(self.hidden_dim, n_features)

  def forward(self, x):
    x = x.repeat(self.seq_len, self.n_features)
    x = x.reshape((self.n_features, self.seq_len, self.input_dim))

    x, (hidden_n, cell_n) = self.rnn1(x)
    x, (hidden_n, cell_n) = self.rnn2(x)
    x = x.reshape((self.seq_len, self.hidden_dim))

    return self.output_layer(x)

And finally the Encoder-Decoder
================================

In [58]:
class RecurrentAutoencoder(nn.Module):

  def __init__(self, seq_len, n_features, embedding_dim=14):
    super(RecurrentAutoencoder, self).__init__()

    self.encoder = Encoder(seq_len, n_features, embedding_dim).to(device)
    self.decoder = Decoder(seq_len, embedding_dim, n_features).to(device)

  def forward(self, x):
    x = self.encoder(x)
    x = self.decoder(x)

    return x

In [59]:
model = RecurrentAutoencoder(seq_len, n_features)
model = model.to(device)

Training
===========
We train our model to minimize the loss function at the validation stagee

In [60]:
def train_model(model, train_dataset, val_dataset, n_epochs):
  optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
  criterion = nn.L1Loss(reduction='sum').to(device)
  history = dict(train=[], val=[])

  best_model_wts = copy.deepcopy(model.state_dict())
  best_loss = 10000.0
  
  for epoch in range(1, n_epochs + 1):
    model = model.train()

    train_losses = []
    for seq_true in train_dataset:
      optimizer.zero_grad()

      seq_true = seq_true.to(device)
      seq_pred = model(seq_true)

      loss = criterion(seq_pred, seq_true)

      loss.backward()
      optimizer.step()

      train_losses.append(loss.item())

    val_losses = []
    model = model.eval()
    with torch.no_grad():
      for seq_true in val_dataset:

        seq_true = seq_true.to(device)
        seq_pred = model(seq_true)

        loss = criterion(seq_pred, seq_true)
        val_losses.append(loss.item())

    train_loss = np.mean(train_losses)
    val_loss = np.mean(val_losses)

    history['train'].append(train_loss)
    history['val'].append(val_loss)

    if val_loss < best_loss:
      best_loss = val_loss
      best_model_wts = copy.deepcopy(model.state_dict())

    print(f'Epoch {epoch}: train loss {train_loss} val loss {val_loss}')

  model.load_state_dict(best_model_wts)
  return model.eval(), history

In [61]:
model, history = train_model(
  model, 
  train_dataset, 
  val_dataset, 
  n_epochs=15
)

Epoch 1: train loss 73706423.82356195 val loss 74206237.67134832
Epoch 2: train loss 73706389.64767699 val loss 74206209.70119382
Epoch 3: train loss 73706363.18611726 val loss 74206182.99964887
Epoch 4: train loss 73706338.0721792 val loss 74206157.52844101
Epoch 5: train loss 73706314.51659292 val loss 74206132.2082163
Epoch 6: train loss 73706291.22400442 val loss 74206111.15660113
Epoch 7: train loss 73706266.55945796 val loss 74206089.45189607
Epoch 8: train loss 73706243.46266593 val loss 74206065.00842696
Epoch 9: train loss 73706220.61144912 val loss 74206040.85147472
Epoch 10: train loss 73706198.35702434 val loss 74206016.62886237
Epoch 11: train loss 73706177.53207965 val loss 74205999.86200842
Epoch 12: train loss 73706159.20353982 val loss 74205985.18820225
Epoch 13: train loss 73706144.49363938 val loss 74205968.25983146
Epoch 14: train loss 73706131.20271018 val loss 74205956.47155899
Epoch 15: train loss 73706118.69800885 val loss 74205945.33602528
