In [4]:
import torch
from torch.utils.data import IterableDataset
import pandas as pd
import numpy as np
import glob
from typing import List
import json
from tqdm import tqdm
import matplotlib.pylab as plt
import random


In [5]:
class Intrusion_model(torch.nn.Module):
    """1-D convolutional classifier over a single-channel feature vector.

    The network is three ``Conv1d -> BatchNorm1d -> Dropout1d`` stages (the
    second one also average-pools by a factor of 3) followed by a flatten and
    one linear layer producing ``num_classes`` logits.

    Args:
        num_classes: Number of output logits.
        input_length: Length ``L`` of the input along the last axis. The
            default of 46 reproduces the original hard-coded classifier
            width of 1536 (= 128 channels * 12 positions).

    Raises:
        ValueError: If ``input_length`` is too short to survive the
            convolution / pooling stack.

    NOTE(review): no activation function is applied between the layers, so in
    eval mode the whole network is an affine map of its input. Confirm that
    this is intended before relying on the model's capacity.
    """

    def __init__(self, num_classes: int, input_length: int = 46) -> None:
        super().__init__()

        # Each kernel_size=3 convolution (stride 1, no padding) removes 2
        # positions; AvgPool1d(kernel_size=3) uses stride 3 and floors.
        pooled_length = (input_length - 4) // 3
        final_length = pooled_length - 2
        if final_length < 1:
            raise ValueError(
                f"input_length={input_length} is too short for this architecture "
                "(need at least 13)"
            )

        self.conv1 = torch.nn.Conv1d(in_channels=1, out_channels=32, kernel_size=3, groups=1)
        self.batch_norm1 = torch.nn.BatchNorm1d(num_features=32)
        self.drop_out1 = torch.nn.Dropout1d()

        self.conv2 = torch.nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, groups=1)
        self.batch_norm2 = torch.nn.BatchNorm1d(num_features=64)
        self.avgpl2 = torch.nn.AvgPool1d(kernel_size=3)
        self.drop_out2 = torch.nn.Dropout1d()

        self.conv3 = torch.nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3)
        self.batch_norm3 = torch.nn.BatchNorm1d(num_features=128)
        self.drop_out3 = torch.nn.Dropout1d()

        self.flatten = torch.nn.Flatten()
        # Flattened size is derived from the input length instead of being a
        # magic number; for input_length=46 this is 128 * 12 = 1536.
        self.linear = torch.nn.Linear(in_features=128 * final_length, out_features=num_classes)

    def forward(self, x):
        """Return class logits of shape ``(N, num_classes)`` for ``x`` of shape ``(N, 1, input_length)``."""
        # Stage 1: 1 -> 32 channels.
        x = self.conv1(x)
        x = self.batch_norm1(x)
        x = self.drop_out1(x)

        # Stage 2: 32 -> 64 channels, then shrink the length axis by 3x.
        x = self.conv2(x)
        x = self.batch_norm2(x)
        x = self.avgpl2(x)
        x = self.drop_out2(x)

        # Stage 3: 64 -> 128 channels.
        x = self.conv3(x)
        x = self.batch_norm3(x)
        x = self.drop_out3(x)

        # Classifier head.
        x = self.flatten(x)
        x = self.linear(x)

        return x

class MyDataset(IterableDataset):
    """Stream (features, labels) batches from a list of CSV files.

    Files are read one at a time, so only one file is held in memory. Every
    file must contain a ``"label"`` column; the feature matrix is taken from
    the first ``num_cols - 1`` columns.

    Args:
        list_dirs: Paths of the CSV files to read, in iteration order.
        label2index: Mapping from the string in the ``"label"`` column to an
            integer class index.
        num_cols: Total number of columns in each CSV (features + label).
        batch_size: Number of rows per yielded batch.
    """

    def __init__(self,
                 list_dirs: List[str],
                 label2index: dict,
                 num_cols: int,
                 batch_size: int
                 ) -> None:
        super().__init__()
        self.list_dirs = list_dirs
        self.label2index = label2index
        self.num_cols = num_cols
        self.batch_size = batch_size

    def mapping_normalized(self, column: pd.Series) -> pd.Series:
        """Min-max scale ``column`` to the range [0, 1].

        A column whose minimum equals its maximum is returned unchanged
        instead of being divided by zero.
        """
        max_value = column.max()
        min_value = column.min()
        value_range = max_value - min_value
        if value_range == 0:
            return column
        return (column - min_value) / value_range

    def __iter__(self):
        """Yield ``(features, labels)`` tensor pairs, file by file.

        ``features`` is a float32 tensor of shape ``(rows, num_cols - 1)``
        where ``rows <= batch_size``. ``labels`` is an int64 tensor of shape
        ``(rows,)``; when ``batch_size == 1`` it is squeezed to a scalar so
        that a wrapping ``DataLoader`` collates it into a 1-D target vector.

        Raises:
            AssertionError: If any feature value is NaN or infinite.
        """
        for file_path in self.list_dirs:
            file_data = pd.read_csv(file_path)

            # Normalize every non-label column per file; constant columns are
            # left as they are (x is a pd.Series).
            file_data = file_data.apply(
                lambda x: self.mapping_normalized(x)
                if x.name != "label" and x.max() != x.min() else x,
                axis=0,
            )

            labels = [self.label2index[str_label] for str_label in file_data["label"].tolist()]
            # float32 rather than float16: float16 overflows to inf above
            # 65504, which can hit constant columns that are not normalized.
            features = file_data.iloc[:, :self.num_cols - 1].values.astype(np.float32)
            # Every value must be finite (np.any would only fail when *all*
            # values are non-finite).
            assert np.all(np.isfinite(features)), f"contains inf, {features}"

            h, _ = features.shape

            # Split into batches; the label slice must cover the same rows as
            # the feature slice.
            for batch_idx in range(0, h, self.batch_size):
                batch_end = batch_idx + self.batch_size
                batch_features = features[batch_idx:batch_end, :]
                batch_labels = labels[batch_idx:batch_end]

                label_tensor = torch.tensor(batch_labels, dtype=torch.long)
                if self.batch_size == 1:
                    # Keep the scalar-label behavior relied on by DataLoader
                    # collation when each item is a single row.
                    label_tensor = label_tensor.squeeze(0)

                yield (torch.tensor(batch_features, dtype=torch.float32),
                       label_tensor)
    

In [6]:
if __name__ == "__main__":
    # Mapping from string class label to integer class index.
    with open("labels2index.json") as data:
        labels2indices = json.load(data)

    model = Intrusion_model(num_classes=len(labels2indices))

    # glob returns paths in arbitrary, filesystem-dependent order; sort them so
    # the index-based train/validation split below is reproducible.
    list_dirs = sorted(glob.glob("cic_iot_data/*.csv"))

    # train_size = int(len(list_dirs)*0.7)
    train_ids = [0, 2, 4, 1]
    if len(list_dirs) <= max(train_ids):
        raise FileNotFoundError(
            f"expected at least {max(train_ids) + 1} CSV files in cic_iot_data/, "
            f"found {len(list_dirs)}"
        )
    # Every file that is not used for training is used for validation.
    val_ids = [i for i in range(len(list_dirs)) if i not in train_ids]
    print("train ids: ", train_ids)
    print("val ids: ", val_ids)
    train_list_dirs = [list_dirs[i] for i in train_ids]
    val_list_dirs = [list_dirs[i] for i in val_ids]

    # Only the header is needed to count columns; avoid loading the whole file.
    w = pd.read_csv(list_dirs[0], nrows=0).shape[1]
    print("Num columns: ", w)

    # Each dataset item is a single (1, num_features) row. The DataLoader stacks
    # 8 of them into (8, 1, num_features), so the leading 1 becomes the Conv1d
    # channel axis expected by the model.
    train_dataset = MyDataset(list_dirs=train_list_dirs,
                              label2index=labels2indices,
                              num_cols=w,
                              batch_size=1)

    val_dataset = MyDataset(list_dirs=val_list_dirs,
                            label2index=labels2indices,
                            num_cols=w,
                            batch_size=1)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=8,
                                               num_workers=0)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=8,
                                             num_workers=0)

    optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01)
    loss_fn = torch.nn.CrossEntropyLoss()

    total_train_loss = []
    total_val_loss = []
    for epoch in range(5):
        # ---- training pass ----
        model.train()
        loss_in_batch = 0.0
        num_train_batches = 0
        for ith, (features, labels) in tqdm(enumerate(train_loader)):
            optimizer.zero_grad()
            outputs = model(features)
            loss = loss_fn(outputs, labels)

            loss.backward()
            optimizer.step()
            loss_in_batch += loss.item()
            num_train_batches += 1

        # Mean loss per batch; NaN (rather than a NameError) if the loader was empty.
        loss_in_batch = loss_in_batch / num_train_batches if num_train_batches else float("nan")
        total_train_loss.append(loss_in_batch)

        # ---- validation pass ----
        model.eval()

        val_loss_in_batch = 0.0
        num_val_batches = 0
        # No gradients are needed for validation; skipping graph construction
        # saves memory and time.
        with torch.no_grad():
            for val_ith, (val_features, val_labels) in tqdm(enumerate(val_loader)):
                val_outputs = model(val_features)
                val_loss = loss_fn(val_outputs, val_labels)
                val_loss_in_batch += val_loss.item()
                num_val_batches += 1

        val_loss_in_batch = val_loss_in_batch / num_val_batches if num_val_batches else float("nan")
        total_val_loss.append(val_loss_in_batch)

        print(f"Epoch: {epoch}, train loss: {loss_in_batch}, val loss {val_loss_in_batch}\n")

    # Labels are required for plt.legend() to have anything to show.
    plt.plot(total_train_loss, color="red", label="train loss")
    plt.plot(total_val_loss, color="green", label="val loss")
    plt.xlabel("epoch")
    plt.ylabel("mean loss per batch")
    plt.legend()
    plt.savefig("train_val_losses.png")

Num columns:  47


87617it [05:21, 272.19it/s]


TypeError: object of type 'MyDataset' has no len()