In [1]:
#torch and lightning (deep learning/model creation and training)
import torch
from torch import nn
import lightning as L
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint

#ray (tune/grid search)
from ray import tune
from ray.tune.schedulers import ASHAScheduler

#matplotlib and seaborn (plotting)
import matplotlib.pyplot as plt
import seaborn as sns

#pyOD (outlier detection)
from pyod.models.iforest import IForest

#mlflow (loggind/tracking)
import mlflow

#general purpose
import numpy as np
import pandas as pd
import os

#split data into train and test sets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold


In [2]:
df = pd.read_parquet('data.parquet')
df.head(4)

Unnamed: 0,X86_LOCAL_APIC,OPENVSWITCH,TEXTSEARCH_FSM,NETFILTER_XT_MATCH_TCPMSS,MPLS,NFC_HCI,NETFILTER_XT_MATCH_TIME,NET_MPLS_GSO,NFC_SHDLC,NETFILTER_XT_MATCH_U32,...,ARCH_SUPPORTS_INT128,SLABINFO,MICROCODE_AMD,ISDN_DRV_HISAX,CHARGER_BQ24190,SND_SOC_NAU8825,BH1750,NETWORK_FILESYSTEMS,active_options,perf
0,1,0,0,0,1,0,0,1,0,0,...,1,0,0,0,1,0,0,0,1435,50222120
1,1,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,1382,16660024
2,1,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,1626,43080856
3,1,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,0,1,0,2140,27261672


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92562 entries, 0 to 92561
Columns: 9469 entries, X86_LOCAL_APIC to perf
dtypes: int64(9469)
memory usage: 6.5 GB


# Float point transformation

+ Pass the data to float
+ Get used on pytorch models

In [4]:
df = df.astype('float32')

In [5]:
df.describe()

Unnamed: 0,X86_LOCAL_APIC,OPENVSWITCH,TEXTSEARCH_FSM,NETFILTER_XT_MATCH_TCPMSS,MPLS,NFC_HCI,NETFILTER_XT_MATCH_TIME,NET_MPLS_GSO,NFC_SHDLC,NETFILTER_XT_MATCH_U32,...,ARCH_SUPPORTS_INT128,SLABINFO,MICROCODE_AMD,ISDN_DRV_HISAX,CHARGER_BQ24190,SND_SOC_NAU8825,BH1750,NETWORK_FILESYSTEMS,active_options,perf
count,92562.0,92562.0,92562.0,92562.0,92562.0,92562.0,92562.0,92562.0,92562.0,92562.0,...,92562.0,92562.0,92562.0,92562.0,92562.0,92562.0,92562.0,92562.0,92562.0,92562.0
mean,0.99987,0.109635,0.083879,0.026663,0.325123,0.124111,0.026285,0.207612,0.105789,0.027355,...,0.999665,0.410049,0.225784,0.026717,0.372982,0.068246,0.268868,0.244182,1841.919922,49633680.0
std,0.011385,0.312504,0.277109,0.161146,0.46822,0.329876,0.160043,0.40553,0.307585,0.163083,...,0.018295,0.491674,0.418009,0.161297,0.483427,0.252278,0.443332,0.429588,583.99408,70737850.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,216.0,1168072.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1368.0,22617240.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1739.0,31444000.0
75%,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,2265.0,47771450.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,9294.0,1780630000.0


In [14]:
df.sample(5000)

Unnamed: 0,X86_LOCAL_APIC,OPENVSWITCH,TEXTSEARCH_FSM,NETFILTER_XT_MATCH_TCPMSS,MPLS,NFC_HCI,NETFILTER_XT_MATCH_TIME,NET_MPLS_GSO,NFC_SHDLC,NETFILTER_XT_MATCH_U32,...,ARCH_SUPPORTS_INT128,SLABINFO,MICROCODE_AMD,ISDN_DRV_HISAX,CHARGER_BQ24190,SND_SOC_NAU8825,BH1750,NETWORK_FILESYSTEMS,active_options,perf
52227,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1342.0,22997264.0
35731,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1771.0,20908824.0
13976,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2187.0,313029056.0
78378,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1800.0,31283936.0
55813,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,2392.0,41966824.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81179,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1699.0,33552336.0
72777,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,2551.0,32914312.0
57968,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1664.0,27830312.0
78728,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,2733.0,34316792.0


# sample and split the dataset

In [6]:
target_column = 'perf'

In [7]:
y = df[target_column]
X = df.drop(columns=[target_column])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [13]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)


NameError: name 'torch' is not defined

# Create a pytorch data loader

In [None]:
class CustomDataset(torch.Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

# Create the models, using pytorch lightningmodule

In [None]:
number_of_features = X_train.shape[1]

In [None]:
model_sequence_high_dropout = torch.nn.Sequential(nn.Linear(number_of_features, number_of_features/2),
                                     nn.ReLU(),
                                     nn.Dropout(0.8),
                                     nn.Linear(number_of_features/2, number_of_features/2),
                                     nn.ReLU(),
                                     nn.Dropout(0.5),
                                     nn.Linear(number_of_features/2, 1))

model_sequence_low_dropout = torch.nn.Sequential(nn.Linear(number_of_features, number_of_features/2),
                                     nn.ReLU(),
                                     nn.Dropout(0.3),
                                     nn.Linear(number_of_features/2, number_of_features/2),
                                     nn.ReLU(),
                                     nn.Dropout(0.2),
                                     nn.Linear(number_of_features/2, 1))

In [None]:
config = {
    "optimizer": tune.choice(["Adam", "AdamW"]),
    "loss": tune.choice(["MAPE", "MSE","SmoothL1Loss"])
}

In [None]:
class LightningTransformer(L.LightningModule):
    def __init__(self, model, optimizer_name="Adam", loss_name="MSELoss"):
        super(self).__init__()
        self.model = model
        self.optimizer_name = optimizer_name
        self.loss_name = loss_name
    
    def training_step(self, batch, batch_idx):
        x, _ = batch
        x = x.view(x.size(0), -1)
        z = self.model(x)
        loss = nn.functional.mse_loss(z, x)
        return loss
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer
    
    def get_optimizer(self):
        optimizers = {
            "Adam": torch.optim.Adam(self.parameters(), lr=0.001),
            "SGD": torch.optim.SGD(self.parameters(), lr=0.001),
            "RMSprop": torch.optim.RMSprop(self.parameters(), lr=0.001)
        }
        return optimizers[self.optimizer_name]

    def get_loss_function(self):
        loss_functions = {
            "MSELoss": nn.MSELoss(),
            "L1Loss": nn.L1Loss(),
            "CrossEntropyLoss": nn.CrossEntropyLoss()
        }
        return loss_functions[self.loss_name]