# Neuronales Netz

In [624]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

import math
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

import matplotlib.pyplot as plt

In [625]:
numOfFeaturesAtStart = 5

In [626]:
numOfFeaturesAddedToSections = 1

In [None]:
class SimplifiedModel(nn.Module):
    def __init__(self, n_layers = 3, hiddenLayerNumNodes = 64):
        super(SimplifiedModel, self).__init__()
        
        self.numLayers = n_layers

        self.fcIn = nn.Linear(numOfFeaturesAtStart + 12 * (19 + numOfFeaturesAddedToSections), 64)
        self.fcStart = nn.Linear(64, hiddenLayerNumNodes)
        self.fcInner = nn.Linear(hiddenLayerNumNodes,hiddenLayerNumNodes)
        self.fcEnd = nn.Linear(hiddenLayerNumNodes, 16)
        self.fc_out = nn.Linear(16, 1)     
        
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fcIn(x))
        x = self.relu(self.fcStart(x))

        for i in range(self.numLayers):
            x = self.relu(self.fcInner(x))

        x = self.relu(self.fcEnd(x))
        x = self.sigmoid(self.fc_out(x))        
        
        return x

In [628]:
from torchsummary import summary

summaryModel = SimplifiedModel().to(torch.device('cuda'))
summary(summaryModel, input_size=(1, numOfFeaturesAtStart + 12 * (19 + numOfFeaturesAddedToSections)))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                [-1, 1, 64]          15,744
              ReLU-2                [-1, 1, 64]               0
            Linear-3                [-1, 1, 64]           4,160
              ReLU-4                [-1, 1, 64]               0
            Linear-5                [-1, 1, 64]           4,160
              ReLU-6                [-1, 1, 64]               0
            Linear-7                [-1, 1, 64]           4,160
              ReLU-8                [-1, 1, 64]               0
            Linear-9                [-1, 1, 64]           4,160
             ReLU-10                [-1, 1, 64]               0
          Dropout-11                [-1, 1, 64]               0
           Linear-12                [-1, 1, 16]           1,040
             ReLU-13                [-1, 1, 16]               0
           Linear-14                 [-

In [629]:
df_data = pd.read_json('datasets/data.json', orient='records')
df_control = pd.read_json('datasets/control.json', orient='records')
df = pd.concat([df_data, df_control], ignore_index=True)

In [630]:
print(len(df_data))

1700


In [631]:
print(len(df_control))

1726


In [632]:
X = df['sections'].values
numSections = df['numSections'].values
fullEntropy = df['fullEntropy'].values
minEntropy = df['minEntropy'].values
maxEntropy = df['maxEntropy'].values
X_entropyList = df['entropyList'].values
y = df['label'].values
y = [label for label in y] * 2

In [633]:
X = X.tolist()
numSections = numSections.tolist()
fullEntropy = fullEntropy.tolist()
minEntropy = minEntropy.tolist()
maxEntropy = maxEntropy.tolist()

In [634]:
averageChunkEntropy = [0] * len(X_entropyList)
for i in range(len(X_entropyList)):
    averageChunkEntropy[i] = sum(X_entropyList[i]) / len(X_entropyList[i])

In [635]:
chunkEntropyVariance = [0] * len(X_entropyList)
for i in range(len(X_entropyList)):
    chunkEntropyVariance[i] = np.var(X_entropyList[i])

In [636]:
fileLength = [0] * len(X_entropyList)
for i in range(len(X_entropyList)):
    fileLength[i] = 2048 * len(X_entropyList)

In [637]:
numSectionsAboveCertainEntropy = [0] * len(X)
for i in range(len(X)):
    numSectionsAboveCertainEntropy[i] = sum([1 for item in X[i] if item[-1] > 7.5])

In [638]:
for i in range(len(X)):
        for entry in X[i]:
                entry[-2] /= 1e+8
                entry[-1] /= 8
                entry.append(entry[-1] / (fullEntropy[i] + 1e-8 / 8))
        # X[i].insert(0,numSections[i] / 12)
        X[i].insert(0,chunkEntropyVariance[i] / 8)
        X[i].insert(0,averageChunkEntropy[i] / 8)
        X[i].insert(0,fullEntropy[i] / 8)
        X[i].insert(0,minEntropy[i] / 8)
        X[i].insert(0,maxEntropy[i] / 8)
        # X[i].insert(0,numSectionsAboveCertainEntropy[i] / 12)
        # X[i].insert(0, fileLength[i] / 1e+8)

In [640]:
import random
X2 = X
entropy_range = 0.5 / 8
byte_range = 250 / 1e+8
for i in range(len(X2)):
    for e, entry in enumerate(X2[i]):
        if e <= numOfFeaturesAtStart - 1:
            continue
        entry[-3] += random.uniform(-byte_range, byte_range)
        entry[-2] += random.uniform(-entropy_range, entropy_range)
        entry[-1] += random.uniform(-entropy_range, entropy_range)
    for e in range(numOfFeaturesAtStart):
        if e == -1:
            continue
        if e == -1:
            X2[i][e] += random.uniform(-byte_range, byte_range)
            continue
        X2[i][e] += random.uniform(-entropy_range, entropy_range)
    # X2[i][4] += random.uniform(-entropy_range, entropy_range)

In [642]:
X += X2

In [644]:
for i in range(len(X)):
    X[i] = X[i][0:numOfFeaturesAtStart] + [item for sublist in X[i][numOfFeaturesAtStart:] for item in sublist] ## X[i][0:5] + 

In [646]:
batch_size = 64

In [647]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)

X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [648]:
def generateNewSplit():
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32)

    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

# Training the model

In [649]:
withOptuna = False
testMore = True

In [650]:
def calculate_recall(y_true, y_pred):
    true_positives = (y_true * y_pred).sum()
    false_negatives = (y_true * (1 - y_pred)).sum()
    
    recall = true_positives / (true_positives + false_negatives + 1e-8)
    return recall.item()

In [651]:
def plot_loss_accuracy(losses, accuracies, val_losses, val_accuracies, recalls, val_recalls):
    epochs = range(1, len(losses) + 1)

    plt.figure(figsize=(15, 5))

    plt.subplot(1, 3, 1)
    plt.plot(epochs, losses, label='Loss', color='slategrey')
    plt.plot(epochs, val_losses, label='Validation Loss', color='cornflowerblue')
    plt.title('Training and Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.grid(True)

    plt.subplot(1, 3, 2)
    plt.plot(epochs, accuracies, label='Accuracy', color='seagreen')
    plt.plot(epochs, val_accuracies, label='Validation Accuracy', color='lime')
    plt.title('Training Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.grid(True)

    plt.subplot(1, 3, 3)
    plt.plot(epochs, recalls, label='Recall', color='seagreen')
    plt.plot(epochs, val_recalls, label='Validation Recall', color='lime')
    plt.title('Training Recall')
    plt.xlabel('Epoch')
    plt.ylabel('Recall')
    plt.grid(True)

    plt.tight_layout()
    plt.show()

In [652]:
def train(model, criterion, optimizer, scheduler, device, num_epochs=10, trial=None):
    
    all_losses = []
    all_accuracies = []
    all_val_losses = []
    all_val_accuracies = []
    all_recalls = []
    all_val_recalls = []

    for epoch in range(num_epochs):

        fold_results = []

        for fold, (train_index, val_index) in enumerate(kf.split(X_train_tensor)):

            X_train, X_val = X_train_tensor[train_index], X_train_tensor[val_index]
            y_train, y_val = y_train_tensor[train_index], y_train_tensor[val_index]

            train_dataset = TensorDataset(X_train, y_train)
            val_dataset = TensorDataset(X_val, y_val)
            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

            model.train()
            
            running_loss = 0.0
            correct_predictions = 0
            total_samples = 0
            fold_recall = 0.0

            for inputs, targets in train_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                optimizer.zero_grad()

                outputs = model(inputs)

                loss = criterion(outputs.squeeze(), targets.float())
                loss.backward()
                optimizer.step()

                running_loss += loss.item()

                predicted = (outputs.squeeze() > 0.5).float()
                correct_predictions += (predicted == targets).sum().item()
                total_samples += targets.size(0)

                fold_recall += calculate_recall(targets, predicted)

            scheduler.step()

            model.eval()

            val_loss = 0
            correct_val_predictions = 0
            fold_val_recall = 0
            total_val_samples = 0
            with torch.no_grad():
                for inputs, targets in val_loader:
                    inputs, targets = inputs.to(device), targets.to(device)
                    outputs = model(inputs)

                    loss = criterion(outputs.squeeze(), targets.float())
                    val_loss += loss.item()

                    predicted = (outputs.squeeze() > 0.5).float()
                    correct_val_predictions += (predicted == targets).sum().item()
                    total_val_samples += targets.size(0)

                    fold_val_recall += calculate_recall(targets, predicted)


            fold_loss = running_loss / len(train_loader)
            fold_accuracy = correct_predictions / total_samples
            fold_recall /= len(train_loader)
            fold_val_loss = val_loss / len(val_loader)
            fold_val_accuracy = correct_val_predictions / total_val_samples
            fold_val_recall /= len(val_loader)

            fold_combined_accuracy = (correct_predictions + correct_val_predictions) / (total_samples + total_val_samples)

            fold_results.append((fold_loss, fold_accuracy, fold_recall, fold_val_loss, fold_val_accuracy, fold_val_recall, fold_combined_accuracy))

        all_losses.append(np.mean([result[0] for result in fold_results]))
        all_accuracies.append(np.mean([result[1] for result in fold_results]))
        all_recalls.append(np.mean([result[2] for result in fold_results]))
        all_val_losses.append(np.mean([result[3] for result in fold_results]))
        all_val_accuracies.append(np.mean([result[4] for result in fold_results]))
        all_val_recalls.append(np.mean([result[5] for result in fold_results]))

        if withOptuna and trial:
            trial.report(np.mean([result[6] for result in fold_results]), epoch)
            
            if trial.should_prune():
                # raise optuna.TrialPruned()
                pass

        if not withOptuna:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {fold_loss:.4f}, Recall: {fold_recall*100:.2f}%, Accuracy: {fold_accuracy*100:.2f}%, Val Loss: {fold_val_loss:.4f}, Val Recall: {fold_val_recall*100:.2f}%, Val Accuracy: {fold_val_accuracy*100:.2f}%, Recall Spread: {(fold_recall-fold_val_recall) * 100:.2f}', end='\r')

    if not withOptuna and not testMore:
        plot_loss_accuracy(all_losses, all_accuracies, all_val_losses, all_val_accuracies, all_recalls, all_val_recalls)

In [653]:
if not withOptuna and not testMore:
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = SimplifiedModel(2, 64).to(device)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.002, weight_decay=5e-6)
    scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)
    train(model, criterion, optimizer, scheduler, device, 50)

In [654]:
def test(model, test_loader, criterion, device, printResult=True):
    model.eval() 
    total_loss = 0.0
    correct_predictions = 0
    total_samples = 0
    recall = 0.0

    with torch.no_grad(): 
        for inputs, targets in test_loader:

            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            
            loss = criterion(outputs.squeeze(), targets.float())
            total_loss += loss.item()
            
            predicted = (outputs.squeeze() > 0.5).float() 
            correct_predictions += (predicted == targets).sum().item()
            total_samples += targets.size(0)

            recall += calculate_recall(targets, predicted)

    average_loss = total_loss / len(test_loader)
    accuracy = (correct_predictions / total_samples)
    recall /= len(test_loader)

    if printResult:
        print(f'Test Loss: {average_loss:.4f}, Test Recall: {recall*100:.2f}%, Test Accuracy: {accuracy*100:.2f}%', end='\r\n')

    return average_loss, accuracy, recall

In [655]:
if not withOptuna and not testMore:
    test(model, test_loader, criterion, device)

In [656]:
""" if testMore and not withOptuna:

    resultsAccuracy = []
    resultsRecall = []

    for i in range(10):

        print("Trial " + str(i + 1), end='\n')

        generateNewSplit()

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model = SimplifiedModel(4, 128).to(device)
        criterion = nn.BCELoss()
        optimizer = optim.Adam(model.parameters(), lr=0.002, weight_decay=3.7e-6)
        scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.978)
        train(model, criterion, optimizer, scheduler, device, 50)

        results = test(model, test_loader, criterion, device)
        resultsAccuracy.append(results[1])
        resultsRecall.append(results[2])

    resultsDataFrame = pd.DataFrame({'Accuracy': resultsAccuracy, 'Recall': resultsRecall})
    resultsDataFrame.to_json("./result.json")

    print("Accuracy:", f"{sum(resultsAccuracy) / 10 * 100:.2f}%", "Recall:", f"{sum(resultsRecall) / 10 * 100:.2f}%") """

' if testMore and not withOptuna:\n\n    resultsAccuracy = []\n    resultsRecall = []\n\n    for i in range(10):\n\n        print("Trial " + str(i + 1), end=\'\n\')\n\n        generateNewSplit()\n\n        device = torch.device(\'cuda\' if torch.cuda.is_available() else \'cpu\')\n        model = SimplifiedModel(4, 128).to(device)\n        criterion = nn.BCELoss()\n        optimizer = optim.Adam(model.parameters(), lr=0.002, weight_decay=3.7e-6)\n        scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.978)\n        train(model, criterion, optimizer, scheduler, device, 50)\n\n        results = test(model, test_loader, criterion, device)\n        resultsAccuracy.append(results[1])\n        resultsRecall.append(results[2])\n\n    resultsDataFrame = pd.DataFrame({\'Accuracy\': resultsAccuracy, \'Recall\': resultsRecall})\n    resultsDataFrame.to_json("./result.json")\n\n    print("Accuracy:", f"{sum(resultsAccuracy) / 10 * 100:.2f}%", "Recall:", f"{sum(resultsRecall) / 10 * 

In [657]:
if testMore and not withOptuna:

    resultsAccuracy = []
    resultsRecall = []

    for i in range(50):

        print("Trial " + str(i + 1), end='\n')

        generateNewSplit()

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model = SimplifiedModel(3, 64).to(device)
        criterion = nn.BCELoss()
        optimizer = optim.Adam(model.parameters(), lr=0.00225, weight_decay=3.22e-5)
        scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9915)
        train(model, criterion, optimizer, scheduler, device, 50)

        results = test(model, test_loader, criterion, device)
        resultsAccuracy.append(results[1])
        resultsRecall.append(results[2])

    resultsDataFrame = pd.DataFrame({'Accuracy': resultsAccuracy, 'Recall': resultsRecall})
    resultsDataFrame.to_json("./result.json")

    print("Accuracy:", f"{sum(resultsAccuracy) / 10 * 100:.2f}%", "Recall:", f"{sum(resultsRecall) / 10 * 100:.2f}%")

Trial 1
Test Loss: 0.5491, Test Recall: 98.01%, Test Accuracy: 97.96%0%, Val Loss: 0.0001, Val Recall: 100.00%, Val Accuracy: 100.00%, Recall Spread: 0.00
Trial 2
Test Loss: 0.3147, Test Recall: 98.42%, Test Accuracy: 98.10%0%, Val Loss: 0.0001, Val Recall: 100.00%, Val Accuracy: 100.00%, Recall Spread: 0.00
Trial 3
Test Loss: 0.4078, Test Recall: 98.64%, Test Accuracy: 98.69%, Val Loss: 0.0002, Val Recall: 100.00%, Val Accuracy: 100.00%, Recall Spread: -0.05
Trial 4
Test Loss: 0.2800, Test Recall: 97.65%, Test Accuracy: 97.88%, Val Loss: 0.0019, Val Recall: 100.00%, Val Accuracy: 100.00%, Recall Spread: -0.16
Trial 5
Test Loss: 0.2808, Test Recall: 98.37%, Test Accuracy: 98.54%0%, Val Loss: 0.0004, Val Recall: 100.00%, Val Accuracy: 100.00%, Recall Spread: 0.00
Trial 6
Test Loss: 0.3791, Test Recall: 97.63%, Test Accuracy: 98.25%, Val Loss: 0.0033, Val Recall: 100.00%, Val Accuracy: 100.00%, Recall Spread: -0.38
Trial 7
Test Loss: 0.4650, Test Recall: 97.28%, Test Accuracy: 96.64%, Va

# Optuna Hyperparameter Optimization

In [None]:
import optuna
import optunahub

def objective(trial):

    generateNewSplit()
    
    lr = trial.suggest_float('learning_rate', 0.001, 0.003)
    nHiddenLayers = trial.suggest_int('n_layers', 1, 6)
    weightDecay = trial.suggest_float('weigth_decay', 0, 1e-4)
    gamma = trial.suggest_float('gamma', 0.975, 1)
    numNodes = trial.suggest_int('n_nodes', 32, 128, step=16)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = SimplifiedModel(nHiddenLayers, numNodes).to(device)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weightDecay)
    scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=gamma)

    train(model, criterion, optimizer, scheduler, device, 40, trial)
    result = test(model, test_loader, criterion, device)
    accuracies = result[1]
    trial.set_user_attr("recall", result[2])

    return accuracies ## , sum_recalls / num_of_cycles

In [None]:
if withOptuna:
    name = "final Model"

    if name in optuna.get_all_study_names(storage="sqlite:///db1.sqlite3"):
        study = optuna.load_study(study_name=name, storage="sqlite:///db1.sqlite3")
    else: 
        study = optuna.create_study(direction='maximize', storage="sqlite:///db1.sqlite3", study_name=name, sampler=optuna.samplers.TPESampler(n_startup_trials=5))

    study.optimize(objective, n_trials=150)

# Save model

In [660]:
# torch.save(model, 'entire_model.pth')

In [661]:
# For loading:
# model = torch.load('entire_model.pth')
# model.eval()