In [157]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import accuracy_score, balanced_accuracy_score, recall_score, precision_score, confusion_matrix
import numpy as np
import pandas as pd
from utils import specificity_score, negative_prediction_value_score, gmean_score, informedness_score
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Data Loading

In [158]:
# Load the data
df = pd.read_csv('./data/HTRU_2.csv', header=None)
df.columns = ['IpMean', 'IpDev', 'IpKurt','IpSkew', 'DMMean', 'DMDev', 'DMKurt', 'DMSkew', 'Class']

# Split the data into features and target
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values


# Define the number of experiments
n_runs = 100

# Create a ShuffleSplit instance
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Display the df
df.head()

Unnamed: 0,IpMean,IpDev,IpKurt,IpSkew,DMMean,DMDev,DMKurt,DMSkew,Class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


# Model definition

In [159]:
class SimpleNN(nn.Module):
    def __init__(self,input_dim, output_dim):
        super(SimpleNN, self).__init__()
        self.input_layer = nn.Linear(input_dim,128)
        self.hidden_layer1  = nn.Linear(128,64)
        self.output_layer   = nn.Linear(64,output_dim)
        self.relu = nn.ReLU()

    def forward(self,x):
        out =  self.relu(self.input_layer(x))
        out =  self.relu(self.hidden_layer1(out))
        out =  self.output_layer(out)
        return out

class ClassicCNN(nn.Module):
    def __init__(self, num_features, output_dim):
        super(ClassicCNN, self).__init__()
        self.conv1d = nn.Conv1d(in_channels=num_features, out_channels=128, kernel_size=1)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(128, 64)
        self.fc2 = nn.Linear(64, output_dim)

    def forward(self, x):
        x = x.view(-1, 8, 1)
        x = self.conv1d(x)
        x = self.relu(x)
        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x


input_dim  = 8 # number of features
output_dim = 2 # binary classification, to be change for sigmoid ?


In [160]:
scaler = StandardScaler() # TBD if we need it

#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)


# Convert to PyTorch tensors
X_train = torch.FloatTensor(X_train)
X_test = torch.FloatTensor(X_test)
y_train = torch.LongTensor(y_train)
y_test = torch.LongTensor(y_test)


# Define the number of experiments
n_runs = 50

# Initialize lists to store the scores
scores = {
    'accuracy': [],
    'balanced_accuracy': [],
    'recall': [],
    'specificity': [],
    'precision': [],
    'npv': [],
    'gmean': [],
    'informedness': []
}

# Training CNN

In [161]:


ss = ShuffleSplit(n_splits=n_runs, train_size=60, test_size=120)

for train_index, test_index in ss.split(X):
    def train_network(model,optimizer,criterion,X_train,y_train,X_test,y_test,num_epochs,train_losses,test_losses):
        for epoch in range(num_epochs):
            #clear out the gradients from the last step loss.backward()
            optimizer.zero_grad()
            
            #forward feed
            output_train = model(X_train)

            #calculate the loss
            loss_train = criterion(output_train, y_train)
            
            #backward propagation: calculate gradients
            loss_train.backward()

            #update the weights
            optimizer.step()
            
            output_test = model(X_test)
            loss_test = criterion(output_test,y_test)

            train_losses[epoch] = loss_train.item()
            test_losses[epoch] = loss_test.item()

            # accuracy
            _, predicted_train = torch.max(output_train, 1)
            _, predicted_test = torch.max(output_test, 1)

    num_epochs = 50
    train_losses = np.zeros(num_epochs)
    test_losses  = np.zeros(num_epochs)
    model = ClassicCNN(input_dim, output_dim)

    learning_rate = 0.01
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)

    train_network(model,optimizer,criterion,X_train,y_train,X_test,y_test,num_epochs,train_losses,test_losses)

    # Predict the test set
    output_test = model(X_test)
    _, predicted_test = torch.max(output_test, 1)

    # Calculate the scores
    scores['accuracy'].append(accuracy_score(y_test, predicted_test))
    scores['balanced_accuracy'].append(balanced_accuracy_score(y_test, predicted_test))
    scores['recall'].append(recall_score(y_test, predicted_test))
    scores['specificity'].append(specificity_score(y_test, predicted_test))
    scores['precision'].append(precision_score(y_test, predicted_test))
    scores['npv'].append(negative_prediction_value_score(y_test, predicted_test))
    scores['gmean'].append(gmean_score(y_test, predicted_test))
    scores['informedness'].append(informedness_score(y_test, predicted_test))

# Prediction

In [162]:
# Print the scores
for metric, values in scores.items():
    mean_value = np.mean(values)
    std_value = np.std(values)
    print(f"{metric.capitalize()}: {mean_value:.3f} ± {std_value:.3f}")

Accuracy: 0.971 ± 0.007
Balanced_accuracy: 0.884 ± 0.023
Recall: 0.779 ± 0.051
Specificity: 0.990 ± 0.010
Precision: 0.893 ± 0.071
Npv: 0.978 ± 0.005
Gmean: 0.877 ± 0.027
Informedness: 0.768 ± 0.046
