In [None]:
# Enable IPython's autoreload extension; mode 2 picks up edits to imported
# modules live, so changes to project code do not require a kernel restart.
%load_ext autoreload
%autoreload 2

import sys
# Put the parent directory on the import path. Presumably this is what lets the
# project-local `loader` package (imported in the data cell) resolve - confirm.
sys.path.append('../')

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pickle
import time
import math
import pandas as pd
import torch
import torch.optim as optim

import matplotlib.pyplot as plt
import pyarrow as pa
import pyarrow.parquet as pq
from sklearn.model_selection import train_test_split
import tensorflow_addons as tfa

In [2]:
# Select the compute device: the first CUDA GPU when one is visible, else the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Report the selected hardware. torch.cuda.get_device_name() only accepts CUDA
# devices, so it must not be called when we fell back to the CPU.
if device.type == "cuda":
    print("Using: ", torch.cuda.get_device_name(device))
else:
    print("Using: ", "CPU")

Using:  NVIDIA GeForce RTX 3050 Ti Laptop GPU


# Prepare data

In [3]:
# Project-local transformers; their exact behaviour is defined outside this notebook.
from loader.transformers.cast_timestamp import cast_timestamp
from loader.transformers.drop_nontrain import drop_nontrain

# Read each raw parquet table and pass it through drop_nontrain
# (presumably removes columns that are not training features - confirm).
phishing = drop_nontrain(pq.read_table('../data/floor/phishing.parquet'))
benign = drop_nontrain(pq.read_table('../data/floor/benign.parquet'))

# Bring the benign table onto the phishing schema so both can be concatenated.
# NOTE(review): verify that Table.cast really realigns differing column orders.
benign = benign.cast(phishing.schema)

# Stack both classes into one table, then continue in pandas.
data = pa.concat_tables([phishing, benign])
df = cast_timestamp(data.to_pandas())
# Build the target vector and feature matrix, then create a stratified 80/20 split.

# Textual label -> integer class id (an unknown label raises KeyError).
class_map = {"benign:unknown": 0, "misp:phishing": 1}

labels = df['label'].apply(lambda raw_label: class_map[raw_label])  # y vector
features = df.drop(columns='label').copy()  # X matrix

# Fixed seed keeps the split reproducible; stratifying on the labels keeps the
# class ratio the same in both partitions.
split_parts = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=labels,
)
X_train, X_test, y_train, y_test = split_parts

def _stable_sigmoid(values):
    """Element-wise logistic function 1 / (1 + exp(-v)) for a float ndarray.

    exp() is only ever evaluated on non-positive numbers, so large-magnitude
    inputs cannot overflow (math.exp(-v) raises OverflowError for v < ~-709).
    """
    decay = np.exp(-np.abs(values))  # always within (0, 1]
    return np.where(values >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))


def _to_scaled_matrix(frame):
    """Convert a feature DataFrame into a sigmoid-scaled float64 ndarray.

    NaNs are replaced by 0 and the frame is cast to float64, which also turns
    booleans into 0.0 / 1.0. The explicit float cast matters: writing sigmoid
    values back into an integer array would silently truncate them.
    """
    dense = frame.fillna(0).to_numpy().astype(np.float64)
    return _stable_sigmoid(dense)


# Feature matrices: NaN -> 0, bool -> 0/1, then squash every value into (0, 1).
x_train = _to_scaled_matrix(X_train)
x_test = _to_scaled_matrix(X_test)

# Label vectors as plain numpy arrays (NaN -> 0 kept from the original pipeline).
y_train = y_train.fillna(0).to_numpy()
y_test = y_test.fillna(0).to_numpy()

print("Sigmoid scaling done")
print("Data ready")
        
        
        
        


Sigmoid scaling done
Data ready


# Network definition

In [4]:
class Net(nn.Module):
    """Fully connected binary classifier.

    Three ReLU hidden layers followed by a single sigmoid output unit, so the
    forward pass yields a probability in [0, 1] (suitable for nn.BCELoss).

    Args:
        in_features: Number of input features per sample. Defaults to 81, the
            width this notebook was written for.
        hidden_sizes: Widths of the three hidden layers, in order. Defaults to
            the original (2500, 600, 200).

    Raises:
        ValueError: If ``hidden_sizes`` does not contain exactly three entries.

    The attribute names ``fc1`` .. ``fc4`` are kept unchanged so previously
    saved models of this class keep the same layout.
    """

    # Network structure definition
    def __init__(self, in_features=81, hidden_sizes=(2500, 600, 200)):
        super().__init__()
        if len(hidden_sizes) != 3:
            raise ValueError("hidden_sizes must contain exactly three layer widths")
        first, second, third = hidden_sizes
        self.fc1 = nn.Linear(in_features, first)
        self.fc2 = nn.Linear(first, second)
        self.fc3 = nn.Linear(second, third)
        self.fc4 = nn.Linear(third, 1)

    # Data flow definition
    def forward(self, x):
        """Map a feature tensor of shape (..., in_features) to probabilities (..., 1)."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        # Sigmoid squashes the single logit into a probability for binary classification.
        return torch.sigmoid(self.fc4(x))

# NN train and test 


In [5]:
def _f1_from_counts(tp, fp, fn):
    """Return F1 = 2*tp / (2*tp + fp + fn), or 0.0 when the score is undefined."""
    denominator = 2 * tp + fp + fn
    return (2 * tp) / denominator if denominator else 0.0


def nn_train(x_train, y_train, model=None, criterion=None, opt=None,
             run_device=None, checkpoint_position=8000):
    """Run one pass over the training data, updating the model after every sample.

    Args:
        x_train: Indexable collection of per-sample feature rows (numeric).
        y_train: Indexable collection of labels (0 or 1), aligned with x_train.
        model: Network to train. Defaults to the notebook-level ``net``.
        criterion: Loss function. Defaults to the notebook-level ``loss_fn``.
        opt: Optimizer bound to the model's parameters. Defaults to the
            notebook-level ``optimizer``.
        run_device: torch device for the tensors. Defaults to the notebook-level
            ``device``.
        checkpoint_position: Print a progress line every this many samples
            (must be a positive integer).

    Returns:
        The mean loss over all processed samples (0.0 for an empty data set).
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    model = net if model is None else model
    criterion = loss_fn if criterion is None else criterion
    opt = optimizer if opt is None else opt
    run_device = device if run_device is None else run_device

    print("Training model...")
    sample_count = len(x_train)
    if sample_count == 0:
        print("No training samples - nothing to do")
        return 0.0

    # Running confusion-matrix counts used for the progress F1 score.
    tp_res = tn_res = fp_res = fn_res = 0

    window_loss = 0.0   # loss accumulated since the last progress line
    epoch_loss = 0.0    # loss accumulated over the whole pass
    window_start = time.perf_counter()

    for step in range(1, sample_count + 1):
        row = np.asarray(x_train[step - 1], dtype=np.float32)
        input_data = torch.as_tensor(row, device=run_device)
        target_data = torch.tensor([float(y_train[step - 1])], device=run_device)

        # Gradients accumulate across backward() calls, so they must be cleared
        # before every update; otherwise each step applies the sum of all
        # gradients seen so far.
        opt.zero_grad()
        output = model(input_data)
        loss = criterion(output, target_data)
        loss.backward()
        opt.step()

        loss_value = loss.item()
        window_loss += loss_value
        epoch_loss += loss_value

        # Update tp / tn / fp / fn using a 0.5 decision threshold.
        predicted_positive = output.item() >= 0.5
        label = target_data.item()
        if label == 1:
            if predicted_positive:
                tp_res += 1
            else:
                fn_res += 1
        elif label == 0:
            if predicted_positive:
                fp_res += 1
            else:
                tn_res += 1

        if step % checkpoint_position == 0:
            now = time.perf_counter()
            # Average wall-clock time per sample over the window, in milliseconds.
            time_per_cycle = (now - window_start) * 1000 / checkpoint_position
            f1 = _f1_from_counts(tp_res, fp_res, fn_res)
            print("Loss:", round(window_loss / checkpoint_position, 3), "F1:", round(f1, 4),"Progress:",  round((step/sample_count)*100, 3), "%", "Time per cycle:", round(time_per_cycle, 5), "ms")
            window_loss = 0.0
            window_start = now

    mean_loss = epoch_loss / sample_count
    print("--------------------------------------------------")
    print("Batch loss:", mean_loss)
    print("--------------------------------------------------")
    return mean_loss
            

def test(x_test, y_test):
    
    tp_res = 0
    tn_res = 0
    
    fp_res = 0
    fn_res = 0
    

    
    
    print("Benchmarking model...")
    for i in range(0, len(x_test)):
        #print(x_test[i].tolist())
        
        #print start of the progress bar    
        print("Progress:", round((i/len(x_test))*100, 3), "%", end="\r")

        input_data = torch.tensor(x_test[i].tolist(), device=device).float()
        target_data = torch.tensor(y_test[i].tolist(), device=device).float()
        
        
        output=net(input_data)
        
        
        
        if output >= 0.5 and target_data == 1:
            tp_res+=1
        elif output < 0.5 and target_data == 0:
            tn_res+=1
        elif output >= 0.5 and target_data == 0:
            fp_res+=1
        elif output < 0.5 and target_data == 1:
            fn_res+=1
    
    #compute F1
    recall = tp_res/(tp_res+fn_res)
    precision = tp_res/(tp_res+fp_res)
    f1 = 2*((precision*recall)/(precision+recall))
    
    print("f1: ", f1)
    
    # Nicely print confusion matrix
    data = {'Actual Positive': [tp_res, fn_res],
        'Actual Negative': [fp_res, tn_res]}

    df = pd.DataFrame(data, index = ['Predicted Positive', 'Predicted Negative'])

    print(df)

# Network parameters and setup

- Loading pre-trained model from file 
- Network constants

In [6]:
use_saved_model = True      # start from the pickled model at model_path when it exists
save_after_batch = True     # persist the model after every training pass
model_path = "./models/2500phish_net"


### Network parameters ###
learning_rate = 0.000001
epoch_count = 25

net = Net()

if use_saved_model:
    if os.path.isfile(model_path):
        # SECURITY: pickle.load can execute arbitrary code embedded in the file.
        # Only load model files produced by this notebook / a trusted source.
        with open(model_path, 'rb') as model_file:
            net = pickle.load(model_file)
    else:
        print("No saved model found at", model_path, "- using freshly initialised weights")


# send model to GPU (or keep it on the CPU, depending on `device`)
net = net.to(device)

# Create the optimizer only after the model is on its final device.
optimizer = optim.Adam(net.parameters(), lr=learning_rate)

In [None]:
# Fresh optimizer and loss for this training run (re-running the cell resets Adam's state).
optimizer = optim.Adam(net.parameters(), lr=learning_rate)
loss_fn = nn.BCELoss()
optimizer.zero_grad()

for i in range(epoch_count):
    # Train on the prepared numpy feature matrix. The previous name `x_array`
    # is not defined anywhere in this notebook and fails on a fresh kernel.
    nn_train(x_train, y_train)
    print("Batch number:", i)

    if save_after_batch:
        # save model using pickle
        print("Saving model after batch")
        # Write to the configured path and make sure its directory exists.
        model_dir = os.path.dirname(model_path)
        if model_dir:
            os.makedirs(model_dir, exist_ok=True)
        # The context manager closes the file even if pickling fails.
        with open(model_path, 'wb') as model_file:
            pickle.dump(net, model_file)

In [None]:


# Evaluate the current network on the held-out test split; test() prints the
# F1 score and the confusion matrix.
test(x_test, y_test)


Benchmarking model...
f1:  0.8276667353527029
                    Actual Positive  Actual Negative
Predicted Positive             6025             1136
Predicted Negative             1373            50537


NameError: name 'optim' is not defined

In [None]:
# TEST NN
# Same evaluation call as the earlier cell: scores the network on the test split
# and prints F1 plus the confusion matrix.
test(x_test, y_test)