 ----
Splitting the snd-unm test data into normal test data and anomaly test data
----

In [8]:
# Route every snd-unm test sequence into normal.txt or anomaly.txt according
# to the label found on the same line number of the matching .labels file
# (label 0 goes to normal.txt, anything else to anomaly.txt).
#
# The two output files are opened in 'w' mode so that re-running this cell
# rebuilds them from scratch; in append mode every re-run added another full
# copy of the data. The `with` blocks close all handles even if a label
# cannot be parsed.
with open("snd-unm/normal.txt", 'w') as normal, \
        open("snd-unm/anomaly.txt", 'w') as anomaly:
    for i in range(3):
        with open("snd-unm/snd-unm.{}.test".format(i+1), 'r') as unm, \
                open("snd-unm/snd-unm.{}.labels".format(i+1), 'r') as unm_labels:
            for line in unm:
                # Sequences and labels are paired purely by position.
                label = unm_labels.readline()
                # A final line without a newline would otherwise be glued to
                # the first line copied from the next test file.
                if not line.endswith("\n"):
                    line += "\n"
                if int(label) == 0:
                    normal.write(line)
                else:
                    anomaly.write(line)

# Finding the shortest string

In [9]:
# Find the length of the shortest line across the training set and the two
# test subsets; min_length is later passed to split_into_chunks as the chunk
# size. The files are opened in a `with` block so they are closed even when
# min() raises (it raises ValueError if one of the files has no lines).
with open("snd-unm/normal.txt", 'r') as normal, \
        open("snd-unm/anomaly.txt", 'r') as anomaly, \
        open("snd-unm/snd-unm.train", "r") as train:
    # splitlines() drops the line terminators, so len() counts content only.
    min_training = min(train.read().splitlines(), key = len)
    min_normal = min(normal.read().splitlines(), key = len)
    min_anomaly = min(anomaly.read().splitlines(), key = len)

min_overall = min(min_training,min_normal,min_anomaly, key = len)

min_length = len(min_overall)
print("Length of shortest string:", min_length)

Length of shortest string: 7


----------
Creating chunks of fixed length from file
----------

In [10]:
def split_into_chunks(input_file, output_file, chunk_size, chunklist, append=False):
    """Cut every line of a text file into non-overlapping fixed-length chunks.

    Each line of ``input_file`` is walked left to right in steps of
    ``chunk_size`` characters. Trailing characters that do not fill a whole
    chunk are dropped. Within a single line a chunk is written only the first
    time it occurs; repeats on the same line are skipped. Every written chunk
    goes on its own line of ``output_file``.

    Parameters
    ----------
    input_file : str
        Path of the text file to read, one sequence per line.
    output_file : str
        Path of the file that receives the chunks.
    chunk_size : int
        Number of characters per chunk; must be positive.
    chunklist : list
        Mutated in place: for every input line, the number of chunks written
        for that line is appended (0 for lines shorter than ``chunk_size``).
        Callers use these counts to map output lines back to input lines.
    append : bool, optional
        When False (default) ``output_file`` is truncated first, so the file
        always lines up with the counts appended to ``chunklist`` and calling
        the function again does not duplicate data. Pass True to add to an
        existing file instead.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive (the chunking loop would otherwise
        never terminate).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    mode = 'a' if append else 'w'
    with open(input_file, 'r') as source, open(output_file, mode) as sink:
        for line in source:
            # Strip the terminator explicitly instead of assuming one is
            # present: the last line of a file may not end with "\n".
            text = line.rstrip("\n")
            seen = set()  # chunks already written for this line
            number_of_chunks = 0  # for merging the counts later
            for start in range(0, len(text) - chunk_size + 1, chunk_size):
                chunk = text[start:start + chunk_size]
                if chunk not in seen:
                    sink.write(chunk + "\n")
                    seen.add(chunk)
                    number_of_chunks += 1
            chunklist.append(number_of_chunks)

In [11]:
# Source files: the training set plus the two test subsets (normal / anomalous)
train, normal, anomaly = (
    "snd-unm/snd-unm.train",
    "snd-unm/normal.txt",
    "snd-unm/anomaly.txt",
)

# Destination files that receive the fixed-length chunks of each source file
fixed_train, fixed_normal, fixed_anomaly = (
    "snd-unm/fixed_train.txt",
    "snd-unm/fixed_normal.txt",
    "snd-unm/fixed_anomaly.txt",
)

In [12]:
# Per-line chunk counts, filled in by split_into_chunks; they are needed later
# to fold the per-chunk results back into one score per original line.
train_chunks, normal_chunks, anomaly_chunks = [], [], []

# Chunk the three files in the order train -> normal -> anomaly, all with the
# same chunk size (the length of the shortest line found above).
for source_path, target_path, counts in (
    (train, fixed_train, train_chunks),
    (normal, fixed_normal, normal_chunks),
    (anomaly, fixed_anomaly, anomaly_chunks),
):
    split_into_chunks(source_path, target_path, min_length, counts)

# Training the repertoire and testing against normal and anomalous data

In [13]:
import subprocess

# Encoding used to decode the bytes captured from the java subprocess
encoding = 'utf-8'
# Directory the java command is launched from: put the absolute path to the
# negative_selection folder between the quotes before running the cells below
working_dir = r''

## Testing normal calls and saving results

In [14]:
# Run negsel2 on the fixed-length chunks of the normal test data and capture
# everything it prints to standard output.
#
# The command is handed over as ONE string: it relies on the shell for the
# `<` input redirection, and with shell=True a list of arguments only works on
# Windows (on POSIX only the first list element, `java`, would be executed and
# the remaining items would be passed to the shell itself).
#
# -n must equal the chunk length written by split_into_chunks, so it is taken
# from min_length instead of being repeated here as a literal.
bashCommand = ' '.join([
    'java -jar negsel2.jar',
    '-alphabet syscalls/snd-unm/snd-unm.alpha',
    '-self syscalls/snd-unm/fixed_train.txt',
    '-n {} -r 5 -c -l'.format(min_length),
    '< syscalls/snd-unm/fixed_normal.txt',
])

# check_output raises CalledProcessError if java exits with a non-zero status.
process = subprocess.check_output(bashCommand, shell = True, cwd = working_dir)
normalOutput = process.decode(encoding)

## Testing anomalous calls and saving results

In [15]:
# Run negsel2 on the fixed-length chunks of the anomalous test data and
# capture everything it prints to standard output.
#
# The command is handed over as ONE string: it relies on the shell for the
# `<` input redirection, and with shell=True a list of arguments only works on
# Windows (on POSIX only the first list element, `java`, would be executed and
# the remaining items would be passed to the shell itself).
#
# -n must equal the chunk length written by split_into_chunks, so it is taken
# from min_length instead of being repeated here as a literal.
bashCommand = ' '.join([
    'java -jar negsel2.jar',
    '-alphabet syscalls/snd-unm/snd-unm.alpha',
    '-self syscalls/snd-unm/fixed_train.txt',
    '-n {} -r 5 -c -l'.format(min_length),
    '< syscalls/snd-unm/fixed_anomaly.txt',
])

# check_output raises CalledProcessError if java exits with a non-zero status.
process = subprocess.check_output(bashCommand, shell = True, cwd = working_dir)
anomalyOutput = process.decode(encoding)

# Collecting results for computation

In [16]:
import numpy as np

# Turn each captured output into a float64 vector, one entry per output line
# (anomalous run first, then the normal run).
anomaly_preds, normal_preds = (
    np.array(raw_output.splitlines(), dtype = np.float64)
    for raw_output in (anomalyOutput, normalOutput)
)

# Computing final scores by merging counts and averaging them

In [17]:
# Collapse the per-chunk predictions back into one score per original line.
# anomaly_chunks / normal_chunks hold, for every input line, how many chunks
# were written for it, so consecutive runs of that many predictions are
# averaged. This assumes the prediction arrays contain exactly one value per
# written chunk, in the same order.
#
# The run boundaries come from a single cumulative sum rather than re-summing
# the prefix of the counts on every iteration, which was quadratic in the
# number of lines.
# NOTE: a line with a count of 0 yields an empty slice, whose mean is nan.
anomaly_bounds = np.concatenate(([0], np.cumsum(anomaly_chunks))).astype(int)
anomaly_scores = [
    np.mean(anomaly_preds[start:finish])
    for start, finish in zip(anomaly_bounds[:-1], anomaly_bounds[1:])
]

normal_bounds = np.concatenate(([0], np.cumsum(normal_chunks))).astype(int)
normal_scores = [
    np.mean(normal_preds[start:finish])
    for start, finish in zip(normal_bounds[:-1], normal_bounds[1:])
]

# Normal scores first, anomalous scores after.
scores = normal_scores + anomaly_scores

# Calculating AUC score

In [18]:
# Ground-truth vector laid out like `scores`: a block of 0s for the normal
# lines followed by a block of 1s for the anomalous lines.
n_normal, n_anomalous = len(normal_scores), len(anomaly_scores)
normalLabels = np.zeros(n_normal, dtype = np.int8)
anomalyLabels = np.ones(n_anomalous, dtype = np.int8)
targets = np.concatenate([normalLabels, anomalyLabels])

In [19]:
from sklearn.metrics import roc_auc_score
# Area under the ROC curve of the per-line scores against the 0/1 targets
auc_score = roc_auc_score(targets, scores)
print("The AUC score:", auc_score)

The AUC score: 0.8936107526881719


----
Repeating the same process for the snd-cert files
----

In [20]:
# Route every snd-cert test sequence into normal.txt or anomaly.txt according
# to the label found on the same line number of the matching .labels file
# (label 0 goes to normal.txt, anything else to anomaly.txt).
#
# The two output files are opened in 'w' mode so that re-running this cell
# rebuilds them from scratch; in append mode every re-run added another full
# copy of the data. The `with` blocks close all handles even if a label
# cannot be parsed.
with open("snd-cert/normal.txt", 'w') as normal, \
        open("snd-cert/anomaly.txt", 'w') as anomaly:
    for i in range(3):
        with open("snd-cert/snd-cert.{}.test".format(i+1), 'r') as cert, \
                open("snd-cert/snd-cert.{}.labels".format(i+1), 'r') as cert_labels:
            for line in cert:
                # Sequences and labels are paired purely by position.
                label = cert_labels.readline()
                # A final line without a newline would otherwise be glued to
                # the first line copied from the next test file.
                if not line.endswith("\n"):
                    line += "\n"
                if int(label) == 0:
                    normal.write(line)
                else:
                    anomaly.write(line)

In [21]:
# Find the length of the shortest line across the snd-cert training set and
# the two test subsets; min_length is later passed to split_into_chunks as the
# chunk size. The files are opened in a `with` block so they are closed even
# when min() raises (it raises ValueError if one of the files has no lines).
with open("snd-cert/snd-cert.train", "r") as train, \
        open("snd-cert/normal.txt", 'r') as normal, \
        open("snd-cert/anomaly.txt", 'r') as anomaly:
    # splitlines() drops the line terminators, so len() counts content only.
    min_training = min(train.read().splitlines(), key = len)
    min_normal = min(normal.read().splitlines(), key = len)
    min_anomaly = min(anomaly.read().splitlines(), key = len)

min_overall = min(min_training,min_normal,min_anomaly, key = len)

min_length = len(min_overall)
print("Length of the shortest string:", min_length)

Length of the shortest string: 7


In [22]:
# Source files: the training set plus the two test subsets (normal / anomalous)
train, normal, anomaly = (
    "snd-cert/snd-cert.train",
    "snd-cert/normal.txt",
    "snd-cert/anomaly.txt",
)

# Destination files that receive the fixed-length chunks of each source file
fixed_train, fixed_normal, fixed_anomaly = (
    "snd-cert/fixed_train.txt",
    "snd-cert/fixed_normal.txt",
    "snd-cert/fixed_anomaly.txt",
)

In [23]:
# Per-line chunk counts, filled in by split_into_chunks; they are needed later
# to fold the per-chunk results back into one score per original line.
train_chunks, normal_chunks, anomaly_chunks = [], [], []

# Chunk the three files in the order train -> normal -> anomaly, all with the
# same chunk size (the length of the shortest line found above).
for source_path, target_path, counts in (
    (train, fixed_train, train_chunks),
    (normal, fixed_normal, normal_chunks),
    (anomaly, fixed_anomaly, anomaly_chunks),
):
    split_into_chunks(source_path, target_path, min_length, counts)

In [24]:
# Run negsel2 on the fixed-length chunks of the normal snd-cert test data and
# capture everything it prints to standard output.
#
# The command is handed over as ONE string: it relies on the shell for the
# `<` input redirection, and with shell=True a list of arguments only works on
# Windows (on POSIX only the first list element, `java`, would be executed and
# the remaining items would be passed to the shell itself).
#
# -n must equal the chunk length written by split_into_chunks, so it is taken
# from min_length instead of being repeated here as a literal.
bashCommand = ' '.join([
    'java -jar negsel2.jar',
    '-alphabet syscalls/snd-cert/snd-cert.alpha',
    '-self syscalls/snd-cert/fixed_train.txt',
    '-n {} -r 5 -c -l'.format(min_length),
    '< syscalls/snd-cert/fixed_normal.txt',
])

# check_output raises CalledProcessError if java exits with a non-zero status.
process = subprocess.check_output(bashCommand, shell = True, cwd = working_dir)
normalOutput = process.decode(encoding)

In [25]:
# Run negsel2 on the fixed-length chunks of the anomalous snd-cert test data
# and capture everything it prints to standard output.
#
# The command is handed over as ONE string: it relies on the shell for the
# `<` input redirection, and with shell=True a list of arguments only works on
# Windows (on POSIX only the first list element, `java`, would be executed and
# the remaining items would be passed to the shell itself).
#
# -n must equal the chunk length written by split_into_chunks, so it is taken
# from min_length instead of being repeated here as a literal.
bashCommand = ' '.join([
    'java -jar negsel2.jar',
    '-alphabet syscalls/snd-cert/snd-cert.alpha',
    '-self syscalls/snd-cert/fixed_train.txt',
    '-n {} -r 5 -c -l'.format(min_length),
    '< syscalls/snd-cert/fixed_anomaly.txt',
])

# check_output raises CalledProcessError if java exits with a non-zero status.
process = subprocess.check_output(bashCommand, shell = True, cwd = working_dir)
anomalyOutput = process.decode(encoding)

In [26]:
# Turn each captured output into a float64 vector, one entry per output line
# (anomalous run first, then the normal run).
anomaly_preds, normal_preds = (
    np.array(raw_output.splitlines(), dtype = np.float64)
    for raw_output in (anomalyOutput, normalOutput)
)

In [27]:
# Collapse the per-chunk predictions back into one score per original line.
# anomaly_chunks / normal_chunks hold, for every input line, how many chunks
# were written for it, so consecutive runs of that many predictions are
# averaged. This assumes the prediction arrays contain exactly one value per
# written chunk, in the same order.
#
# The run boundaries come from a single cumulative sum rather than re-summing
# the prefix of the counts on every iteration, which was quadratic in the
# number of lines.
# NOTE: a line with a count of 0 yields an empty slice, whose mean is nan.
anomaly_bounds = np.concatenate(([0], np.cumsum(anomaly_chunks))).astype(int)
anomaly_scores = [
    np.mean(anomaly_preds[start:finish])
    for start, finish in zip(anomaly_bounds[:-1], anomaly_bounds[1:])
]

normal_bounds = np.concatenate(([0], np.cumsum(normal_chunks))).astype(int)
normal_scores = [
    np.mean(normal_preds[start:finish])
    for start, finish in zip(normal_bounds[:-1], normal_bounds[1:])
]

# Normal scores first, anomalous scores after.
scores = normal_scores + anomaly_scores

In [28]:
# Ground-truth vector laid out like `scores`: a block of 0s for the normal
# lines followed by a block of 1s for the anomalous lines.
n_normal, n_anomalous = len(normal_scores), len(anomaly_scores)
normalLabels = np.zeros(n_normal, dtype = np.int8)
anomalyLabels = np.ones(n_anomalous, dtype = np.int8)
targets = np.concatenate([normalLabels, anomalyLabels])

In [29]:
# Area under the ROC curve of the per-line scores against the 0/1 targets
auc_score = roc_auc_score(targets, scores)
print("The AUC score:", auc_score)

The AUC score: 0.8210537634408602
