In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from utils import read_data
from gensim.models import FastText
from copy import deepcopy
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
import warnings
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    from imblearn.over_sampling import SMOTE

## Helper functions

In [2]:
def make_clf(usx, usy, clf, clf_name, level):
    """
    Function for the classification task - Trains and tests the classifier clf using 10-fold cross-validation
    The sampling parameter sets the type of sampling to be used
    :param usx: the input instances
    :param usy: the labels of the instances
    :param clf: the classifier to be used
    :param clf_name: the name of the classifier (for plotting reasons)
    :param level: the evaluation level (for plotting reasons)
    :return: the classification results
    """
    print('---------- Hold-out with {} at {} level ----------'.format(clf_name, level))
    
    x_train, x_test, y_train, y_test = train_test_split(usx, usy, test_size=0.3,
                                                        stratify=usy,random_state=42)
    
    totalTP, totalFP, totalFN, totalTN = 0, 0, 0, 0
    # apply SMOTE for imbalance issues
    x_train, y_train = SMOTE(sampling_strategy='auto', k_neighbors=1).fit_resample(x_train, y_train)

    # fit the model and make predictions
    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_test)

    for i in range(len(y_predict)):
        if y_test[i] and y_predict[i]:
            totalTP += 1
        if not y_test[i] and y_predict[i]:
            totalFP += 1
        if y_test[i] and not y_predict[i]:
            totalFN += 1
        if not y_test[i] and not y_predict[i]:
            totalTN += 1

    # just in case that no TP or FP are found
    if not (totalTP + totalFP):
        precision = 1
    else:
        precision = totalTP / (totalTP + totalFP)
    recall = totalTP / (totalTP + totalFN)
    accuracy = (totalTP + totalTN) / (totalTP + totalFN + totalTN + totalFP)
    print('TOTAL TP: ' + str(totalTP))
    print('TOTAL FP: ' + str(totalFP))
    print('TOTAL FN: ' + str(totalFN))
    print('TOTAL TN: ' + str(totalTN))
    print('TOTAL Accuracy: ' + str(accuracy))
    print('TOTAL Precision: ' + str(precision))
    print('TOTAL Recall: ' + str(recall))

In [3]:
def make_clf_cross(usx, usy, clf, clf_name, level):
    """
    Function for the classification task - Trains and tests the classifier clf using 10-fold cross-validation
    The sampling parameter sets the type of sampling to be used
    :param usx: the input instances
    :param usy: the labels of the instances
    :param clf: the classifier to be used
    :param clf_name: the name of the classifier (for plotting reasons)
    :param level: the evaluation level (for plotting reasons)
    :return: the classification results
    """
    print('---------- Cross-validation with {} at {} level ----------'.format(clf_name, level))
    totalTP, totalFP, totalFN, totalTN = 0, 0, 0, 0
    j = 0
    skf = StratifiedKFold(n_splits=3, shuffle=True)  # apply 10-fold stratified cross validation
    for train_index, test_index in skf.split(usx, usy):

        # split data in training and test set
        x_train, x_test = usx[train_index], usx[test_index]
        y_train, y_test = usy[train_index], usy[test_index]
        # apply SMOTE for imbalance issues
        x_train, y_train = SMOTE(sampling_strategy='auto', k_neighbors=1).fit_resample(x_train, y_train)

        # create_clusters(x_train, y_train, train_ips)  # TODO: not fully implemented yet - decisions still to be made

        # fit the model and make predictions
        clf.fit(x_train, y_train)
        y_predict = clf.predict(x_test)

        for i in range(len(y_predict)):
            if y_test[i] and y_predict[i]:
                totalTP += 1
            if not y_test[i] and y_predict[i]:
                totalFP += 1
            if y_test[i] and not y_predict[i]:
                totalFN += 1
            if not y_test[i] and not y_predict[i]:
                totalTN += 1
        j += 1

    # just in case that no TP or FP are found
    if not (totalTP + totalFP):
        precision = 1
    else:
        precision = totalTP / (totalTP + totalFP)
    recall = totalTP / (totalTP + totalFN)
    accuracy = (totalTP + totalTN) / (totalTP + totalFN + totalTN + totalFP)
    print('TOTAL TP: ' + str(totalTP))
    print('TOTAL FP: ' + str(totalFP))
    print('TOTAL FN: ' + str(totalFN))
    print('TOTAL TN: ' + str(totalTN))
    print('TOTAL Accuracy: ' + str(accuracy))
    print('TOTAL Precision: ' + str(precision))
    print('TOTAL Recall: ' + str(recall))

In [4]:
def check_infected(val, infected_ips):
    """
    Function to assign labels to instances given their ip
    :param val: the flow to be checked
    :param infected_ips: the list of infected hosts
    :return: 1 if val is in the infected list otherwise 0
    """
    return 1 if val in infected_ips else 0

In [5]:
def embedding_host_level(z):
    x = []
    for i,flow in enumerate(z.iterrows()):
        x.append(list(model.wv[z.iloc[i]].sum(axis=0)))
    x = np.array(x)
    return x.mean(axis=0)

In [6]:
def discretize_protocol(item):
    return list(protocols).index(item)

## Read data

In [7]:
infected_host = '147.32.84.165'
data_52 = read_data('datasets/CTU-Malware-Capture-Botnet-52')
data_52.to_pickle('./data_52.pkl')
# data_52 = pd.read_pickle('./data_52.pkl')

data_51 = read_data('datasets/CTU-Malware-Capture-Botnet-51')
data_51.to_pickle('./data_51.pkl')
# data_51 = pd.read_pickle('./data_51.pkl')

In [8]:
# Used to keep the original data
data_original_52 = deepcopy(data_52)

data_original_51 = deepcopy(data_51)

# Discretize data


In [9]:
# discretize protocol
protocols_52 = set(data_52.protocol.unique())
data_52['protocol'] = data_52['protocol'].apply(lambda x: list(protocols_52).index(x))


protocols_51 = set(data_51.protocol.unique())
data_51['protocol'] = data_51['protocol'].apply(lambda x: list(protocols_51).index(x))

In [10]:
# discretize flags

flags_52 = set(data_52.flags.unique())
data_52['flags'] = data_52['flags'].apply(lambda x: list(flags_52).index(x))

flags_51 = set(data_51.flags.unique())
data_51['flags'] = data_51['flags'].apply(lambda x: list(flags_51).index(x))


In [11]:
# discretize label

data_52['label'] = data_52['label'].replace('Botnet', 0)
data_52['label'] = data_52['label'].replace('LEGITIMATE', 1)

data_51['label'] = data_51['label'].replace('Botnet', 0)
data_51['label'] = data_51['label'].replace('LEGITIMATE', 1)

In [12]:
# remove background flows and date

data_52 = data_52[data_52['label'] != 'Background']
data_52.drop(['date'],1, inplace=True)

data_51 = data_51[data_51['label'] != 'Background']
data_51.drop(['date'],1, inplace=True)

In [13]:
# replace 'na' values in  src and dst port

data_52['src_port'] = data_52['src_port'].replace('na', 0)
data_52['dst_port'] = data_52['dst_port'].replace('na', 0)

data_51['src_port'] = data_51['src_port'].replace('na', 0)
data_51['dst_port'] = data_51['dst_port'].replace('na', 0)

In [14]:
data_52.reset_index(inplace=True)
data_52.head()

Unnamed: 0,index,duration,protocol,src_ip,src_port,dst_ip,dst_port,flags,tos,packets,bytes,flows,label
0,21,0.0,9,147.32.86.194,3972,173.194.1.27,80,76,0,2,108,1,1
1,22,0.015,9,147.32.86.194,3974,74.125.108.185,80,76,0,2,108,1,1
2,35,0.0,9,147.32.84.28,56781,147.32.80.13,80,49,0,2,132,1,1
3,36,0.117,9,147.32.84.164,57264,64.12.68.227,443,23,0,2,114,1,1
4,37,0.117,9,147.32.84.164,43869,64.12.73.134,443,23,0,2,114,1,1


In [15]:
data_51.reset_index(inplace=True)
data_51.head()

Unnamed: 0,index,duration,protocol,src_ip,src_port,dst_ip,dst_port,flags,tos,packets,bytes,flows,label
0,20,4.985,13,147.32.80.13,80,147.32.85.88,56949,90,0,91,86277,1,1
1,51,0.0,13,147.32.86.110,48102,74.125.232.214,443,83,0,1,66,1,1
2,57,4.921,13,147.32.85.88,56949,147.32.80.13,80,83,0,49,3234,1,1
3,73,4.742,13,147.32.84.59,2768,74.125.108.208,80,83,0,118,7080,1,1
4,75,0.0,13,147.32.84.59,56058,74.125.232.215,443,83,0,1,60,1,1


## Classification without embeddings on dataset 51

In [16]:
# simple classifcation

# set the classifiers
clf_name = 'RandomForestClassifier'
clf = RandomForestClassifier(n_estimators=50, criterion='gini', class_weight='balanced')

infected_ips = ['147.32.84.165', '147.32.84.191', '147.32.84.192', '147.32.84.193',
                '147.32.84.204', '147.32.84.205', '147.32.84.206', '147.32.84.207',
                '147.32.84.208', '147.32.84.209']

# enter the classification phase for each level
eval_levels = ['packet', 'host']  # the 2 evaluation levels
# eval_levels = ['host']  # the 2 evaluation levels


for level in eval_levels:
    # prepare the data according to the level

    if level == 'host':
        new_data_51 = data_51.groupby('src_ip').mean().reset_index()
#         new_data = data.groupby('src_ip').sum().reset_index()

    else:
        new_data_51 = deepcopy(data_51)
        
    # label the processed dataset(s)
    new_data_51['label'] = new_data_51['src_ip'].apply(lambda z: check_infected(z, infected_ips))

    # separate the labels from the rest of the dataset
    y = new_data_51['label'].values
    if 'dst_ip' in new_data_51:
        x = new_data_51.drop(['src_ip','dst_ip', 'label'], axis=1).values
    else:
        x = new_data_51.drop(['src_ip', 'label'], axis=1).values

    # enter the classification phase
    print('Start the classification process')
    usx = np.copy(x)
    usy = np.copy(y)
    make_clf_cross(usx, usy, clf, clf_name, level)
    make_clf(usx, usy, clf, clf_name, level)

Start the classification process
---------- Cross-validation with RandomForestClassifier at packet level ----------
TOTAL TP: 218751
TOTAL FP: 13
TOTAL FN: 24
TOTAL TN: 426570
TOTAL Accuracy: 0.9999426674806852
TOTAL Precision: 0.9999405752317566
TOTAL Recall: 0.9998902982516283
---------- Hold-out with RandomForestClassifier at packet level ----------
TOTAL TP: 65628
TOTAL FP: 6
TOTAL FN: 5
TOTAL TN: 127969
TOTAL Accuracy: 0.9999431841659435
TOTAL Precision: 0.9999085839656275
TOTAL Recall: 0.9999238188106593
Start the classification process
---------- Cross-validation with RandomForestClassifier at host level ----------
TOTAL TP: 9
TOTAL FP: 0
TOTAL FN: 1
TOTAL TN: 512
TOTAL Accuracy: 0.9980842911877394
TOTAL Precision: 1.0
TOTAL Recall: 0.9
---------- Hold-out with RandomForestClassifier at host level ----------
TOTAL TP: 3
TOTAL FP: 0
TOTAL FN: 0
TOTAL TN: 154
TOTAL Accuracy: 1.0
TOTAL Precision: 1.0
TOTAL Recall: 1.0


## Classification with embeddings

### Train FASTTEXT model

In [17]:
# train model

df = data_52.apply(lambda x: ','.join(x.astype(str)), axis=1)
sent = [row.split(',') for row in df]
model = FastText(sent, min_count=1, size=50, workers=2, window=3, sg=1)

# save model
if not os.path.exists('models'):
    os.makedirs('models')
fname = 'models/fasttext.model'
model.save(fname)

# load trained model
# fname = 'models/fasttext.model'
# model = FastText.load(fname)

### Build the train data for packet level

In [18]:
# Build the train data for packet level

packet_x = []
infected_ips = ['147.32.84.165', '147.32.84.191', '147.32.84.192', '147.32.84.193',
                '147.32.84.204', '147.32.84.205', '147.32.84.206', '147.32.84.207',
                '147.32.84.208', '147.32.84.209']

data_51['label'] = data_51['src_ip'].apply(lambda z: check_infected(z, infected_ips)).values
dataset = data_51.drop('label', 1)

packet_y = data_51['label'].values
for i,flow in enumerate(dataset.iterrows()):
    packet_x.append(list(model.wv[dataset.iloc[i]].sum(axis=0)))

packet_x = np.array(packet_x)
packet_y = np.array(packet_y, dtype=int)

### Build the train data for host level


In [19]:

infected_ips = ['147.32.84.165', '147.32.84.191', '147.32.84.192', '147.32.84.193',
                '147.32.84.204', '147.32.84.205', '147.32.84.206', '147.32.84.207',
                '147.32.84.208', '147.32.84.209']

new_data_test = data_51.groupby('src_ip').apply(lambda z: embedding_host_level(z)).reset_index()
new_data_test['label'] = new_data_test['src_ip'].apply(lambda z: check_infected(z, infected_ips)).values

host_y = new_data_test['label'].values
host_x = new_data_test.drop(['src_ip', 'label'], axis=1).values

host_x=np.array([v[0] for v in host_x])

host_y = np.array(host_y, dtype=int)

### Perform classification

In [20]:
# simple classifcation

# set the classifiers
clf_name = 'RandomForestClassifier'
clf = RandomForestClassifier(n_estimators=50, criterion='gini', class_weight='balanced')


# enter the classification phase for each level
eval_levels = ['packet', 'host']  # the 2 evaluation levels


for level in eval_levels:
    # prepare the data according to the level

    if level == 'host':
        usx = np.copy(host_x)
        usy = np.copy(host_y)
    else:
        usx = np.copy(packet_x)
        usy = np.copy(packet_y)


    # enter the classification phase
    print('\nStart the classification process')
    make_clf(usx, usy, clf, clf_name, level)
    make_clf_cross(usx, usy, clf, clf_name, level)


Start the classification process
---------- Hold-out with RandomForestClassifier at packet level ----------
TOTAL TP: 65571
TOTAL FP: 53
TOTAL FN: 62
TOTAL TN: 127922
TOTAL Accuracy: 0.999406016280319
TOTAL Precision: 0.9991923686456174
TOTAL Recall: 0.999055353252175
---------- Cross-validation with RandomForestClassifier at packet level ----------
TOTAL TP: 218541
TOTAL FP: 141
TOTAL FN: 234
TOTAL TN: 426442
TOTAL Accuracy: 0.9994189271691062
TOTAL Precision: 0.9993552281394902
TOTAL Recall: 0.9989304079533767

Start the classification process
---------- Hold-out with RandomForestClassifier at host level ----------
TOTAL TP: 3
TOTAL FP: 0
TOTAL FN: 0
TOTAL TN: 154
TOTAL Accuracy: 1.0
TOTAL Precision: 1.0
TOTAL Recall: 1.0
---------- Cross-validation with RandomForestClassifier at host level ----------
TOTAL TP: 10
TOTAL FP: 3
TOTAL FN: 0
TOTAL TN: 509
TOTAL Accuracy: 0.9942528735632183
TOTAL Precision: 0.7692307692307693
TOTAL Recall: 1.0
