In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from utils import read_data
from gensim.models import FastText
from copy import deepcopy
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
import warnings
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    from imblearn.over_sampling import SMOTE

## Helper functions

In [2]:
def make_clf(usx, usy, clf, clf_name, level):
    """
    Function for the classification task - Trains and tests the classifier clf using 10-fold cross-validation
    The sampling parameter sets the type of sampling to be used
    :param usx: the input instances
    :param usy: the labels of the instances
    :param clf: the classifier to be used
    :param clf_name: the name of the classifier (for plotting reasons)
    :param level: the evaluation level (for plotting reasons)
    :return: the classification results
    """
    print('---------- Hold-out with {} at {} level ----------'.format(clf_name, level))
    
    x_train, x_test, y_train, y_test = train_test_split(usx, usy, test_size=0.3, stratify=usy,random_state=42)
    
    totalTP, totalFP, totalFN, totalTN = 0, 0, 0, 0
    # apply SMOTE for imbalance issues
    x_train, y_train = SMOTE(sampling_strategy=0.5, k_neighbors=1).fit_resample(x_train, y_train)

    # fit the model and make predictions
    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_test)

    for i in range(len(y_predict)):
        if y_test[i] and y_predict[i]:
            totalTP += 1
        if not y_test[i] and y_predict[i]:
            totalFP += 1
        if y_test[i] and not y_predict[i]:
            totalFN += 1
        if not y_test[i] and not y_predict[i]:
            totalTN += 1

    # just in case that no TP or FP are found
    if not (totalTP + totalFP):
        precision = 1
    else:
        precision = totalTP / (totalTP + totalFP)
    recall = totalTP / (totalTP + totalFN)
    accuracy = (totalTP + totalTN) / (totalTP + totalFN + totalTN + totalFP)
    print('TOTAL TP: ' + str(totalTP))
    print('TOTAL FP: ' + str(totalFP))
    print('TOTAL FN: ' + str(totalFN))
    print('TOTAL TN: ' + str(totalTN))
    print('TOTAL Accuracy: ' + str(accuracy))
    print('TOTAL Precision: ' + str(precision))
    print('TOTAL Recall: ' + str(recall))

In [3]:
def make_clf_cross(usx, usy, clf, clf_name, level):
    """
    Function for the classification task - Trains and tests the classifier clf using 10-fold cross-validation
    The sampling parameter sets the type of sampling to be used
    :param usx: the input instances
    :param usy: the labels of the instances
    :param clf: the classifier to be used
    :param clf_name: the name of the classifier (for plotting reasons)
    :param level: the evaluation level (for plotting reasons)
    :return: the classification results
    """
    print('---------- Cross-validation with {} at {} level ----------'.format(clf_name, level))
    totalTP, totalFP, totalFN, totalTN = 0, 0, 0, 0
    j = 0
    skf = StratifiedKFold(n_splits=3, shuffle=True)  # apply 10-fold stratified cross validation
    for train_index, test_index in skf.split(usx, usy):

        # split data in training and test set
        x_train, x_test = usx[train_index], usx[test_index]
        y_train, y_test = usy[train_index], usy[test_index]
        # apply SMOTE for imbalance issues
        x_train, y_train = SMOTE(sampling_strategy=0.5, k_neighbors=1).fit_resample(x_train, y_train)

        # create_clusters(x_train, y_train, train_ips)  # TODO: not fully implemented yet - decisions still to be made

        # fit the model and make predictions
        clf.fit(x_train, y_train)
        y_predict = clf.predict(x_test)

        for i in range(len(y_predict)):
            if y_test[i] and y_predict[i]:
                totalTP += 1
            if not y_test[i] and y_predict[i]:
                totalFP += 1
            if y_test[i] and not y_predict[i]:
                totalFN += 1
            if not y_test[i] and not y_predict[i]:
                totalTN += 1
        j += 1

    # just in case that no TP or FP are found
    if not (totalTP + totalFP):
        precision = 1
    else:
        precision = totalTP / (totalTP + totalFP)
    recall = totalTP / (totalTP + totalFN)
    accuracy = (totalTP + totalTN) / (totalTP + totalFN + totalTN + totalFP)
    print('TOTAL TP: ' + str(totalTP))
    print('TOTAL FP: ' + str(totalFP))
    print('TOTAL FN: ' + str(totalFN))
    print('TOTAL TN: ' + str(totalTN))
    print('TOTAL Accuracy: ' + str(accuracy))
    print('TOTAL Precision: ' + str(precision))
    print('TOTAL Recall: ' + str(recall))

In [4]:
def check_infected(val, infected_ips):
    """
    Function to assign labels to instances given their ip
    :param val: the flow to be checked
    :param infected_ips: the list of infected hosts
    :return: 1 if val is in the infected list otherwise 0
    """
    return 1 if val in infected_ips else 0

In [5]:
def embedding_host_level(z):
    x = []
    for i,flow in enumerate(z.iterrows()):
        x.append(list(model.wv[z.iloc[i]].sum(axis=0)))
    x = np.array(x)
    return x.mean(axis=0)

In [6]:
def discretize_protocol(item):
    return list(protocols).index(item)

## Read data

In [7]:
infected_host = '147.32.84.165'
data = read_data('datasets/CTU-Malware-Capture-Botnet-52')
data.to_pickle('./data_52.pkl')
# data = pd.read_pickle('./data_52.pkl')

In [8]:
# Used to keep the original data
data_original = deepcopy(data)

# Discretize data


In [9]:
# discretize protocol
protocols = set(data.protocol.unique())
data['protocol'] = data['protocol'].apply(lambda x: list(protocols).index(x))

In [10]:
# discretize flags

flags = set(data.flags.unique())
data['flags'] = data['flags'].apply(lambda x: list(flags).index(x))

In [11]:
# discretize label

data['label'] = data['label'].replace('Botnet', 0)
data['label'] = data['label'].replace('LEGITIMATE', 1)

In [12]:
# remove background flows and date

data = data[data['label'] != 'Background']
data.drop(['date'],1, inplace=True)

In [13]:
# replace 'na' values in  src and dst port

data['src_port'] = data['src_port'].replace('na', 0)
data['dst_port'] = data['dst_port'].replace('na', 0)

In [14]:
data.reset_index(inplace=True)
data.head()

Unnamed: 0,index,duration,protocol,src_ip,src_port,dst_ip,dst_port,flags,tos,packets,bytes,flows,label
0,21,0.0,5,147.32.86.194,3972,173.194.1.27,80,96,0,2,108,1,1
1,22,0.015,5,147.32.86.194,3974,74.125.108.185,80,96,0,2,108,1,1
2,35,0.0,5,147.32.84.28,56781,147.32.80.13,80,9,0,2,132,1,1
3,36,0.117,5,147.32.84.164,57264,64.12.68.227,443,72,0,2,114,1,1
4,37,0.117,5,147.32.84.164,43869,64.12.73.134,443,72,0,2,114,1,1


## Classification without embeddings

In [15]:
# simple classifcation

# set the classifiers
clf_name = 'RandomForestClassifier'
clf = RandomForestClassifier(n_estimators=50, criterion='gini', class_weight='balanced')

infected_ips = ['147.32.84.165', '147.32.84.191', '147.32.84.192']

# enter the classification phase for each level
eval_levels = ['packet', 'host']  # the 2 evaluation levels
# eval_levels = ['host']  # the 2 evaluation levels


for level in eval_levels:
    # prepare the data according to the level

    if level == 'host':
        new_data = data.groupby('src_ip').mean().reset_index()
#         new_data = data.groupby('src_ip').sum().reset_index()

    else:
        new_data = deepcopy(data)
        
    # label the processed dataset(s)
    new_data['label'] = new_data['src_ip'].apply(lambda z: check_infected(z, infected_ips))

    # separate the labels from the rest of the dataset
    y = new_data['label'].values
    if 'dst_ip' in new_data:
        x = new_data.drop(['src_ip','dst_ip', 'label'], axis=1).values
    else:
        x = new_data.drop(['src_ip', 'label'], axis=1).values

    # enter the classification phase
    print('Start the classification process')
    usx = np.copy(x)
    usy = np.copy(y)
    make_clf_cross(usx, usy, clf, clf_name, level)
    make_clf(usx, usy, clf, clf_name, level)

Start the classification process
---------- Cross-validation with RandomForestClassifier at packet level ----------
TOTAL TP: 277671
TOTAL FP: 1
TOTAL FN: 5
TOTAL TN: 11225
TOTAL Accuracy: 0.9999792317117916
TOTAL Precision: 0.9999963986285978
TOTAL Recall: 0.9999819934023826
---------- Hold-out with RandomForestClassifier at packet level ----------
TOTAL TP: 83298
TOTAL FP: 0
TOTAL FN: 5
TOTAL TN: 3368
TOTAL Accuracy: 0.9999423105767788
TOTAL Precision: 1.0
TOTAL Recall: 0.9999399781520474
Start the classification process
---------- Cross-validation with RandomForestClassifier at host level ----------
TOTAL TP: 2
TOTAL FP: 0
TOTAL FN: 1
TOTAL TN: 117
TOTAL Accuracy: 0.9916666666666667
TOTAL Precision: 1.0
TOTAL Recall: 0.6666666666666666
---------- Hold-out with RandomForestClassifier at host level ----------
TOTAL TP: 1
TOTAL FP: 0
TOTAL FN: 0
TOTAL TN: 35
TOTAL Accuracy: 1.0
TOTAL Precision: 1.0
TOTAL Recall: 1.0


## Classification with embeddings

### Split data for model training

In [16]:
# split data for the fasttext model training and testing

data_train, data_test = train_test_split(data, test_size=0.5, stratify=data['label'])

### Train FASTTEXT model

In [17]:
# train model

df = data_train.apply(lambda x: ','.join(x.astype(str)), axis=1)
sent = [row.split(',') for row in df]
model = FastText(sent, min_count=1, size=50, workers=2, window=3, sg=1)

# save model
fname = 'models/fasttext.model'
model.save(fname)

# load trained model
# fname = 'models/fasttext.model'
# model = FastText.load(fname)

### Build the train data for packet level

In [18]:
# Build the train data for packet level

packet_x = []
infected_ips = ['147.32.84.165', '147.32.84.191', '147.32.84.192']

data_test['label'] = data_test['src_ip'].apply(lambda z: check_infected(z, infected_ips))
dataset = data_test.drop('label', 1)

packet_y = data_test['label'].values
for i,flow in enumerate(dataset.iterrows()):
    packet_x.append(list(model.wv[dataset.iloc[i]].sum(axis=0)))

packet_x = np.array(packet_x)
packet_y = np.array(packet_y, dtype=int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Build the train data for host level


In [19]:

infected_ips = ['147.32.84.165', '147.32.84.191', '147.32.84.192']

new_data_test = data_test.groupby('src_ip').apply(lambda z: embedding_host_level(z)).reset_index()
new_data_test['label'] = new_data_test['src_ip'].apply(lambda z: check_infected(z, infected_ips))

host_y = new_data_test['label'].values
host_x = new_data_test.drop(['src_ip', 'label'], axis=1).values

host_x=np.array([v[0] for v in host_x])

host_y = np.array(host_y, dtype=int)

### Perform classification

In [21]:
# simple classifcation

# set the classifiers
clf_name = 'RandomForestClassifier'
clf = RandomForestClassifier(n_estimators=50, criterion='gini', class_weight='balanced')


# enter the classification phase for each level
eval_levels = ['packet', 'host']  # the 2 evaluation levels


for level in eval_levels:
    # prepare the data according to the level

    if level == 'host':
        usx = np.copy(host_x)
        usy = np.copy(host_y)
    else:
        usx = np.copy(packet_x)
        usy = np.copy(packet_y)


    # enter the classification phase
    print('\nStart the classification process')
    make_clf(usx, usy, clf, clf_name, level)
    make_clf_cross(usx, usy, clf, clf_name, level)


Start the classification process
---------- Hold-out with RandomForestClassifier at packet level ----------
TOTAL TP: 41652
TOTAL FP: 2
TOTAL FN: 3
TOTAL TN: 1679
TOTAL Accuracy: 0.9998846224847702
TOTAL Precision: 0.9999519854035627
TOTAL Recall: 0.9999279798343537
---------- Cross-validation with RandomForestClassifier at packet level ----------
TOTAL TP: 138838
TOTAL FP: 3
TOTAL FN: 10
TOTAL TN: 5600
TOTAL Accuracy: 0.99991000408443
TOTAL Precision: 0.9999783925497512
TOTAL Recall: 0.9999279787969578

Start the classification process
---------- Hold-out with RandomForestClassifier at host level ----------
TOTAL TP: 0
TOTAL FP: 0
TOTAL FN: 1
TOTAL TN: 30
TOTAL Accuracy: 0.967741935483871
TOTAL Precision: 1
TOTAL Recall: 0.0
---------- Cross-validation with RandomForestClassifier at host level ----------
TOTAL TP: 2
TOTAL FP: 1
TOTAL FN: 1
TOTAL TN: 98
TOTAL Accuracy: 0.9803921568627451
TOTAL Precision: 0.6666666666666666
TOTAL Recall: 0.6666666666666666
