# In [None]:
import os
import sys
import warnings
from elk import ElasticSearch
import config as config
import numpy as np
import collections
import math
from sklearn.ensemble import IsolationForest
from sklearn.metrics import confusion_matrix

# Suppress every warning for the rest of the process (no category or module
# filter is given, so this is global, not limited to one library).
# NOTE(review): this also hides deprecation/convergence warnings from the
# libraries imported above - confirm that is intended.
warnings.filterwarnings("ignore")

def log_querier(index, column, stime, etime=None, rules_should=None, rules_must=None, rules_must_not=None):
    """Build and run one query through the project's ``ElasticSearch`` wrapper.

    A fresh client is created on every call from ``config.es.host`` and
    ``config.es.cred``.

    Args:
        index: Index name or pattern, forwarded to ``es.index``.
        column: Field selection, forwarded to ``es.column``.
        stime: Start of the time window, forwarded to ``es.time``.
        etime: Optional end of the time window. When ``None`` only ``stime``
            is passed to ``es.time``.
        rules_should: Optional rules forwarded to ``es.should``.
        rules_must: Optional rules forwarded to ``es.must``.
        rules_must_not: Optional rules forwarded to ``es.must_not``.

    Returns:
        Whatever ``es.search()`` returns. Callers in this file treat it as a
        sequence of nested dicts (``data[i]['winlog']['event_data'][...]``).
    """
    es = ElasticSearch(config.es.host, config.es.cred)
    es.index(index)
    es.column(column)

    # Identity checks (not ``== None``) so that arguments overloading ``==``
    # (e.g. array-likes) cannot change which branch is taken.
    if etime is None:
        es.time(stime)
    else:
        es.time(stime, etime)

    # Each optional rule set is applied only when the caller supplied one; an
    # empty list is still forwarded, exactly as before.
    if rules_should is not None:
        es.should(rules_should)
    if rules_must is not None:
        es.must(rules_must)
    if rules_must_not is not None:
        es.must_not(rules_must_not)

    return es.search()


def estimate_shannon_entropy(dna_sequence):
    """Return the Shannon entropy, in bits per symbol, of a sequence.

    Despite the parameter name, any sized sequence of hashable symbols is
    accepted; in this file the function is called with DNS query-name strings.

    Args:
        dna_sequence: String (or other sized iterable of hashable items)
            whose symbol distribution is measured.

    Returns:
        ``-sum(p_i * log2(p_i))`` over the distinct symbols, where ``p_i`` is
        the relative frequency of symbol ``i``. An empty sequence and a
        sequence with a single distinct symbol both yield ``0.0``.
    """
    total = len(dna_sequence)
    if total == 0:
        return 0.0

    symbol_counts = collections.Counter(dna_sequence)

    # Subtract each term instead of negating the final sum: the magnitude is
    # identical, but a zero-entropy input yields 0.0 rather than -0.0.
    entropy = 0.0
    for count in symbol_counts.values():
        p_i = count / float(total)
        # math.log(p, 2) is kept (rather than math.log2) so that the feature
        # values fed to the model stay bit-for-bit what they were.
        entropy -= p_i * math.log(p_i, 2)

    return entropy


def iforest(train, test, true_labels=None):
    """Fit an IsolationForest on ``train`` and report how it scores ``test``.

    The model is built with ``contamination=0.1`` and ``random_state=250``.
    Predictions for ``test`` are compared against ``true_labels`` and the
    confusion matrix and accuracy are printed; nothing is returned.

    Args:
        train: Feature matrix used to fit the model.
        test: Feature matrix to classify.
        true_labels: Optional ground-truth labels for ``test`` using the
            IsolationForest convention (-1 = outlier, 1 = inlier). When
            omitted, the historical hard-coded truth is used: 335 rows
            labelled -1 followed by 2 rows labelled 1, which requires
            ``test`` to have exactly 337 rows.

    Raises:
        ValueError: If the number of labels differs from the number of
            predictions.
    """
    print("Model is Establishing...\n")
    # fit the model
    clf = IsolationForest(contamination=0.1, random_state=250)
    clf.fit(train)
    pred_array = clf.predict(test)

    # true label
    if true_labels is None:
        true_labels = [-1] * 335 + [1] * 2

    # Fail early with a readable message; previously the mismatch surfaced
    # from inside confusion_matrix after the headers had been printed.
    if len(true_labels) != len(pred_array):
        raise ValueError(
            "iforest: got %d true labels for %d test rows"
            % (len(true_labels), len(pred_array))
        )

    print("Confusion Matrix:\n")
    print("TN\tFP\nFN\tTP")
    # Pin the label order so the matrix is always 2x2 with -1 in row/column 0
    # (matching the printed header), even if one class is absent.
    a = confusion_matrix(true_labels, pred_array.tolist(), labels=[-1, 1])
    print(a)
    accuracy = (a[0][0] + a[1][1]) / (a[0][0] + a[0][1] + a[1][0] + a[1][1])
    print("\nAccuracy: ", accuracy)


# Number of features produced per query name by _query_name_features.
_FEATURE_COUNT = 8


def _fetch_query_name_records(index, start_time, end_time):
    """Query ``index`` for QueryName fields of events with winlog.event_id 22."""
    # NOTE(review): event id 22 is presumably the DNS-query event - confirm.
    return log_querier(
        index,
        ['winlog.event_data.QueryName'],
        start_time,
        end_time,
        rules_must=[{'winlog.event_id': 22}],
    )


def _query_name_features(query_name):
    """Return the 8 lexical features of one query name as a list.

    Order: [total length, length of the first label, uppercase count,
    numeric count, Shannon entropy, number of labels, maximum label length,
    average label length].
    """
    labels = query_name.split(".")
    label_lengths = [len(label) for label in labels]
    return [
        len(query_name),
        label_lengths[0],
        sum(1 for ch in query_name if ch.isupper()),
        sum(1 for ch in query_name if ch.isnumeric()),
        estimate_shannon_entropy(query_name),
        len(labels),
        max(label_lengths),
        # Characters that are not dots, divided by the number of labels.
        (len(query_name) - len(labels) + 1) / len(labels),
    ]


def _feature_matrix(records, limit=None):
    """Turn query records into an (n, 8) float array, using at most ``limit`` rows."""
    count = len(records) if limit is None else min(len(records), limit)
    # Only len() and integer indexing are used on ``records``, the same
    # operations the original loops relied on.
    rows = [
        _query_name_features(records[i]['winlog']['event_data']['QueryName'])
        for i in range(count)
    ]
    # Build the array once instead of np.append per row (which copies the
    # whole array each time); reshape keeps a 2-D shape even for zero rows.
    return np.array(rows, dtype=float).reshape(-1, _FEATURE_COUNT)


def trained_data(max_records=337):
    """Return the training feature matrix built from the host logs.

    Queries ``logstash-host.winlogbeat*`` between 2021-05-09T20:00 and
    2021-05-20T15:00 (+0800) and converts the first ``max_records`` results
    into feature rows.

    Args:
        max_records: Upper bound on the number of rows used. Defaults to 337,
            the count previously hard-coded. If the query returns fewer
            records, all of them are used instead of raising IndexError.

    Returns:
        A float numpy array of shape (n, 8).
    """
    data = _fetch_query_name_records(
        "logstash-host.winlogbeat*",
        "2021-05-09T20:00:00+0800",
        "2021-05-20T15:00:00+0800",
    )
    return _feature_matrix(data, max_records)


def test_data():
    """Return the test feature matrix built from the victim logs.

    Queries ``logstash-victim.winlogbeat*`` between 2021-07-15T10:30 and
    2021-07-15T12:00 (+0800) and converts every result into a feature row.

    Returns:
        A float numpy array of shape (n, 8); it has zero rows when the query
        returns nothing.
    """
    data = _fetch_query_name_records(
        "logstash-victim.winlogbeat*",
        "2021-07-15T10:30:00+0800",
        "2021-07-15T12:00:00+0800",
    )
    test = _feature_matrix(data)

    print("Collection of 8 Vectors of Logs is Done (base32 encoded)\n")
    return test

def main():
    """Build the training and test matrices, then fit and evaluate the model."""
    # Training data is fetched first, then test data, matching the original
    # left-to-right argument evaluation order.
    training_matrix = trained_data()
    evaluation_matrix = test_data()
    iforest(training_matrix, evaluation_matrix)
    
# Run the pipeline only when this file is executed directly, not on import.
if __name__ == '__main__':
    main()