# Identifying malicious events by analysing Sysmon logs

In [1]:
from IPython.display import clear_output

In [None]:
# Downloads
! git clone https://github.com/dtrizna/SysmonRNN.git
! mv /content/SysmonRNN/data/logs.ft /content/sysmon.ft
! mv /content/SysmonRNN/data/logs_pretty.xml /content/sysmon.xml
! mv /content/SysmonRNN/data/pid_malicious.lst /content/pid_malicious.lst 
! rm -rf /content/SysmonRNN

clear_output()

In [275]:
# Imports
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier

In [4]:
def GetMaliciousIDs(path="/content/pid_malicious.lst"):
    """
        Read malicious file and get malicious process ids
    """
    with open(path, "r") as f:
        malicious = f.readlines()

    for i in range(len(malicious)) :
        if(i!=len(malicious) - 1):
            malicious[i] = malicious[i][:-1]
        malicious[i] = int(malicious[i])

    return malicious

In [296]:
def ParseXMLData(maliciousIDs, XMLPath="/content/sysmon.xml"):
    """
        Read the XML file and create a Pandas Dataframe
        Additionally save it as a csv file for future use
    """
    tree = ET.parse(XMLPath)
    XMLNS = "{http://schemas.microsoft.com/win/2004/08/events/event}"
    root = tree.getroot();
    events = root.findall(XMLNS + "Event");
    sysmonData = []
    for event in events:
        data = {}
        data["Valid"] = 1

        systemData = event.find(XMLNS + "System")
        systemDataTags = [node.tag for node in systemData.iter()][1:]
        for tag in systemDataTags:
            node = systemData.find(tag)
            nodeName = tag[tag.find(XMLNS) + len(XMLNS) : ]
            if(node.text is not None):
                data[nodeName] = node.text
            if(len(node.attrib)!=0):
                data.update(node.attrib)

        eventData = event.find(XMLNS + "EventData")
        eventDataNodes = eventData.findall(XMLNS + "Data")
        for node in eventDataNodes:
            nodeName = node.attrib["Name"]
            if(nodeName=="RuleName"):
                continue
            elif(nodeName=="ProcessId"):
                data[nodeName] = int(node.text)
            else:
                data[nodeName] = node.text

        if("ProcessId" in data.keys() and data["ProcessId"] in maliciousIDs):
            data["Valid"] = 0

        sysmonData.append(data)

    sysmonDF = pd.DataFrame(sysmonData)
    sysmonDF.sort_values(by=["ProcessId", "UtcTime"], ascending=True, inplace=True)
    sysmonDF["UtcTime"] = pd.to_datetime(sysmonDF["UtcTime"])
    sysmonDF.dropna(subset="ProcessId", inplace=True)
    sysmonDF["ProcessId"] = sysmonDF["ProcessId"].astype('int64')
    sysmonDF.reset_index(inplace=True, drop=True)
    sysmonDF.to_csv("sysmon.csv", index=False)

In [297]:
maliciousIDs = GetMaliciousIDs()
XMLPath = "/content/sysmon.xml"
ParseXMLData(maliciousIDs=maliciousIDs, XMLPath=XMLPath)

In [298]:
sysmonDF = pd.read_csv("sysmon.csv")

In [320]:
# Remove processes with less than 5 events as they cannot be malicious
eventCount = sysmonDF["ProcessId"].value_counts()
dropIndex = eventCount[eventCount < 5].index.astype('int64')

sysmonDF = sysmonDF[~sysmonDF.ProcessId.isin(dropIndex)]
sysmonDF.reset_index(inplace=True)

In [321]:
# Discard columns that do not add value
usefulColumns = ["Valid", "ProcessId", "UtcTime", "Image", "ImageLoaded", "Description", "TargetFilename", "SourceIp", "DestinationIp", "CommandLine"]
sysmonDF = sysmonDF[usefulColumns]

In [322]:
def GenerateName(x) :
    """
        Extract the file name from the absolute address in the dataframe
    """
    if(pd.isnull(x)): return x
    else: 
        start = x.rfind('\\')
        if(start==-1): return x
        return x[x.rfind('\\') + 1 : ]

In [323]:
# Extract file names
sysmonDF["Image"] = sysmonDF["Image"].apply(GenerateName)
sysmonDF["ImageLoaded"] = sysmonDF["ImageLoaded"].apply(GenerateName)
sysmonDF["TargetFilename"] = sysmonDF["TargetFilename"].apply(GenerateName)
sysmonDF["CommandLine"] = sysmonDF["CommandLine"].apply(GenerateName)

In [324]:
class EncodeData:
    """
        Generate index-based encoding of the data
        This is done as the data is categorical
    """
    def __init__(self, df):
        self.df = df
        self.imageDict = self.GenerateImageEncoding()
        self.descDict = self.GenerateDescEncoding()
        self.fileDict = self.GenerateFileEncoding()
        self.commDict = self.GenerateCommandEncoding()

    def GenerateImageEncoding(self):
        imageNames = self.df["Image"].unique().tolist()
        loadedImageNames = self.df["ImageLoaded"].unique().tolist()
        imageNames = list(set(imageNames + loadedImageNames))
        
        encoding = {}
        for index, name in enumerate(imageNames):
            encoding[name] = index + 1

        return encoding

    def GenerateDescEncoding(self):
        descriptions = self.df["Description"].unique().tolist()

        encoding = {}
        for index, desc in enumerate(descriptions):
            encoding[desc] = index + 1

        return encoding

    def GenerateFileEncoding(self):
        files = self.df["TargetFilename"].unique().tolist()

        encoding = {}
        for index, _file in enumerate(files):
            encoding[_file] = index + 1

        return encoding

    def GenerateCommandEncoding(self):
        commands = self.df["CommandLine"].unique().tolist()

        encoding = {}
        for index, command in enumerate(commands):
            encoding[command] = index

        return encoding

    def EncodeImageName(self, x):
        if(pd.isnull(x)): return -1
        return self.imageDict[x]

    def EncodeDescription(self, x):
        if(pd.isnull(x)): return -1
        return self.descDict[x]

    def EncodeFileName(self, x):
        if(pd.isnull(x)): return -1
        return self.fileDict[x]

    def EncodeCommandLine(self, x):
        if(pd.isnull(x)): return -1
        return self.commDict[x]

In [325]:
# Generate encoding of the data
encoder = EncodeData(sysmonDF)

dataDF = pd.DataFrame()
dataDF["ProcessId"] = sysmonDF["ProcessId"]
dataDF["UtcTime"] = sysmonDF["UtcTime"]
dataDF["Image"] = sysmonDF["Image"].apply(encoder.EncodeImageName)
dataDF["ImageLoaded"] = sysmonDF["ImageLoaded"].apply(encoder.EncodeImageName)
dataDF["File"] = sysmonDF["TargetFilename"].apply(encoder.EncodeFileName)
dataDF["Description"] = sysmonDF["Description"].apply(encoder.EncodeDescription)
dataDF["CommandLine"] = sysmonDF["CommandLine"].apply(encoder.EncodeCommandLine)
dataDF["Valid"] = sysmonDF["Valid"]
dataDF.sort_values(by=["ProcessId", "UtcTime"], ascending=True, inplace=True)

In [326]:
dataDF.head()

Unnamed: 0,ProcessId,UtcTime,Image,ImageLoaded,File,Description,CommandLine,Valid
0,4,2020-05-12 14:53:20.370,198,-1,-1,-1,-1,1
1,4,2020-05-12 14:53:52.112,198,-1,-1,-1,-1,1
2,4,2020-05-12 14:56:49.452,198,-1,-1,-1,-1,1
3,4,2020-05-12 14:56:49.507,198,-1,-1,-1,-1,1
4,4,2020-05-12 14:56:49.668,198,-1,-1,-1,-1,1


In [329]:
def GenerateTrainTestSplit(df, windowSize=128, test_size=0.2):
    process_ids = df["ProcessId"].unique().tolist()
    X = np.zeros((len(process_ids), windowSize * 5))
    y = np.zeros(len(process_ids), dtype=np.int64)

    for index, process in enumerate(process_ids):
        processDF = df[df["ProcessId"]==process].reset_index(drop=True)
        processDF.head()
        numEvents = len(processDF)
        if(numEvents >= windowSize) :
            data = processDF.iloc[numEvents-windowSize:, 2:7].values
        else:
            data = processDF.iloc[:, 2:7].values
        data = data.reshape((1, len(data) * 5))

        X[index, :data.shape[1]] = data
        y[index] = processDF.Valid.iloc[0]
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, shuffle=True)

    # return np.array(X_train), np.asarray(X_test), np.asarray(y_train), np.asarray(y_test)
    return X_train, X_test, y_train, y_test

In [330]:
X_train, X_test, y_train, y_test = GenerateTrainTestSplit(dataDF, 128, 0.1)

In [331]:
classifier = XGBClassifier(objective="binary:logistic",
                           learning_rate=3e-4,
                           n_estimators=1000,
                           max_depth=8,
                           early_stopping_rounds=3
                           )

classifier.fit(X_train, y_train,
               eval_set=[(X_train, y_train), (X_test, y_test)],
               verbose=False)


# Performance on train data

## F1 score of 0.91 on train data

In [332]:
y_pred_train = classifier.predict(X_train)
confusion_matrix(y_train, y_pred_train)

array([[16,  1],
       [ 2, 53]])

# Performance on test data
## F1 score of 0.67 on test data

In [333]:
y_pred_test = classifier.predict(X_test)
confusion_matrix(y_test, y_pred_test)

array([[1, 1],
       [0, 6]])