Main source of the dataset files: https://github.com/splunk/attack_data/tree/master/datasets/attack_techniques/T1558.003

In [12]:
pip install python-evtx

Collecting python-evtx
  Downloading python_evtx-0.7.4-py3-none-any.whl.metadata (1.1 kB)
Collecting hexdump==3.3 (from python-evtx)
  Downloading hexdump-3.3.zip (12 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting more-itertools==5.0.0 (from python-evtx)
  Downloading more_itertools-5.0.0-py3-none-any.whl.metadata (33 kB)
Collecting zipp==1.0.0 (from python-evtx)
  Downloading zipp-1.0.0-py2.py3-none-any.whl.metadata (1.7 kB)
Collecting configparser==4.0.2 (from python-evtx)
  Downloading configparser-4.0.2-py2.py3-none-any.whl.metadata (10 kB)
Collecting pyparsing==2.4.7 (from python-evtx)
  Downloading pyparsing-2.4.7-py2.py3-none-any.whl.metadata (3.6 kB)
Downloading python_evtx-0.7.4-py3-none-any.whl (35 kB)
Downloading configparser-4.0.2-py2.py3-none-any.whl (22 kB)
Downloading more_itertools-5.0.0-py3-none-any.whl (52 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.3/52.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyp

In [4]:
# Library Imports
import pandas as pd
from Evtx.Evtx import Evtx
import xml.etree.ElementTree as ET

In [6]:
file_name = 'windows-xml.log'  # This is only one of the files loaded for inspection

# The file is line-delimited XML (one <Event> element per line), not CSV.
# Read the lines verbatim: a CSV parser would treat any comma or double quote
# inside an event as a field separator / quoting character and corrupt the entry.
with open(file_name, encoding="utf-8") as log_file:
    log_lines = [line.rstrip("\r\n") for line in log_file if line.strip()]

# Single-column frame, one raw XML event per row
data = pd.DataFrame({"log_entry": log_lines})
data.head()

Unnamed: 0,log_entry
0,<Event xmlns='http://schemas.microsoft.com/win...
1,<Event xmlns='http://schemas.microsoft.com/win...
2,<Event xmlns='http://schemas.microsoft.com/win...
3,<Event xmlns='http://schemas.microsoft.com/win...
4,<Event xmlns='http://schemas.microsoft.com/win...


The uploaded file 'windows-xml.log' contains log entries that record an unusual number of Kerberos service ticket requests.

In [10]:
# Parse each XML log entry
def parse_event(xml_event):
    """Extract the fields of interest from one Windows event XML string.

    Parameters
    ----------
    xml_event : str
        A single serialized ``<Event>`` element.

    Returns
    -------
    dict
        Keys ``EventID``, ``TimeCreated``, ``Computer``, ``TargetUserName``,
        ``TargetDomainName``, ``ServiceName``, ``IpAddress`` and ``IpPort``.
        A field whose element is absent from the event is returned as ``None``
        instead of raising ``AttributeError``.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If ``xml_event`` is not well-formed XML.
    """
    event = ET.fromstring(xml_event)

    # Drop the namespace from every tag after parsing. This works regardless of
    # how the xmlns attribute is quoted in the input (single or double quotes),
    # which a literal string replacement on the raw text does not.
    for element in event.iter():
        if isinstance(element.tag, str) and element.tag.startswith('{'):
            element.tag = element.tag.split('}', 1)[1]

    def data_text(name):
        # Text of <Data Name="..."> or None when the element is missing
        node = event.find(f".//Data[@Name='{name}']")
        return node.text if node is not None else None

    time_node = event.find('.//TimeCreated')

    # Return a dictionary of the extracted data
    return {
        'EventID': event.findtext('.//EventID'),
        'TimeCreated': time_node.attrib.get('SystemTime') if time_node is not None else None,
        'Computer': event.findtext('.//Computer'),
        'TargetUserName': data_text('TargetUserName'),
        'TargetDomainName': data_text('TargetDomainName'),
        'ServiceName': data_text('ServiceName'),
        'IpAddress': data_text('IpAddress'),
        'IpPort': data_text('IpPort')
    }

# Run parse_event over every raw log line, collecting one dict per event
parsed_logs = list(map(parse_event, data["log_entry"]))

# Turn the list of per-event dicts into a table (one row per event)
df = pd.DataFrame(parsed_logs)

# Show the resulting table
df

Unnamed: 0,EventID,TimeCreated,Computer,TargetUserName,TargetDomainName,ServiceName,IpAddress,IpPort
0,4769,2024-03-04T06:53:49.273165500Z,ar-win-dc.attackrange.local,AR-WIN-2$@ATTACKRANGE.LOCAL,ATTACKRANGE.LOCAL,krbtgt,::ffff:10.0.1.15,55795
1,4769,2024-03-04T04:51:59.167662600Z,ar-win-dc.attackrange.local,AR-WIN-DC$@ATTACKRANGE.LOCAL,ATTACKRANGE.LOCAL,krbtgt1,::1,0
2,4769,2024-03-04T04:25:06.207288400Z,ar-win-dc.attackrange.local,AR-WIN-DC$@ATTACKRANGE.LOCAL,ATTACKRANGE.LOCAL,krbtgt2,::1,0
3,4769,2024-03-04T04:24:07.805715800Z,ar-win-dc.attackrange.local,AR-WIN-2$@ATTACKRANGE.LOCAL,ATTACKRANGE.LOCAL,krbtgt3,::ffff:10.0.1.15,55224
4,4769,2024-03-03T21:22:44.188691400Z,ar-win-dc.attackrange.local,AR-WIN-2$@ATTACKRANGE.LOCAL,ATTACKRANGE.LOCAL,krbtgt4,::ffff:10.0.1.15,53609
...,...,...,...,...,...,...,...,...
154,4769,2024-02-29T14:52:58.473931900Z,ar-win-dc.attackrange.local,AR-WIN-DC$@ATTACKRANGE.LOCAL,ATTACKRANGE.LOCAL,krbtgt,::1,0
155,4769,2024-02-29T05:23:58.356720200Z,ar-win-dc.attackrange.local,AR-WIN-2$@ATTACKRANGE.LOCAL,ATTACKRANGE.LOCAL,krbtgt,::ffff:10.0.1.15,49749
156,4769,2024-02-29T05:23:56.911963600Z,ar-win-dc.attackrange.local,AR-WIN-2$@ATTACKRANGE.LOCAL,ATTACKRANGE.LOCAL,krbtgt,::ffff:10.0.1.15,49741
157,4769,2024-02-29T05:22:45.114780000Z,ar-win-dc.attackrange.local,AR-WIN-DC$@ATTACKRANGE.LOCAL,ATTACKRANGE.LOCAL,krbtgt,::1,0


In Windows security logs, event **ID 4769** is generated every time the Key Distribution Center (KDC) receives a Kerberos Ticket Granting Service (TGS) ticket request.

The next file was obtained from: https://github.com/ysnakst/Dataset-for-Kerberoasting/tree/main

In [7]:

file_path = 'All4769Events.evtx'

# Container for parsed event information
events = []

# Print the raw XML of the first five records to inspect the structure.
# Pairing the records with range(1, 6) caps the output at five entries and
# stops before a sixth record is pulled from the log.
with Evtx(file_path) as log:
    for record_number, record in zip(range(1, 6), log.records()):
        print(f"Record {record_number}:\n{record.xml()}\n")

Record 1:
<Event xmlns="http://schemas.microsoft.com/win/2004/08/events/event"><System><Provider Name="Microsoft-Windows-Security-Auditing" Guid="{54849625-5478-4994-a5ba-3e3b0328c30d}"></Provider>
<EventID Qualifiers="">4769</EventID>
<Version>0</Version>
<Level>0</Level>
<Task>14337</Task>
<Opcode>0</Opcode>
<Keywords>0x8020000000000000</Keywords>
<TimeCreated SystemTime="2024-03-19 23:08:07.978025"></TimeCreated>
<EventRecordID>8652</EventRecordID>
<Correlation ActivityID="" RelatedActivityID=""></Correlation>
<Execution ProcessID="656" ThreadID="3868"></Execution>
<Channel>Security</Channel>
<Computer>DC.cseclab.test</Computer>
<Security UserID=""></Security>
</System>
<EventData><Data Name="TargetUserName">DC$@CSECLAB.TEST</Data>
<Data Name="TargetDomainName">CSECLAB.TEST</Data>
<Data Name="ServiceName">DC$</Data>
<Data Name="ServiceSid">S-1-5-21-876779242-1562946419-1879789669-1000</Data>
<Data Name="TicketOptions">0x40810000</Data>
<Data Name="TicketEncryptionType">0x00000012</D

In [10]:
# Namespace of the event XML; element lookups below are qualified with it
namespace = {'ns': 'http://schemas.microsoft.com/win/2004/08/events/event'}


def _extract_event_fields(xml_entry):
    """Return the fields of interest from one record's XML as a dict.

    Each element is looked up once; a field whose element is absent from the
    record is returned as ``None``.

    Parameters
    ----------
    xml_entry : str
        XML text of a single record, as returned by ``record.xml()``.

    Returns
    -------
    dict
        Keys ``EventID``, ``Timestamp``, ``TargetUserName``, ``ServiceName``
        and ``IpAddress``.
    """
    event = ET.fromstring(xml_entry)

    def text_of(path):
        # Single lookup per field: element text, or None if the element is missing
        node = event.find(path, namespace)
        return node.text if node is not None else None

    time_node = event.find('.//ns:TimeCreated', namespace)

    return {
        "EventID": text_of('.//ns:EventID'),
        "Timestamp": time_node.attrib.get('SystemTime') if time_node is not None else None,
        "TargetUserName": text_of(".//ns:Data[@Name='TargetUserName']"),
        "ServiceName": text_of(".//ns:Data[@Name='ServiceName']"),
        "IpAddress": text_of(".//ns:Data[@Name='IpAddress']"),
    }


# Read the .evtx file and parse every record into a dict
with Evtx(file_path) as log:
    events = [_extract_event_fields(record.xml()) for record in log.records()]

# Create a DataFrame from the events list
df1 = pd.DataFrame(events)

# Display the DataFrame
df1

Unnamed: 0,EventID,Timestamp,TargetUserName,ServiceName,IpAddress
0,4769,2024-03-19 23:08:07.978025,DC$@CSECLAB.TEST,DC$,::1
1,4769,2024-03-19 23:08:09.023266,DC$@CSECLAB.TEST,DC$,::1
2,4769,2024-03-19 23:08:09.023649,DC$@CSECLAB.TEST,krbtgt,::1
3,4769,2024-03-19 23:08:09.274817,DC$@CSECLAB.TEST,DC$,::1
4,4769,2024-03-19 23:08:09.414671,DC$@CSECLAB.TEST,DC$,::1
...,...,...,...,...,...
5979,4769,2024-04-09 21:37:50.402313,DC$@CSECLAB.TEST,DC$,::1
5980,4769,2024-04-09 21:37:50.436304,DC$@CSECLAB.TEST,DC$,::1
5981,4769,2024-04-09 21:41:34.573837,DC$@CSECLAB.TEST,DC$,::1
5982,4769,2024-04-09 21:46:45.458099,DC$@CSECLAB.TEST,DC$,::1
