In [5]:
import pandas as pd
import re
import lxml.etree as ET

# Load data
# Only these columns are needed to build the event log.
LOG_COLUMNS = ['release', 'timestamp', 'author', 'change_type']
df = pd.read_csv(
    'git_log_processed.csv',
    sep=',',
    # Read the textual columns as strings. Without this, pandas infers the
    # dtype, so a file whose releases all look numeric would turn "1.10" into
    # the float 1.1 and make the .str accessor below raise AttributeError.
    dtype={'release': str, 'author': str, 'change_type': str},
    parse_dates=['timestamp'],
)
# Keep the needed columns and strip leading/trailing whitespace from the
# release label. .assign() builds a new frame, so nothing is written to a
# slice of the raw data (no SettingWithCopyWarning). Missing releases stay NaN.
df = df[LOG_COLUMNS].assign(release=lambda frame: frame['release'].str.strip())

In [6]:
# Sanitize 'release' values so they contain only letters, digits and dots
def sanitize_release(value):
    """Return ``value`` with every unsafe character replaced by an underscore.

    A character is kept only if it is an ASCII letter, an ASCII digit or a
    dot. Each other character is swapped one-for-one for ``'_'``, so the
    result has the same length as the input.

    Args:
        value: The release label to clean; must be a ``str``.

    Returns:
        The cleaned release label.

    Raises:
        TypeError: If ``value`` is not a string (raised by ``re``).
    """
    unsafe_chars = re.compile(r'[^A-Za-z0-9\.]')
    return unsafe_chars.sub('_', value)

# Rows without a release, author or change type cannot become XES events.
# Drop them BEFORE sanitising: sanitize_release() calls re.sub(), which raises
# TypeError on the NaN pandas uses for a missing release.
df = df.dropna(subset=['release', 'author', 'change_type'])
# astype(str) is a no-op for text labels and guards against a release column
# that pandas parsed as numeric.
df = df.assign(release=df['release'].astype(str).map(sanitize_release))

# Parse timestamps as timezone-aware UTC; unparseable values become NaT and
# the corresponding rows are dropped.
df = df.assign(timestamp=pd.to_datetime(df['timestamp'], errors='coerce', utc=True))
df = df.dropna(subset=['timestamp'])
# Render as ISO 8601 with millisecond precision and a '+HH:MM' offset, matching
# the '1970-01-01T00:00:00.000+00:00' default declared for time:timestamp in
# this notebook. '%f' yields six digits, so the last three are cut off; the
# offset is always '+00:00' because utc=True converted every value to UTC.
# NOTE(review): OpenXES-based readers are believed to expect exactly three
# fractional digits - confirm against the consuming tool.
df = df.assign(
    timestamp=df['timestamp'].dt.strftime('%Y-%m-%dT%H:%M:%S.%f').str[:-3] + '+00:00'
)

# Take only the first 100 rows
df = df.head(100)

# XES namespace shared by every element of the log
xes_ns = "http://www.xes-standard.org/"


def _xes_tag(local_name):
    """Return ``local_name`` qualified with the XES namespace ('{uri}name')."""
    return "{%s}%s" % (xes_ns, local_name)


# Root <log> element; the XES namespace is registered as the default namespace
root = ET.Element(
    _xes_tag("log"),
    {
        "xes.version": "1.0",
        "xes.features": "nested-attributes",
        "openxes.version": "1.0RC7",
    },
    nsmap={None: xes_ns},
)

# Header comments, appended as the first children of the root element
comments = [
    "This file has been generated with the OpenXES library. It conforms",
    "to the XML serialization of the XES standard for log storage and",
    "management.",
    "XES standard version: 1.0",
    "OpenXES library version: 1.0RC7",
    "OpenXES is available from http://www.openxes.org/"
]
root.extend([ET.Comment(text) for text in comments])

# Extension declarations as (name, prefix, uri)
extensions = [
    ("Organizational", "org", "http://www.xes-standard.org/org.xesext"),
    ("Time", "time", "http://www.xes-standard.org/time.xesext"),
    ("Lifecycle", "lifecycle", "http://www.xes-standard.org/lifecycle.xesext"),
    ("Semantic", "semantic", "http://www.xes-standard.org/semantic.xesext"),
    ("Concept", "concept", "http://www.xes-standard.org/concept.xesext"),
]
for extension in extensions:
    ET.SubElement(
        root,
        _xes_tag("extension"),
        dict(zip(("name", "prefix", "uri"), extension)),
    )

# Global attribute declarations: default values for every trace ...
globals_trace = ET.SubElement(root, _xes_tag("global"), {"scope": "trace"})
ET.SubElement(
    globals_trace,
    _xes_tag("string"),
    {"key": "concept:name", "value": "__INVALID__"},
)
# ... and for every event, as (element type, key, default value)
globals_event = ET.SubElement(root, _xes_tag("global"), {"scope": "event"})
for global_type, global_key, global_default in (
    ("string", "concept:name", "__INVALID__"),
    ("date", "time:timestamp", "1970-01-01T00:00:00.000+00:00"),
    ("string", "lifecycle:transition", "__INVALID__"),
):
    ET.SubElement(
        globals_event,
        _xes_tag(global_type),
        {"key": global_key, "value": global_default},
    )

# Classifiers as (name, space-separated attribute keys)
for classifier_name, classifier_keys in (
    ("MXML Legacy Classifier", "concept:name lifecycle:transition"),
    ("Event Name", "concept:name"),
):
    ET.SubElement(
        root,
        _xes_tag("classifier"),
        {"name": classifier_name, "keys": classifier_keys},
    )

# Log-level string attributes as (key, value)
for log_key, log_value in (
    ("concept:name", "Generated Log"),
    ("lifecycle:model", "standard"),
    ("source", "DataFrame Export"),
):
    ET.SubElement(root, _xes_tag("string"), {"key": log_key, "value": log_value})

# Create traces: one <trace> per release value, one <event> per DataFrame row.
# NOTE(review): events are emitted in the DataFrame's row order, which is not
# necessarily chronological - confirm the input order suits the consumer.
trace_tag = "{%s}trace" % xes_ns
event_tag = "{%s}event" % xes_ns
string_tag = "{%s}string" % xes_ns
date_tag = "{%s}date" % xes_ns

for case_id, group in df.groupby("release"):
    trace = ET.SubElement(root, trace_tag)
    # The sanitized release value names the trace
    ET.SubElement(trace, string_tag, {"key": "concept:name", "value": str(case_id)})

    event_columns = zip(group["change_type"], group["timestamp"], group["author"])
    for change_type, timestamp, author in event_columns:
        event = ET.SubElement(trace, event_tag)
        # (element tag, attribute key, attribute value), in output order
        event_attributes = (
            (string_tag, "concept:name", change_type),
            (string_tag, "lifecycle:transition", "complete"),
            (date_tag, "time:timestamp", timestamp),
            (string_tag, "org:resource", author),
        )
        for attribute_tag, attribute_key, attribute_value in event_attributes:
            ET.SubElement(
                event,
                attribute_tag,
                {"key": attribute_key, "value": attribute_value},
            )

# Serialise the log to disk. The XML declaration is written by hand and lxml's
# own declaration is switched off, so the file contains exactly one.
output_path = "git_log_sanitized.xes"
tree = ET.ElementTree(root)
with open(output_path, "wb") as xes_file:
    xes_file.write(b'<?xml version="1.0" encoding="UTF-8" ?>\n')
    tree.write(
        xes_file,
        xml_declaration=False,
        pretty_print=True,
        encoding="utf-8",
    )

print(f"XES file written to {output_path}")

XES file written to git_log_sanitized.xes
