In [26]:
import ast
import json

import numpy as np
import pandas as pd
from elasticsearch7 import Elasticsearch

In [27]:
# Client bound to the Elasticsearch node at localhost:9200; every query below goes through `es`.
es = Elasticsearch(hosts="http://localhost:9200")

In [28]:
# Query the "fluentd-k8s" index with "track_total_hits" enabled in the request body and
# read the reported total from the response.
# NOTE(review): per the Elasticsearch search API docs, this flag is what makes the total
# exact rather than a lower bound — confirm against the server version in use.
response = es.search(index="fluentd-k8s",body={"track_total_hits": True})
hit_count = response["hits"]["total"]["value"]

In [30]:
# Display the total number of hits reported by the query above.
hit_count

10000

In [21]:
# Fetch the documents in a single request, using the reported total as the page size.
# NOTE(review): Elasticsearch rejects `size` values above index.max_result_window
# (10,000 by default) — confirm hit_count stays within that limit, or page with
# scroll / search_after instead.
response = es.search(index="fluentd-k8s", size=hit_count)

In [22]:
# The list of hit objects returned by the search.
docs = response["hits"]["hits"]

In [23]:
# Display how many hits were actually returned.
len(docs)

8617

In [8]:
# Collect the raw "log" value of every hit into fields["log"].
docs = response["hits"]["hits"]

# Gather the values in a plain list and build the array once: np.append returns a
# new copy of the whole array on every call, which made the previous loop quadratic.
# NOTE(review): assumes each "log" value is a scalar (a string in the sample output).
log_lines = [doc["_source"]["log"] for doc in docs if "log" in doc["_source"]]

# Keep `fields` empty when no hit carries a "log" key, matching the earlier behaviour.
fields = {"log": np.array(log_lines)} if log_lines else {}

In [10]:
# Display the collected fields (a dict holding the array of raw "log" strings).
fields

{'log': array(['{"time:timestamp": "2023-04-23 15:18:53,689","case:concept:name":"0x9a6cd07b69bd1edbc51a6ccbf9bd24fb","concept:name": "Request User Address","message": "Request User Address","tilt": "{\'data_disclosed\': [\'user.id\'], \'purposes\': [\'personal data access\'], \'legal_bases\': [\'GDPR-6-1-b\']}"}\n',
        '{"time:timestamp": "2023-04-23 15:18:54,192","case:concept:name":"0x286109616e05cb5061ad956b3af40de9","concept:name": "Request User Address","message": "Request User Address","tilt": "{\'data_disclosed\': [\'user.id\'], \'purposes\': [\'personal data access\'], \'legal_bases\': [\'GDPR-6-1-b\']}"}\n',
        '{"time:timestamp": "2023-04-23 15:18:54,204","case:concept:name":"0x286109616e05cb5061ad956b3af40de9","concept:name": "Combine User Data","message": "Combine User Data","tilt": "{\'data_disclosed\': [\'user.firstname\', \'user.lastname\', \'user.birthday\', \'address.street\', \'address.postcode\', \'address.number\'], \'purposes\': [\'personal data access\'

In [11]:
# Wrap the collected fields in a DataFrame: one column per key in `fields`.
df : pd.DataFrame = pd.DataFrame(fields)

In [12]:
# Parse each raw "log" string as JSON and flatten the resulting dicts into columns.
log_df : pd.DataFrame = pd.json_normalize(df["log"].apply(json.loads))

In [13]:
# Display the parsed event log.
log_df

Unnamed: 0,time:timestamp,case:concept:name,concept:name,message,tilt
0,"2023-04-23 15:18:53,689",0x9a6cd07b69bd1edbc51a6ccbf9bd24fb,Request User Address,Request User Address,"{'data_disclosed': ['user.id'], 'purposes': ['..."
1,"2023-04-23 15:18:54,192",0x286109616e05cb5061ad956b3af40de9,Request User Address,Request User Address,"{'data_disclosed': ['user.id'], 'purposes': ['..."
2,"2023-04-23 15:18:54,204",0x286109616e05cb5061ad956b3af40de9,Combine User Data,Combine User Data,"{'data_disclosed': ['user.firstname', 'user.la..."
3,"2023-04-23 15:18:54,717",0xc3e9ce31b842c7ab78af52b8102e9c1d,Request User Address,Request User Address,"{'data_disclosed': ['user.id'], 'purposes': ['..."
4,"2023-04-23 15:18:54,731",0xc3e9ce31b842c7ab78af52b8102e9c1d,Combine User Data,Combine User Data,"{'data_disclosed': ['user.firstname', 'user.la..."
...,...,...,...,...,...
4167,"2023-04-23 15:21:30,983",0xddaf79a39d7992dfd98be676835e3c71,Send Newsletter,Send Newsletter,"{'data_disclosed': ['user.firstname', 'user.la..."
4168,"2023-04-23 15:21:32,098",0xb1ed0f651cb416815984484c0d16b09a,Send Newsletter,Send Newsletter,"{'data_disclosed': ['user.firstname', 'user.la..."
4169,"2023-04-23 15:21:35,286",0xa9f5b12f6de704e35262bf21d9265c41,Send Newsletter,Send Newsletter,"{'data_disclosed': ['user.firstname', 'user.la..."
4170,"2023-04-23 15:21:35,498",0xc360baaa8d64ef87be2dea15326e45de,Send Newsletter,Send Newsletter,"{'data_disclosed': ['user.firstname', 'user.la..."


In [14]:
# Expand the "tilt" column into one "tilt:"-prefixed column per key.
#
# The column holds the repr of a Python dict (single-quoted strings), not JSON.
# Parse it with ast.literal_eval rather than swapping quote characters and calling
# json.loads: the swap corrupts any value that itself contains ' or ".
tilt_df: pd.DataFrame = (
    pd.json_normalize(log_df["tilt"].apply(ast.literal_eval))
    .add_prefix("tilt:")
)

In [15]:
# Put the parsed log columns next to the flattened tilt columns; the raw "tilt"
# string column is redundant once expanded, so it is dropped.
combined_df = pd.concat([log_df, tilt_df], axis=1).drop(columns="tilt")

# CSV export happens before the OCEL helper columns are added.
combined_df.to_csv("tilt-enhanced-event-log.csv", sep=",")

# Name the index "ocel:eid" and add the timestamp/activity columns under their OCEL names.
# NOTE(review): the timestamp comma is replaced by a space, not a dot — confirm this
# is the format the downstream consumer expects.
combined_df = combined_df.rename_axis("ocel:eid").assign(**{
    "ocel:timestamp": combined_df["time:timestamp"].apply(
        lambda stamp: stamp.replace(",", " ")
    ),
    "ocel:activity": combined_df["concept:name"],
})

# reset_index turns "ocel:eid" into a regular field of every exported record.
combined_df.reset_index().to_json("tilt-enhanced-event-log.json", orient="records")

In [11]:
# Read the exported event records back in as plain Python objects.
with open("tilt-enhanced-event-log.json") as event_log_file:
    jo = json.loads(event_log_file.read())

In [12]:
# Skeleton of the OCEL document: empty event/object maps plus the global sections.
ocel = {
    "ocel:events": {},
    "ocel:objects": {},
    "ocel:global-log": {
        "ocel:attribute-names": ["label", "purpose", "basis"],
        "ocel:object-types": [
            "tilt:data_disclosed",
            "tilt:purposes",
            "tilt:legal_bases",
        ],
        "ocel:version": "1.0",
        "ocel:ordering": "timestamp",
    },
    "ocel:global-event": {"ocel:activity": "__INVALID__"},
    "ocel:global-object": {"ocel:type": "__INVALID__"},
}

# Each tilt column is an object type; the mapped value is the attribute name under
# which an object of that type stores its own identifier in "ocel:ovmap".
# Dict order fixes the order in which objects are appended to an event's omap.
object_attribute_by_type = {
    "tilt:data_disclosed": "label",
    "tilt:purposes": "purpose",
    "tilt:legal_bases": "basis",
}

for event in jo:
    # The event record itself is stored (and extended in place) under its "ocel:eid".
    event["ocel:vmap"] = {
        key: value for key, value in event.items() if key.startswith("tilt")
    }
    event["ocel:omap"] = []
    ocel["ocel:events"][event["ocel:eid"]] = event

    for object_type, attribute in object_attribute_by_type.items():
        # A missing or null list is treated as "no objects" instead of raising.
        for object_id in event.get(object_type) or []:
            event["ocel:omap"].append(object_id)
            # First occurrence wins: an identifier already registered (even under
            # another type) keeps its original entry.
            if object_id not in ocel["ocel:objects"]:
                ocel["ocel:objects"][object_id] = {
                    "ocel:type": object_type,
                    "ocel:ovmap": {attribute: object_id},
                }

In [13]:
# Serialise the assembled OCEL document and write it to disk.
with open('ocel.json', 'w') as ocel_file:
    ocel_file.write(json.dumps(ocel))