In [25]:
import glob
import json
import os

from tld import get_tld

# Folder prefix that is concatenated with '*.json' to find the input files.
# NOTE(review): the trailing backslash is a Windows path separator, so this
# value only works as a directory prefix on Windows - confirm the target OS.
filePathFolder = "../data\\"
# Folder prefix that is prepended to each input file name to build the output path.
outputPath = "SortedOutput/"
# JSON file that receives the list of input files recorded as failed.
failedPath = "SortedOutput/failedSites.json"

In [26]:
# Sort the output from event listeners per domain
def sortEventListeners(allData):
    """Group non-"once" event listeners by domain and by script URL.

    The listener records are read from ``allData["data"]["srs"][0]``.
    A record is skipped when it has no "url" key, when its "once" flag is
    truthy, or when ``get_tld`` cannot parse its URL.

    Args:
        allData: Parsed JSON content of one input file.

    Returns:
        dict: ``{domain: {url: {listenerType, ...}}}``; the innermost value is
        a ``set`` of the listener types seen for that URL.

    Raises:
        KeyError, IndexError: if ``allData["data"]["srs"][0]`` is missing, or
            a kept record has no "listenerType" key.
    """
    # {key = domain, value = {key = url, value = {listenerTypes}}}
    sortedListeners = {}

    for listener in allData["data"]["srs"][0]:
        if "url" not in listener:
            continue

        # A record without a "once" key is treated like once == False instead
        # of aborting the whole file with a KeyError.
        if listener.get("once"):
            continue

        url = listener["url"]
        res = get_tld(url, as_object=True, fail_silently=True)
        if res is None:
            # URL could not be parsed into a domain
            continue

        # Read the type before touching the result so a malformed record
        # cannot leave an empty entry behind.
        listenerType = listener["listenerType"]
        sortedListeners.setdefault(res.domain, {}).setdefault(url, set()).add(listenerType)

    return sortedListeners

In [27]:
# Group the recorded requests by the domain of each initiator URL
def sortRequestsURL(allData):
    """Index every request under each of its initiators, grouped by domain.

    Walks ``allData["data"]["requests"]`` and, for every entry of a request's
    "initiators" list that ``get_tld`` can parse, appends the request to the
    list stored under that initiator's domain and URL. Initiators that cannot
    be parsed are ignored. A request with several initiators appears once per
    initiator.

    Returns:
        dict: ``{domain: {initiatorUrl: [request, ...]}}``
    """
    grouped = {}

    for req in allData["data"]["requests"]:
        for origin in req["initiators"]:
            parsed = get_tld(origin, as_object=True, fail_silently=True)
            if parsed is None:
                # initiator is not a parseable URL
                continue

            # Create the domain and URL buckets on first sight, then file the request.
            perDomain = grouped.setdefault(parsed.domain, {})
            perDomain.setdefault(origin, []).append(req)

    return grouped

In [28]:
# Group the recorded requests by their exact list of initiators
def sortRequests(allData):
    """Bucket requests that share the same ordered initiator list.

    Each request in ``allData["data"]["requests"]`` is filed under the tuple
    built from its "initiators" list, so two requests land in the same bucket
    only when their initiator lists are equal element by element.

    Returns:
        dict: ``{(initiator, ...): [request, ...]}`` with tuple keys.
    """
    grouped = {}

    for req in allData["data"]["requests"]:
        # Lists are unhashable, so the initiator chain is frozen into a tuple key.
        grouped.setdefault(tuple(req["initiators"]), []).append(req)

    return grouped

In [29]:
# Sort the output from apis per initiator domain
def sortApis(allData):
    """Count input-element API results per domain and per source URL.

    Reads ``allData["data"]["apis"]["inputElementResults"]``. A result is
    skipped when it has no "source" key or when ``get_tld`` cannot parse the
    source URL. Every kept result increments the counter for its source URL;
    its "timestamp" is recorded as well when the key is present.

    Only "inputElementResults" is read, so a missing "callStats" entry no
    longer makes the function fail.

    Args:
        allData: Parsed JSON content of one input file.

    Returns:
        dict: ``{domain: {url: {"count": int, "timestamps": [...]}}}``

    Raises:
        KeyError: if ``allData["data"]["apis"]["inputElementResults"]`` is
            missing.
    """
    # {key = domain, value = {key = url, value = {"count": n, "timestamps": [...]}}}
    sortedApis = {}

    for call in allData["data"]["apis"]["inputElementResults"]:
        if "source" not in call:
            continue

        url = call["source"]
        res = get_tld(url, as_object=True, fail_silently=True)
        if res is None:
            continue

        # Fetch the record for this URL, creating domain and URL entries on first use.
        entry = sortedApis.setdefault(res.domain, {}).setdefault(
            url, {"count": 0, "timestamps": []}
        )
        entry["count"] += 1
        if "timestamp" in call:
            entry["timestamps"].append(call["timestamp"])

    return sortedApis

In [30]:
def sortFilled(allData):
    """Label the positional fill record stored at ``allData["data"]["srs"][1]``.

    The first five positions of that record are returned under descriptive
    keys; any further positions are ignored.

    Args:
        allData: Parsed JSON content of one input file.

    Returns:
        dict: with the keys "filledFields", "startTime", "stopTime",
        "listOfFilledTypes" and "failedFields", taken from positions 0-4.

    Raises:
        KeyError, IndexError: if the record is missing or has fewer than five
            entries.
    """
    fillData = allData["data"]["srs"][1]
    return {
        "filledFields": fillData[0],
        "startTime": fillData[1],
        "stopTime": fillData[2],
        "listOfFilledTypes": fillData[3],
        "failedFields": fillData[4],
    }

In [31]:
# Input files that could not be processed are collected in this list.
failedSites = []

# All .json files directly inside the input folder (glob is non-recursive by default).
files = glob.glob(filePathFolder + "*.json")

In [32]:
# Make sure the output folder exists before the first file is written.
os.makedirs(outputPath, exist_ok=True)

for file in files:
    # Derive the name from the path itself instead of slicing after a
    # hard-coded "data\\" marker, which silently produced a wrong name
    # whenever the marker was not part of the path.
    fileName = os.path.basename(file)
    if fileName in ("metadata.json", "log.txt"):
        continue

    outputName = os.path.join(outputPath, fileName)

    try:
        # Load the data from the .json file; the context manager closes the
        # handle even when parsing fails.
        with open(file, encoding="utf-8") as f:
            allData = json.load(f)

        # Sort output
        finalDict = {
            "srs": sortEventListeners(allData),
            "requests": sortRequestsURL(allData),
            "apis": sortApis(allData),
            "filled": sortFilled(allData),
        }
    except (OSError, ValueError, KeyError, IndexError, TypeError) as err:
        # An unreadable, malformed or incomplete input file is recorded and
        # skipped so that one bad file does not stop the whole run.
        # ValueError also covers JSON and text decoding errors.
        print("Skipping", file, "-", repr(err))
        if file not in failedSites:
            # guard keeps the list free of duplicates when the cell is re-run
            failedSites.append(file)
        continue

    # Save output; default=tuple converts the sets of listener types into
    # JSON arrays.
    with open(outputName, 'w') as fp:
        json.dump(finalDict, fp, default=tuple, sort_keys=True, indent=4)

# An empty failedPath disables the failure report.
if failedPath:
    with open(failedPath, 'w') as fp:
        json.dump(failedSites, fp, default=tuple, sort_keys=True, indent=4)