In [1]:
import json
import glob
from tld import get_tld

In [2]:
# Groups of DOM event names. CheckEventListenersFromSet requires at least one
# event from every group to be present.
NECESSARY_EVENTS = [
    ["mousemove"],
    ["click", "mousedown", "mouseup"],
    ["keypress", "keydown", "keyup", "input"],
    ["scroll", "wheel"],
]

# GetEmailFilled builds the address as EMAIL_PREFIX + <site label> + EMAIL_SUFFIX
EMAIL_PREFIX = "timthesis40+"
EMAIL_SUFFIX = "@gmail.com"

# Value associated with each field type; 'default' is the fallback used for
# field types that have no entry of their own (and are not "email").
FIELD_INPUTS = {
    'tel': "497413093",
    'organization': "Sample Company",
    'street-address': "Rue Jean Lorette 169",
    'address-line1': "number 89",
    'address-line2': "box 2",
    'address-line3': "Thuin",
    'address-level2': "Thuin",
    'address-level1': "Thuin",
    'postal-code': "6530",
    'country': "Belgium",
    'cc-name': "Harold Gonzalez",
    'name': "Harold",
    'given-name': "Harry",
    'additional-name': "Jerry",
    'family-name': "Gonzalez",
    'cc-number': "4929846523784508",
    'cc-exp-month': "02",
    'cc-exp-year': "25",
    'cc-exp': "02/25",
    'cc-type': "Mastercard",
    'username': "cosicadam",
    'password': "myPwd1111111111111=",
    'default': "Adam",
}

# Added to the fill stop time by the request checks and the API check respectively.
# The request checks compare against wallTime * 1000, i.e. milliseconds.
STOP_TIME_WINDOW_REQUESTS = 8500
STOP_TIME_WINDOW_APIS = 500

# Subtracted from the fill start time by the request checks
START_TIME_WINDOW_REQUESTS = 0

# Paired by index in CheckRequestsMouse, which searches request bodies for both
# numbers of a pair (presumably the crawler's mouse-move coordinates - confirm).
MOVEWIDTHS = [290, 100, 250, 180, 320, 430]
MOVEHEIGTHS = [420, 310, 330, 230, 300, 130]

In [3]:
def sortFilled(allData):
    """Label the form-filling record of a crawl result.

    The record is the second entry of ``allData["data"]["srs"]``; its first five
    positions are returned under descriptive keys.

    Returns:
        dict with the keys "filledFields", "startTime", "stopTime",
        "listOfFilledTypes" and "failedFields", in that order.

    Raises:
        KeyError / IndexError: if the expected structure is missing or the
        record has fewer than five positions.
    """
    fillRecord = allData["data"]["srs"][1]
    labels = ("filledFields", "startTime", "stopTime", "listOfFilledTypes", "failedFields")
    return {label: fillRecord[position] for position, label in enumerate(labels)}

In [4]:
def GetEmailFilled(filename):
    """Build the e-mail address associated with a site name.

    The site label is derived from ``filename`` in two steps: everything from
    the last underscore onwards is dropped (if there is an underscore), then
    everything up to and including the first "www." is dropped (if present).
    The label is wrapped in EMAIL_PREFIX and EMAIL_SUFFIX.

    Args:
        filename: site or file name string.

    Returns:
        The resulting e-mail address string.
    """
    # rpartition splits at the LAST underscore; an empty separator means none was found
    beforeUnderscore, underscore, _ = filename.rpartition("_")
    siteLabel = beforeUnderscore if underscore else filename

    # partition splits at the FIRST "www."; keep only what follows it
    _, wwwMarker, afterWww = siteLabel.partition("www.")
    if wwwMarker:
        siteLabel = afterWww

    return "".join((EMAIL_PREFIX, siteLabel, EMAIL_SUFFIX))

# Get the number of characters filled by the crawler
def GetNumberOfFilledCharacters(listOfFilledTypes, filename):
    """Sum the lengths of the values that correspond to the filled field types.

    Per field type the value is looked up in FIELD_INPUTS; "email" uses the
    address from GetEmailFilled(filename); anything else uses the 'default'
    entry of FIELD_INPUTS.

    Args:
        listOfFilledTypes: iterable of field type names.
        filename: site name forwarded to GetEmailFilled.

    Returns:
        A two-element list ``[total, totalWithoutPasswords]``: the first sum
        covers every field, the second leaves out "password" fields.
    """
    totalAll = 0
    totalWithoutPasswords = 0

    for fieldType in listOfFilledTypes:
        isPassword = fieldType == "password"

        if isPassword or fieldType in FIELD_INPUTS:
            typedLength = len(FIELD_INPUTS[fieldType])
        elif fieldType == "email":
            typedLength = len(GetEmailFilled(filename))
        else:
            typedLength = len(FIELD_INPUTS['default'])

        totalAll += typedLength
        # Passwords only count towards the first total
        if not isPassword:
            totalWithoutPasswords += typedLength

    return [totalAll, totalWithoutPasswords]

def GetFilledFieldsInputs(listOfFilledTypes, filename):
    """Return the value that corresponds to each filled field type.

    Args:
        listOfFilledTypes: iterable of field type names.
        filename: site name forwarded to GetEmailFilled for "email" fields.

    Returns:
        A list with one value per entry of ``listOfFilledTypes``, in the same
        order: the FIELD_INPUTS entry when one exists, the generated e-mail
        address for "email", and the 'default' entry otherwise.
    """
    def valueFor(fieldType):
        if fieldType in FIELD_INPUTS:
            return FIELD_INPUTS[fieldType]
        if fieldType == "email":
            return GetEmailFilled(filename)
        return FIELD_INPUTS['default']

    return [valueFor(fieldType) for fieldType in listOfFilledTypes]

In [5]:
# Sort the output from event listeners per domain
def sortEventListeners(allData):
    """Group listener types by script URL, and script URLs by domain.

    Iterates over the first entry of ``allData["data"]["srs"]``. Entries are
    skipped when they have no "url", when get_tld cannot parse the URL, or
    when their "once" value is truthy.

    Returns:
        ``{domain: {url: {listenerType, ...}}}`` where ``domain`` is the
        ``domain`` attribute of the get_tld result for the URL.
    """
    listenersByDomain = {}

    for entry in allData["data"]["srs"][0]:
        if "url" not in entry:
            continue

        scriptUrl = entry["url"]
        parsed = get_tld(scriptUrl, as_object=True, fail_silently=True)

        # Ignore unparsable URLs and listeners registered with once
        if parsed is None or entry["once"]:
            continue

        listenerType = entry["listenerType"]
        # Create the domain and url levels on first sight, then record the type
        urlsOfDomain = listenersByDomain.setdefault(parsed.domain, {})
        urlsOfDomain.setdefault(scriptUrl, set()).add(listenerType)

    return listenersByDomain

In [6]:
# Check if necessary events are present in a set
def CheckEventListenersFromSet(setOfListeners):
    """Tell whether every group of NECESSARY_EVENTS is represented.

    Args:
        setOfListeners: collection of listener type names.

    Returns:
        True when, for each group in NECESSARY_EVENTS, at least one of the
        group's event names is in ``setOfListeners``; False otherwise.
    """
    return all(
        any(eventName in setOfListeners for eventName in eventGroup)
        for eventGroup in NECESSARY_EVENTS
    )


# Check if necessary events are present in a domain or script
def CheckEventListenersFromDomain(domainInfo):
    """Evaluate the listeners of one domain, per script and combined.

    Args:
        domainInfo: mapping of script URL to its collection of listener types.

    Returns:
        A tuple ``(summary, domainEvents)``. ``summary`` holds
        "scriptsContainingAllEvents" (URLs whose own listeners pass
        CheckEventListenersFromSet) and "domainContainsAllEvents" (True when
        at least one such URL exists, otherwise the result of checking the
        union of all listeners of the domain). ``domainEvents`` is that union.
    """
    domainEvents = set()
    completeScripts = []

    # Evaluate each script URL separately while pooling the domain's events
    for scriptUrl in domainInfo:
        scriptEvents = domainInfo[scriptUrl]
        domainEvents.update(scriptEvents)
        if CheckEventListenersFromSet(scriptEvents):
            completeScripts.append(scriptUrl)

    # A single complete script suffices; otherwise fall back to the pooled events
    domainComplete = bool(completeScripts) or CheckEventListenersFromSet(domainEvents)

    summary = {"scriptsContainingAllEvents": completeScripts, "domainContainsAllEvents": domainComplete}
    return summary, domainEvents


# Check if necessary events are present on a page
def CheckEventListeners(srsData):
    """Evaluate the listeners of a whole page, per domain and combined.

    Args:
        srsData: mapping ``{domain: {url: listenerTypes}}``.

    Returns:
        A dict with "Domains" (the CheckEventListenersFromDomain summary of
        every domain whose "domainContainsAllEvents" is True) and
        "pageContainsAllEvents" (True when at least one such domain exists,
        otherwise the result of checking the union of all listeners on the page).
    """
    completeDomains = {}
    pageEvents = set()

    for domain in srsData:
        domainSummary, domainEvents = CheckEventListenersFromDomain(srsData[domain])
        pageEvents.update(domainEvents)

        # Only domains that contain all events are reported
        if domainSummary["domainContainsAllEvents"]:
            completeDomains[domain] = domainSummary

    # A single complete domain suffices; otherwise fall back to the pooled events
    pageComplete = bool(completeDomains) or CheckEventListenersFromSet(pageEvents)

    return {"Domains": completeDomains, "pageContainsAllEvents": pageComplete}

In [7]:
# Sort the output from apis per initiator domain
def sortApis(allData):
    """Count the recorded input-element API calls per source script.

    Iterates over ``allData["data"]["apis"]["inputElementResults"]``. Calls
    without a "source", or whose source get_tld cannot parse, are skipped.

    Returns:
        ``{domain: {url: {"count": int, "timestamps": list}}}`` where "count"
        is the number of calls from that source URL and "timestamps" collects
        the "timestamp" of every call that has one.
    """
    apiData = allData["data"]["apis"]
    # Not used below; the lookup still raises KeyError when "callStats" is absent
    callStats = apiData["callStats"]

    apisByDomain = {}

    for call in apiData["inputElementResults"]:
        if "source" not in call:
            continue

        sourceUrl = call["source"]
        parsed = get_tld(sourceUrl, as_object=True, fail_silently=True)
        if parsed is None:
            continue

        # Create the domain and url levels on first sight
        urlsOfDomain = apisByDomain.setdefault(parsed.domain, {})
        stats = urlsOfDomain.setdefault(sourceUrl, {"count": 0, "timestamps": []})

        stats["count"] += 1
        if "timestamp" in call:
            stats["timestamps"].append(call["timestamp"])

    return apisByDomain

In [8]:
# Check the number of API accesses in a certain timeframe
def CheckAPIWithTimestamps(timestamps, filledCharacters, filledFields, startTime, stopTime, window):
    """Rate how closely the API accesses inside the fill window match the typed text.

    Only timestamps in ``[startTime, stopTime + window]`` (inclusive) are counted.

    Args:
        timestamps: iterable of access timestamps.
        filledCharacters: two-element sequence of character totals; CheckAPI
            passes the result of GetNumberOfFilledCharacters (total including
            passwords, total excluding passwords).
        filledFields: number of filled fields, used as tolerance above a total.
        startTime: start of the window.
        stopTime: end of the filling; ``window`` is added to it.
        window: extra time allowed after ``stopTime``.

    Returns:
        3 if the count lies in ``[total, total + filledFields]`` for one of the totals,
        2 if the count is at least one of the totals,
        1 if there was at least one access,
        0 otherwise.

        The second total is only considered when it is greater than zero.
        Otherwise (e.g. only password fields were filled) a count of zero or
        just a few accesses would fall inside ``[0, filledFields]`` and be
        rated 3.
    """
    windowEnd = stopTime + window
    count = sum(1 for timestamp in timestamps if startTime <= timestamp <= windowEnd)

    # Totals the count is compared against; a zero second total carries no information
    totals = [filledCharacters[0]]
    if filledCharacters[1] > 0:
        totals.append(filledCharacters[1])

    if any(total <= count <= total + filledFields for total in totals):
        return 3

    if any(count >= total for total in totals):
        return 2

    if count > 0:
        return 1

    return 0


# Check the total number of API accesses, regardless of when they happened
def CheckAPIWithoutTimestamps(count, filledCharacters, filledFields):
    """Rate how closely a number of API accesses matches the typed text.

    Args:
        count: number of API accesses.
        filledCharacters: two-element sequence of character totals; CheckAPI
            passes the result of GetNumberOfFilledCharacters (total including
            passwords, total excluding passwords).
        filledFields: number of filled fields, used as tolerance above a total.

    Returns:
        3 if ``count`` lies in ``[total, total + filledFields]`` for one of the totals,
        2 if ``count`` is at least one of the totals,
        1 if ``count`` is positive,
        0 otherwise.

        The second total is only considered when it is greater than zero.
        Otherwise (e.g. only password fields were filled) a count of zero or
        just a few accesses would fall inside ``[0, filledFields]`` and be
        rated 3.
    """
    # Totals the count is compared against; a zero second total carries no information
    totals = [filledCharacters[0]]
    if filledCharacters[1] > 0:
        totals.append(filledCharacters[1])

    if any(total <= count <= total + filledFields for total in totals):
        return 3

    if any(count >= total for total in totals):
        return 2

    if count > 0:
        return 1

    return 0


# Check API accesses per script
def CheckAPI(apiData, filledData, filename):
    """Rate the API accesses of every script against the filled form data.

    Args:
        apiData: ``{domain: {url: {"count": ..., "timestamps": ...}}}``.
        filledData: dict with "filledFields", "listOfFilledTypes", "startTime"
            and "stopTime".
        filename: site name forwarded to GetNumberOfFilledCharacters.

    Returns:
        ``{domain: {url: [timestampResult, noTimestampResult]}}`` with the
        results of CheckAPIWithTimestamps (using STOP_TIME_WINDOW_APIS) and
        CheckAPIWithoutTimestamps; an empty dict when no fields were filled.
    """
    fieldCount = filledData["filledFields"]
    characterTotals = GetNumberOfFilledCharacters(filledData["listOfFilledTypes"], filename)
    fillStart = filledData["startTime"]
    fillStop = filledData["stopTime"]

    # Nothing was typed, so there is nothing to compare the accesses with
    if fieldCount == 0:
        return {}

    def rate(urlData):
        withTimestamps = CheckAPIWithTimestamps(
            urlData["timestamps"], characterTotals, fieldCount, fillStart, fillStop, STOP_TIME_WINDOW_APIS
        )
        withoutTimestamps = CheckAPIWithoutTimestamps(urlData["count"], characterTotals, fieldCount)
        return [withTimestamps, withoutTimestamps]

    return {
        domain: {url: rate(apiData[domain][url]) for url in apiData[domain]}
        for domain in apiData
    }

In [9]:
# Sort the output from requests per initiator domain
def sortRequests(allData, siteDomain):
    """Group the recorded requests by their initiators, dropping first-party ones.

    A request is dropped when it has a "url" that get_tld can parse and whose
    domain equals ``siteDomain``. All other requests are kept.

    Args:
        allData: crawl result; requests are read from ``allData["data"]["requests"]``.
        siteDomain: domain of the visited site (may be None).

    Returns:
        ``{tuple(initiators): [request, ...]}`` with requests in input order.
    """
    requestsByInitiators = {}

    for request in allData["data"]["requests"]:
        # Eliminate first party requests
        if "url" in request:
            parsedUrl = get_tld(request["url"], as_object=True, fail_silently=True)
            isFirstParty = parsedUrl is not None and parsedUrl.domain == siteDomain
            if isFirstParty:
                continue

        # Disabled filter, kept for reference:
        # # Eliminate all requests apart from POST and WebsocketFrame
        # if "method" not in request or request["method"] != "POST":
        #     if "type" not in request or request["type"] != "WebsocketFrame":
        #         continue

        # Lists are not hashable, so the initiators become a tuple to serve as key
        groupKey = tuple(request["initiators"])
        requestsByInitiators.setdefault(groupKey, []).append(request)

    return requestsByInitiators

In [10]:
def CheckRequestsFields(requestData, filledData, filename):
    """Summarise, per initiator group, the requests sent around the form filling.

    A request is counted when it has a "wallTime" that falls inside
    ``[startTime - START_TIME_WINDOW_REQUESTS, stopTime + STOP_TIME_WINDOW_REQUESTS]``.
    The upper bound is enforced here; previously it was computed but never
    applied, so every request after the start was counted.

    Independently of the time window, every filled value that occurs in a
    request's "postData" is recorded.

    Args:
        requestData: ``{initiators tuple: [request, ...]}``.
        filledData: dict with "listOfFilledTypes", "startTime" and "stopTime".
        filename: site name forwarded to GetFilledFieldsInputs.

    Returns:
        A list of dicts with "count" (requests inside the window),
        "inputsPresent" (set of filled values found in request bodies),
        "urls" (the initiators) and "domains" (set of initiator domains).
        Groups with a zero count or without any parsable initiator domain
        are left out.
    """
    filledInputs = GetFilledFieldsInputs(filledData["listOfFilledTypes"], filename)

    windowStart = filledData["startTime"] - START_TIME_WINDOW_REQUESTS
    windowEnd = filledData["stopTime"] + STOP_TIME_WINDOW_REQUESTS

    # [{count, inputsPresent, initiators, initiator domains}, ]
    result = []

    for initiators in requestData:
        summary = {"count": 0, "inputsPresent": set(), "urls": initiators, "domains": set()}

        for request in requestData[initiators]:
            # wallTime is in seconds, the fill start/stop times in milliseconds
            if "wallTime" in request:
                timestamp = request["wallTime"] * 1000
                if windowStart <= timestamp <= windowEnd:
                    summary["count"] += 1

            # Check if the request body contains any of the filled values
            if "postData" in request:
                postData = request["postData"]
                summary["inputsPresent"].update(value for value in filledInputs if value in postData)

        for initiator in initiators:
            parsed = get_tld(initiator, as_object=True, fail_silently=True)
            if parsed is not None:
                summary["domains"].add(parsed.domain)

        if summary["count"] != 0 and summary["domains"]:
            result.append(summary)

    return result


def CheckRequestsMouse(requestData, filledData):
    """Summarise, per initiator group, the requests sent when no fields were filled.

    A request is counted when it has a "wallTime" that falls inside
    ``[startTime - START_TIME_WINDOW_REQUESTS, stopTime + STOP_TIME_WINDOW_REQUESTS]``.
    The upper bound is enforced here; previously it was computed but never
    applied, so every request after the start was counted.

    Independently of the time window, "coordinatesPresent" is incremented for
    every (request, index) combination where both ``str(MOVEWIDTHS[i])`` and
    ``str(MOVEHEIGTHS[i])`` occur in the request's "postData". This is a plain
    substring test, so the numbers may match anywhere in the body.

    Args:
        requestData: ``{initiators tuple: [request, ...]}``.
        filledData: dict with "startTime" and "stopTime".

    Returns:
        A list of dicts with "count", "coordinatesPresent", "urls" (the
        initiators) and "domains" (set of initiator domains). Groups with a
        zero count or without any parsable initiator domain are left out.
    """
    windowStart = filledData["startTime"] - START_TIME_WINDOW_REQUESTS
    windowEnd = filledData["stopTime"] + STOP_TIME_WINDOW_REQUESTS

    # Width/height pairs as the strings searched for in request bodies
    coordinatePairs = [(str(width), str(height)) for width, height in zip(MOVEWIDTHS, MOVEHEIGTHS)]

    # [{count, coordinatesPresent, initiators, initiator domains}, ]
    result = []

    for initiators in requestData:
        summary = {"count": 0, "coordinatesPresent": 0, "urls": initiators, "domains": set()}

        for request in requestData[initiators]:
            # wallTime is in seconds, the fill start/stop times in milliseconds
            if "wallTime" in request:
                timestamp = request["wallTime"] * 1000
                if windowStart <= timestamp <= windowEnd:
                    summary["count"] += 1

            # Check if the request body contains both numbers of a pair
            if "postData" in request:
                postData = request["postData"]
                summary["coordinatesPresent"] += sum(
                    1 for width, height in coordinatePairs if width in postData and height in postData
                )

        for initiator in initiators:
            parsed = get_tld(initiator, as_object=True, fail_silently=True)
            if parsed is not None:
                summary["domains"].add(parsed.domain)

        if summary["count"] != 0 and summary["domains"]:
            result.append(summary)

    return result


def CheckRequests(requestData, filledData, filename):
    """Run the request check that fits the crawl.

    When at least one field was filled (``filledData["filledFields"] != 0``)
    the requests are checked with CheckRequestsFields; otherwise with
    CheckRequestsMouse.

    Returns:
        The list returned by the selected check.
    """
    if filledData["filledFields"] != 0:
        return CheckRequestsFields(requestData, filledData, filename)

    return CheckRequestsMouse(requestData, filledData)

In [11]:
# Input directory with the crawl results, output directory for the sorted
# results, and the file that lists the inputs that could not be processed.
# filePath ends with a backslash: the processing loop finds the file name by
# searching for a backslash in each path.

# Alternative configurations:
# filePath = "../data\\"
# outputPath = "SortedOutput2/"
# failedPath = "SortedOutput2/failedSites.json"

# filePath = "../../Thesis-srs-crawl-results/data\\"
# outputPath = "../../Thesis-srs-crawl-results/SortedOutput_POST/"
# failedPath = "../../Thesis-srs-crawl-results/SortedOutput_POST/failedSites.json"

filePath = "../../Thesis-srs-crawl-results/data\\"
outputPath = "../../Thesis-srs-crawl-results/SortedOutput/"
failedPath = outputPath + "failedSites.json"

# Every .json file directly inside the input directory (no recursion)
files = glob.glob(filePath + "*.json")

In [12]:
# Process every crawl result: sort its data, run the three checks and write the
# outcome to <outputPath>/<siteName>.json. Inputs that fail for any reason are
# collected and written to failedPath at the end.
output = {}
failedSites = []

httpsPrefix = "https://"

for file in files:
    try:
        # Skip the crawler's bookkeeping files
        if file == filePath + "metadata.json" or file == filePath + "log.txt":
            continue

        # Site name: drop everything from the last underscore onwards, then
        # everything up to and including the first backslash
        siteName = file[0:file.rfind("_")]
        siteName = siteName[siteName.find("\\")+1:]

        outputName = outputPath + siteName + ".json"

        # Load the data from the .json file; the context manager closes the
        # file even when parsing fails
        with open(file, encoding="utf-8") as f:
            allData = json.load(f)

        # Domain of the visited site, used to recognise first-party requests;
        # stays None when the site name cannot be parsed
        siteDomain = None
        res = get_tld(httpsPrefix + siteName, as_object=True, fail_silently=True)
        if res is not None:
            siteDomain = res.domain

        result = {}
        apiData = sortApis(allData)
        filledData = sortFilled(allData)
        requestData = sortRequests(allData, siteDomain)
        srsData = sortEventListeners(allData)

        # Check output
        result["EventListenerResults"] = CheckEventListeners(srsData)
        result["APIResults"] = CheckAPI(apiData, filledData, siteName)
        result["RequestResults"] = CheckRequests(requestData, filledData, siteName)

        # Save output; default=tuple serialises the sets in the results as arrays
        with open(outputName, 'w') as fp:
            json.dump(result, fp, default=tuple, sort_keys=True, indent=4)
    except Exception:
        # Best effort: a failing file is only recorded. Exception (rather than
        # a bare except) lets KeyboardInterrupt and SystemExit stop the run.
        failedSites.append(file)

if len(failedSites) != 0:
    with open(failedPath, 'w') as fp:
        json.dump(failedSites, fp, default=tuple, sort_keys=True, indent=4)