In [12]:
import glob
import json
import os

from tld import get_tld

In [13]:
# Groups of interchangeable event names. CheckEventListenersFromSet requires
# at least one name from every group to be present.
NECESSARY_EVENTS = [
    ["mousemove"],
    ["click", "mousedown"],
    ["keypress", "keydown", "input"],
    ["scroll", "wheel"],
]

# GetEmailFilled builds an address as EMAIL_PREFIX + <site> + EMAIL_SUFFIX.
EMAIL_PREFIX = "timthesis40+"
EMAIL_SUFFIX = "@gmail.com"

# Text associated with each field type. 'default' is the fallback for any
# field type without its own entry ("email" is handled by GetEmailFilled).
FIELD_INPUTS = {
    'tel': "497413093",
    'organization': "Sample Company",
    'street-address': "Rue Jean Lorette 169",
    'address-line1': "number 89",
    'address-line2': "box 2",
    'address-line3': "Thuin",
    'address-level2': "Thuin",
    'address-level1': "Thuin",
    'postal-code': "6530",
    'country': "Belgium",
    'cc-name': "Harold Gonzalez",
    'name': "Harold",
    'given-name': "Harry",
    'additional-name': "Jerry",
    'family-name': "Gonzalez",
    'cc-number': "4929846523784508",
    'cc-exp-month': "02",
    'cc-exp-year': "25",
    'cc-exp': "02/25",
    'cc-type': "Mastercard",
    'username': "cosicadam",
    'password': "myPwd1111111111111=",
    'default': "Adam",
}

# Slack added to the recorded stop time (same unit as the start/stop times)
# when looking at requests and at API accesses respectively.
STOP_TIME_WINDOW_REQUESTS = 8500
STOP_TIME_WINDOW_APIS = 500

# Paired by index: (MOVEWIDTHS[i], MOVEHEIGTHS[i]) is one coordinate pair that
# CheckRequestsMouse searches for in request bodies.
MOVEWIDTHS = [290, 100, 250, 180, 320, 430]
MOVEHEIGTHS = [420, 310, 330, 230, 300, 130]

In [14]:
def GetEmailFilled(filename):
    """Build the e-mail address that belongs to a crawl file / site name.

    Everything from the last "_" onwards is dropped, then everything up to and
    including the first "www." is dropped. What remains is wrapped in
    EMAIL_PREFIX and EMAIL_SUFFIX.
    """
    underscoreAt = filename.rfind("_")
    site = filename if underscoreAt == -1 else filename[:underscoreAt]

    wwwAt = site.find("www.")
    if wwwAt >= 0:
        site = site[wwwAt + len("www."):]

    return EMAIL_PREFIX + site + EMAIL_SUFFIX

# Get the number of characters filled by the crawler
def GetNumberOfFilledCharacters(listOfFilledTypes, filename):
    """Sum the lengths of the texts associated with the filled field types.

    Returns a two-element list: [total over all fields, total over all
    fields except those of type "password"].
    """
    total = 0
    totalWithoutPasswords = 0

    for fieldType in listOfFilledTypes:
        if fieldType in FIELD_INPUTS:
            typedLength = len(FIELD_INPUTS[fieldType])
        elif fieldType == "email":
            typedLength = len(GetEmailFilled(filename))
        else:
            # Unknown field types fall back to the default text
            typedLength = len(FIELD_INPUTS['default'])

        total += typedLength
        if fieldType != "password":
            totalWithoutPasswords += typedLength

    return [total, totalWithoutPasswords]

def GetFilledFieldsInputs(listOfFilledTypes, filename):
    """Return, in order, the text associated with each filled field type.

    Known types map to their FIELD_INPUTS entry, "email" maps to the address
    built by GetEmailFilled, and anything else maps to FIELD_INPUTS['default'].
    """
    return [
        FIELD_INPUTS[fieldType] if fieldType in FIELD_INPUTS
        else GetEmailFilled(filename) if fieldType == "email"
        else FIELD_INPUTS['default']
        for fieldType in listOfFilledTypes
    ]

In [15]:
# Check if necessary events are present in a set
def CheckEventListenersFromSet(setOfListeners):
    """Return True when every group in NECESSARY_EVENTS is represented.

    A group counts as represented as soon as one of its event names is found
    in setOfListeners.
    """
    return all(
        any(eventName in setOfListeners for eventName in alternatives)
        for alternatives in NECESSARY_EVENTS
    )


# Check if necessary events are present in a domain or script
def CheckEventListenersFromDomain(domainInfo):
    """Check the scripts of one domain for the necessary event listeners.

    domainInfo maps a script url to the events registered by that script.

    Returns a tuple (summary, allEvents):
      * summary["scriptsContainingAllEvents"] lists the urls that satisfy
        CheckEventListenersFromSet on their own;
      * summary["domainContainsAllEvents"] is True when at least one such url
        exists, or otherwise when the union of all events satisfies the check;
      * allEvents is that union, as a set.
    """
    allEvents = set()
    completeScripts = []

    # Look at every script on its own while collecting the domain-wide union
    for scriptUrl, scriptEvents in domainInfo.items():
        allEvents.update(scriptEvents)
        if CheckEventListenersFromSet(scriptEvents):
            completeScripts.append(scriptUrl)

    # Only fall back to the combined events when no single script suffices
    domainComplete = bool(completeScripts) or CheckEventListenersFromSet(allEvents)

    summary = {
        "scriptsContainingAllEvents": completeScripts,
        "domainContainsAllEvents": domainComplete,
    }
    return summary, allEvents


# Check if necessary events are present on a page
def CheckEventListeners(srsData):
    """Check every domain of a page for the necessary event listeners.

    srsData maps a domain to the structure expected by
    CheckEventListenersFromDomain.

    Returns a dict with:
      * "Domains": the per-domain summaries, only for domains that contain all
        necessary events;
      * "pageContainsAllEvents": True when such a domain exists, or otherwise
        when the events of all domains combined satisfy the check.
    """
    pageEvents = set()
    completeDomains = {}

    for domainName, domainInfo in srsData.items():
        domainSummary, domainEvents = CheckEventListenersFromDomain(domainInfo)
        pageEvents |= domainEvents

        if domainSummary["domainContainsAllEvents"]:
            completeDomains[domainName] = domainSummary

    if completeDomains:
        pageComplete = True
    else:
        # No single domain suffices, so look at the page as a whole
        pageComplete = CheckEventListenersFromSet(pageEvents)

    return {"Domains": completeDomains, "pageContainsAllEvents": pageComplete}

In [16]:
# Check the number of API accesses in a certain timeframe
def CheckAPIWithTimestamps(timestamps, filledCharacters, filledFields, startTime, stopTime, window):
    """Score the API accesses that fall inside [startTime, stopTime + window].

    Parameters:
        timestamps: iterable of access times, in the same unit as startTime.
        filledCharacters: two-element sequence [total, total without
            password fields] of filled character counts.
        filledFields: number of filled fields, used as the tolerance on top
            of a character count.
        startTime, stopTime: bounds of the fill interval.
        window: slack added to stopTime.

    Returns:
        3 if the number of accesses lies in [n, n + filledFields] for one of
          the two character counts n,
        2 if it is at least one of the two character counts,
        1 if there was at least one access,
        0 otherwise.

    The count without passwords is only considered when it is greater than 0.
    Otherwise, when only password fields were filled, zero accesses would
    already "match" a target of 0 and yield the highest score.
    """
    newStopTime = stopTime + window
    count = sum(1 for timestamp in timestamps if startTime <= timestamp <= newStopTime)

    allChars = filledCharacters[0]
    nonPasswordChars = filledCharacters[1]

    matchesAll = allChars <= count <= allChars + filledFields
    matchesNonPassword = nonPasswordChars > 0 and nonPasswordChars <= count <= nonPasswordChars + filledFields
    if matchesAll or matchesNonPassword:
        return 3

    if count >= allChars or (nonPasswordChars > 0 and count >= nonPasswordChars):
        return 2

    if count > 0:
        return 1

    return 0


# Check the total number of API accesses, without restricting them to a timeframe
def CheckAPIWithoutTimestamps(count, filledCharacters, filledFields):
    """Score a total number of API accesses against the filled character counts.

    Parameters:
        count: number of API accesses.
        filledCharacters: two-element sequence [total, total without
            password fields] of filled character counts.
        filledFields: number of filled fields, used as the tolerance on top
            of a character count.

    Returns:
        3 if count lies in [n, n + filledFields] for one of the two character
          counts n,
        2 if count is at least one of the two character counts,
        1 if count is greater than 0,
        0 otherwise.

    The count without passwords is only considered when it is greater than 0.
    Otherwise, when only password fields were filled, zero accesses would
    already "match" a target of 0 and yield the highest score.
    """
    allChars = filledCharacters[0]
    nonPasswordChars = filledCharacters[1]

    matchesAll = allChars <= count <= allChars + filledFields
    matchesNonPassword = nonPasswordChars > 0 and nonPasswordChars <= count <= nonPasswordChars + filledFields
    if matchesAll or matchesNonPassword:
        return 3

    if count >= allChars or (nonPasswordChars > 0 and count >= nonPasswordChars):
        return 2

    if count > 0:
        return 1

    return 0


# Check API accesses per script
def CheckAPI(apiData, filledData, filename):
    """Score the API accesses of every script against what was filled in.

    apiData maps domain -> url -> {"timestamps": [...], "count": n}.
    filledData provides "filledFields", "listOfFilledTypes", "startTime" and
    "stopTime".

    Returns {domain: {url: [scoreWithTimestamps, scoreWithoutTimestamps]}},
    or an empty dict when no fields were filled.
    """
    fieldCount = filledData["filledFields"]
    charCounts = GetNumberOfFilledCharacters(filledData["listOfFilledTypes"], filename)
    fillStart = filledData["startTime"]
    fillStop = filledData["stopTime"]

    # Nothing was typed, so there is nothing to compare the accesses with
    if fieldCount == 0:
        return {}

    return {
        domain: {
            url: [
                CheckAPIWithTimestamps(urlData["timestamps"], charCounts, fieldCount, fillStart, fillStop, STOP_TIME_WINDOW_APIS),
                CheckAPIWithoutTimestamps(urlData["count"], charCounts, fieldCount),
            ]
            for url, urlData in scripts.items()
        }
        for domain, scripts in apiData.items()
    }

In [17]:
def CheckRequestsFields(requestData, filledData, filename):
    """Find requests sent around the form filling and requests leaking inputs.

    requestData maps domain -> url -> list of request dicts. A request may
    carry "wallTime" (seconds) and "postData" (searched as text).
    filledData provides "listOfFilledTypes", "startTime" and "stopTime"
    (milliseconds).

    Returns
        {domain: {"count": n, "inputsPresent": set,
                  "urls": {url: {"count": n, "inputsPresent": set}}}}
    where "count" is the number of requests whose wallTime lies between
    startTime and stopTime + STOP_TIME_WINDOW_REQUESTS, and "inputsPresent"
    holds every filled value found in a request's postData (regardless of
    the request's time). Urls and domains with neither are left out.

    The upper time bound is enforced here: the stop time was computed before
    but never compared against, so any later request was counted as well.
    """
    filledInputs = GetFilledFieldsInputs(filledData["listOfFilledTypes"], filename)

    startTime = filledData["startTime"]
    stopTime = filledData["stopTime"] + STOP_TIME_WINDOW_REQUESTS

    result = {}

    for domain, urls in requestData.items():
        domainEntry = {"count": 0, "inputsPresent": set(), "urls": {}}

        for url, requests in urls.items():
            urlEntry = {"count": 0, "inputsPresent": set()}

            for request in requests:
                # wallTime is in seconds, start/stop time in milliseconds
                if "wallTime" in request and startTime <= request["wallTime"] * 1000 <= stopTime:
                    urlEntry["count"] += 1

                # Check if the request body contains any of the filled values
                if "postData" in request:
                    postData = request["postData"]
                    urlEntry["inputsPresent"].update(
                        value for value in filledInputs if value in postData
                    )

            # Keep only urls that have something to report
            if urlEntry["count"] or urlEntry["inputsPresent"]:
                domainEntry["urls"][url] = urlEntry
                domainEntry["count"] += urlEntry["count"]
                domainEntry["inputsPresent"] |= urlEntry["inputsPresent"]

        if domainEntry["count"] or domainEntry["inputsPresent"]:
            result[domain] = domainEntry

    return result


def CheckRequestsMouse(requestData, filledData):
    """Find requests sent around the mouse movements and requests leaking them.

    requestData maps domain -> url -> list of request dicts. A request may
    carry "wallTime" (seconds) and "postData" (searched as text).
    filledData provides "startTime" and "stopTime" (milliseconds).

    Returns
        {domain: {"count": n, "coordinatesPresent": m,
                  "urls": {url: {"count": n, "coordinatesPresent": m}}}}
    where "count" is the number of requests whose wallTime lies between
    startTime and stopTime + STOP_TIME_WINDOW_REQUESTS, and
    "coordinatesPresent" is the number of (MOVEWIDTHS[i], MOVEHEIGTHS[i])
    pairs whose two numbers both occur in a request's postData (regardless
    of the request's time).

    Two things differ from the earlier behaviour:
      * the upper time bound is enforced (the stop time was computed but
        never compared against);
      * urls and domains are dropped only when they have neither a count nor
        coordinates, matching CheckRequestsFields. Before, an entry that
        contained coordinates but had a count of 0 was discarded.
    """
    startTime = filledData["startTime"]
    stopTime = filledData["stopTime"] + STOP_TIME_WINDOW_REQUESTS

    # Coordinates are compared as text because postData is searched as a string
    coordinatePairs = [(str(width), str(height)) for width, height in zip(MOVEWIDTHS, MOVEHEIGTHS)]

    result = {}

    for domain, urls in requestData.items():
        domainEntry = {"count": 0, "coordinatesPresent": 0, "urls": {}}

        for url, requests in urls.items():
            urlEntry = {"count": 0, "coordinatesPresent": 0}

            for request in requests:
                # wallTime is in seconds, start/stop time in milliseconds
                if "wallTime" in request and startTime <= request["wallTime"] * 1000 <= stopTime:
                    urlEntry["count"] += 1

                # Check if the request body contains any of the coordinate pairs
                if "postData" in request:
                    postData = request["postData"]
                    urlEntry["coordinatesPresent"] += sum(
                        1 for width, height in coordinatePairs
                        if width in postData and height in postData
                    )

            # Keep only urls that have something to report
            if urlEntry["count"] or urlEntry["coordinatesPresent"]:
                domainEntry["urls"][url] = urlEntry
                domainEntry["count"] += urlEntry["count"]
                domainEntry["coordinatesPresent"] += urlEntry["coordinatesPresent"]

        if domainEntry["count"] or domainEntry["coordinatesPresent"]:
            result[domain] = domainEntry

    return result


def CheckRequests(requestData, filledData, filename):
    """Analyse the requests of a crawl.

    When no fields were filled the requests are checked for mouse
    coordinates, otherwise for the filled field values.
    """
    if filledData["filledFields"] == 0:
        return CheckRequestsMouse(requestData, filledData)

    return CheckRequestsFields(requestData, filledData, filename)

In [18]:
# Directory that is searched for crawl result .json files. Joining with an
# empty last component appends the platform's own separator, instead of a
# hard-coded backslash that only works on Windows.
filePath = os.path.join("SortedOutput", "")
outputPath = "Output/crawlResults.json"
failedPath = "Output/failedSites.json"

# glob returns files in arbitrary order; sort so every run processes them
# in the same order.
files = sorted(glob.glob(filePath + '*.json', recursive = False))

In [19]:
output = {}
failedSites = []

for file in files:
    # The site name is the file's base name up to its last underscore
    # (or without its extension when there is no underscore at all).
    baseName = os.path.basename(file)
    if "_" in baseName:
        siteName = baseName[:baseName.rfind("_")]
    else:
        siteName = os.path.splitext(baseName)[0]

    try:
        # Load the data from the .json file
        with open(file, encoding="utf-8") as f:
            allData = json.load(f)

        apiData = allData["apis"]
        filledData = allData["filled"]
        requestData = allData["requests"]
        srsData = allData["srs"]

        # Check output
        result = {}
        result["EventListenerResults"] = CheckEventListeners(srsData)
        result["APIResults"] = CheckAPI(apiData, filledData, siteName)
        result["RequestResults"] = CheckRequests(requestData, filledData, siteName)
    except (OSError, ValueError, KeyError) as error:
        # Unreadable file, invalid JSON (a ValueError) or missing key: record
        # the file and continue, so one bad file does not discard the results
        # of all the others.
        print("Failed to process " + file + ": " + repr(error))
        failedSites.append(file)
        continue

    output[siteName] = result

# Save output
outputDir = os.path.dirname(outputPath)
if outputDir:
    os.makedirs(outputDir, exist_ok=True)

with open(outputPath, 'w') as fp:
    # The results contain sets of strings; serialise them as sorted lists so
    # the written file is identical from run to run.
    json.dump(output, fp, default=sorted, sort_keys=True, indent=4)

if failedSites:
    failedDir = os.path.dirname(failedPath)
    if failedDir:
        os.makedirs(failedDir, exist_ok=True)

    with open(failedPath, 'w') as fp:
        json.dump(failedSites, fp, default=tuple, sort_keys=True, indent=4)