In [1]:
import json
from tld import get_tld

# Crawl result to analyse and the location of the analysis output.
# (Previously analysed input: "SortedOutput/timvlummens.github.io_4ab9.json")
# NOTE(review): outputPath still names the timvlummens.github.io site while
# filePath points at id90travel.com - confirm this is intended.
outputPath = "Output/timvlummens.github.io.json"
filePath = "SortedOutput/id90travel.com_f1a5.json"

In [2]:
# Load the data from the .json file.
# The context manager closes the file even when json.load raises
# (e.g. on malformed JSON), which the explicit open/close pair did not.
with open(filePath, encoding="utf-8") as f:
    allData = json.load(f)

In [3]:
# Accumulates the outcome of every analysis step, keyed by step name.
FinalResult = {}

# Groups of interchangeable event types. CheckEventListenersFromSet requires
# at least one event from every group to be present.
NECESSARY_EVENTS = [
    ["mousemove"],
    ["click", "mousedown"],
    ["keypress", "keydown", "input"],
    ["scroll", "wheel"],
]

# GetEmailFilled builds the filled e-mail address as PREFIX + <site> + SUFFIX.
EMAIL_PREFIX = "timthesis40+"
EMAIL_SUFFIX = "@gmail.com"

# Value looked up per field type; 'default' is the fallback used for any
# field type that is neither listed here nor "email".
FIELD_INPUTS = {
    'tel': "497413093",
    'organization': "Sample Company",
    'street-address': "Rue Jean Lorette 169",
    'address-line1': "number 89",
    'address-line2': "box 2",
    'address-line3': "Thuin",
    'address-level2': "Thuin",
    'address-level1': "Thuin",
    'postal-code': "6530",
    'country': "Belgium",
    'cc-name': "Harold Gonzalez",
    'name': "Harold",
    'given-name': "Harry",
    'additional-name': "Jerry",
    'family-name': "Gonzalez",
    'cc-number': "4929846523784508",
    'cc-exp-month': "02",
    'cc-exp-year': "25",
    'cc-exp': "02/25",
    'cc-type': "Mastercard",
    'username': "cosicadam",
    'password': "myPwd1111111111111=",
    'default': "Adam",
}

# Slack added to the fill stop time by the request checks and the API checks
# respectively (start/stop times are in milliseconds per the request checks).
STOP_TIME_WINDOW_REQUESTS = 8500
STOP_TIME_WINDOW_APIS = 500

# Paired by index in CheckRequestsMouse, which searches request bodies for both
# numbers of a pair - presumably the crawler's mouse-move coordinates (confirm).
MOVEWIDTHS = [290, 100, 250, 180, 320, 430]
MOVEHEIGTHS = [420, 310, 330, 230, 300, 130]

In [4]:
# Check if necessary events are present in a set
def CheckEventListenersFromSet(setOfListeners):
    """Return True when every group in NECESSARY_EVENTS is represented.

    A group counts as represented as soon as one of its event names is found
    in ``setOfListeners`` (any container supporting ``in``).
    """
    return all(
        any(eventName in setOfListeners for eventName in eventGroup)
        for eventGroup in NECESSARY_EVENTS
    )

In [5]:
# Check if necessary events are present in a domain or script
def CheckEventListenersFromDomain(domainInfo):
    """Evaluate the event listeners registered by the scripts of one domain.

    ``domainInfo`` maps a script url to the events that script listens to.

    Returns a ``(result, combinedEvents)`` tuple where ``result`` holds
    ``scriptsContainingAllEvents`` (urls whose own events satisfy
    CheckEventListenersFromSet) and ``domainContainsAllEvents``, and
    ``combinedEvents`` is the union of the events of all scripts.
    """
    eventsOfDomain = set()
    completeScripts = []

    # Check each url separately
    for scriptUrl, scriptEvents in domainInfo.items():
        eventsOfDomain.update(scriptEvents)
        if CheckEventListenersFromSet(scriptEvents):
            completeScripts.append(scriptUrl)

    # One complete script is enough; otherwise the scripts of the domain
    # together may still cover every necessary event.
    domainComplete = bool(completeScripts) or CheckEventListenersFromSet(eventsOfDomain)

    summary = {
        "scriptsContainingAllEvents": completeScripts,
        "domainContainsAllEvents": domainComplete,
    }
    return summary, eventsOfDomain

In [6]:
# Check if necessary events are present on a page
def CheckEventListeners(srsData):
    """Evaluate the event listeners of every domain found on a page.

    ``srsData`` maps a domain to its per-script listener information (the
    input of CheckEventListenersFromDomain).

    Returns a dict with ``Domains`` (only the domains that contain all
    necessary events, mapped to their domain result) and
    ``pageContainsAllEvents``.
    """
    completeDomains = {}
    eventsOfPage = set()

    for domainName, domainInfo in srsData.items():
        domainSummary, domainEvents = CheckEventListenersFromDomain(domainInfo)
        eventsOfPage.update(domainEvents)

        if domainSummary["domainContainsAllEvents"]:
            completeDomains[domainName] = domainSummary

    # One complete domain is enough; otherwise the page as a whole may still
    # cover every necessary event.
    pageComplete = bool(completeDomains) or CheckEventListenersFromSet(eventsOfPage)

    return {"Domains": completeDomains, "pageContainsAllEvents": pageComplete}

In [7]:
# Check if necessary events are present
# Runs the listener analysis on the "srs" section of the loaded crawl data
# (domain -> script url -> events) and stores the outcome in FinalResult.

srsData = allData["srs"]
result = CheckEventListeners(srsData)
print(result)
FinalResult["EventListenerResults"] = result


{'Domains': {'id90travel': {'scriptsContainingAllEvents': ['https://www.id90travel.com/polyfills.c946be37bc478fc7.js'], 'domainContainsAllEvents': True}}, 'pageContainsAllEvents': True}


In [8]:
def GetEmailFilled(filename):
    """Build the e-mail address that was filled for the site in ``filename``.

    Everything from the last "_" onwards is dropped, then everything up to and
    including the first "www." is dropped; what remains is wrapped in
    EMAIL_PREFIX and EMAIL_SUFFIX.
    """
    head, underscore, _tail = filename.rpartition("_")
    site = head if underscore else filename

    _before, marker, afterWww = site.partition("www.")
    if marker:
        site = afterWww

    return EMAIL_PREFIX + site + EMAIL_SUFFIX

In [21]:
# Get the number of characters filled by the crawler
def GetNumberOfFilledCharacters(listOfFilledTypes, filename):
    """Count the characters that were filled for the given field types.

    Returns ``[withPassword, withoutPassword]``: the first total covers every
    field, the second leaves out "password" fields. Field types that are not
    in FIELD_INPUTS and are not "email" count as the 'default' input.

    Prints the e-mail address for each "email" field and both totals.
    """
    withPassword = 0
    withoutPassword = 0

    for fieldType in listOfFilledTypes:
        if fieldType == "password":
            # Passwords only contribute to the first total.
            withPassword += len(FIELD_INPUTS[fieldType])
            continue

        if fieldType in FIELD_INPUTS:
            typedLength = len(FIELD_INPUTS[fieldType])
        elif fieldType == "email":
            email = GetEmailFilled(filename)
            typedLength = len(email)
            print(email)
        else:
            typedLength = len(FIELD_INPUTS['default'])

        withPassword += typedLength
        withoutPassword += typedLength

    print(withPassword)
    print(withoutPassword)
    return [withPassword, withoutPassword]

In [10]:
# Check the number of API accesses in a certain timeframe
def CheckAPIWithTimestamps(timestamps, filledCharacters, filledFields, startTime, stopTime, window):
    """Rate how well the API accesses inside the fill window match the typed input.

    Parameters:
        timestamps: timestamps of the API accesses of one script.
        filledCharacters: ``[total, totalWithoutPasswords]`` character counts.
        filledFields: number of filled fields, used as tolerance above a target.
        startTime, stopTime: bounds of the fill period (inclusive).
        window: slack added after ``stopTime``.

    Returns:
        3 if the number of accesses lies in ``[target, target + filledFields]``
          for one of the character counts,
        2 if it is at least one of the character counts,
        1 if there was at least one access,
        0 otherwise.
    """
    windowEnd = stopTime + window
    count = sum(1 for timestamp in timestamps if startTime <= timestamp <= windowEnd)

    # The count without passwords is 0 when only password fields were filled.
    # It must then be ignored in BOTH checks; previously only the score-2 check
    # guarded against it, so 0 or a handful of accesses were rated 3.
    targets = [filledCharacters[0]]
    if filledCharacters[1] > 0:
        targets.append(filledCharacters[1])

    if any(target <= count <= target + filledFields for target in targets):
        return 3

    if any(count >= target for target in targets):
        return 2

    if count > 0:
        return 1

    return 0

In [11]:
# Check the total number of API accesses, regardless of when they happened
def CheckAPIWithoutTimestamps(count, filledCharacters, filledFields):
    """Rate how well a total number of API accesses matches the typed input.

    Parameters:
        count: number of API accesses of one script.
        filledCharacters: ``[total, totalWithoutPasswords]`` character counts.
        filledFields: number of filled fields, used as tolerance above a target.

    Returns:
        3 if ``count`` lies in ``[target, target + filledFields]`` for one of
          the character counts,
        2 if it is at least one of the character counts,
        1 if there was at least one access,
        0 otherwise.
    """
    # The count without passwords is 0 when only password fields were filled.
    # It must then be ignored in BOTH checks; previously only the score-2 check
    # guarded against it, so 0 or a handful of accesses were rated 3.
    targets = [filledCharacters[0]]
    if filledCharacters[1] > 0:
        targets.append(filledCharacters[1])

    if any(target <= count <= target + filledFields for target in targets):
        return 3

    if any(count >= target for target in targets):
        return 2

    if count > 0:
        return 1

    return 0

In [12]:
# Check API accesses per script
def CheckAPI(apiData, filledData, filename):
    """Rate the API accesses of every script against the filled input.

    ``apiData`` maps domain -> script url -> ``{"timestamps": ..., "count": ...}``.
    ``filledData`` provides ``filledFields``, ``listOfFilledTypes``,
    ``startTime`` and ``stopTime``.

    Returns domain -> script url -> ``[timestampResult, noTimestampResult]``,
    or an empty dict when no field was filled.
    """
    filledFields = filledData["filledFields"]
    filledCharacters = GetNumberOfFilledCharacters(filledData["listOfFilledTypes"], filename)
    startTime = filledData["startTime"]
    stopTime = filledData["stopTime"]

    if filledFields == 0:
        return {}

    return {
        domain: {
            url: [
                CheckAPIWithTimestamps(urlData["timestamps"], filledCharacters, filledFields,
                                       startTime, stopTime, STOP_TIME_WINDOW_APIS),
                CheckAPIWithoutTimestamps(urlData["count"], filledCharacters, filledFields),
            ]
            for url, urlData in scripts.items()
        }
        for domain, scripts in apiData.items()
    }
    

In [22]:
# Check if API accesses were made for the input fields by scripts with required events

apiData = allData["apis"]
filledData = allData["filled"]

# Keep only the file name, whichever path separator was used. The earlier
# slice looked for "SortedOutput\\" and added a stray +1: it only worked for
# "/" paths because rfind's -1 cancelled the +1, and it would drop the first
# character of the name for "\\" paths.
filename = filePath.replace("\\", "/").rsplit("/", 1)[-1]

result = CheckAPI(apiData, filledData, filename)
print(result)
FinalResult["APIResults"] = result

timthesis40+id90travel.com@gmail.com
55
36
{'facebook': {'https://connect.facebook.net/signals/config/1672163416384929?v=2.9.102&r=stable': [1, 1]}, 'id90travel': {'https://www.id90travel.com/main.18fa24010628e45f.js': [3, 3]}, 'smartlook': {'https://web-sdk.smartlook.com/es6/bundle.92f91e7a4f112b8cd064.js': [2, 2]}}


In [14]:
def GetFilledFieldsInputs(listOfFilledTypes, filename):
    """Return the value that was filled for each field type, in order.

    Types listed in FIELD_INPUTS use their own value, "email" uses the address
    built by GetEmailFilled, anything else uses the 'default' value.
    """
    def filledValue(fieldType):
        if fieldType in FIELD_INPUTS:
            return FIELD_INPUTS[fieldType]
        if fieldType == "email":
            return GetEmailFilled(filename)
        return FIELD_INPUTS['default']

    return [filledValue(fieldType) for fieldType in listOfFilledTypes]

In [15]:
def CheckRequestsFields(requestData, filledData, filename):
    """Count requests sent during the fill period and detect leaked field inputs.

    ``requestData`` maps domain -> script url -> list of request dicts, each
    optionally carrying ``wallTime`` (seconds) and ``postData``.
    ``filledData`` provides ``listOfFilledTypes``, ``startTime`` and
    ``stopTime`` (milliseconds).

    Returns
        {domain: {"count", "inputsPresent", "urls": {url: {"count", "inputsPresent"}}}}
    where ``count`` is the number of requests inside
    ``[startTime, stopTime + STOP_TIME_WINDOW_REQUESTS]`` and ``inputsPresent``
    is the set of filled values found in any request body. Urls and domains
    with neither are left out.

    NOTE(review): ``inputsPresent`` is a set, which json.dump cannot serialise;
    convert it before writing FinalResult to disk, if that happens later.
    """
    filledInputs = GetFilledFieldsInputs(filledData["listOfFilledTypes"], filename)

    startTime = filledData["startTime"]
    stopTime = filledData["stopTime"] + STOP_TIME_WINDOW_REQUESTS

    result = {}

    for domain, urls in requestData.items():
        domainCount = 0
        domainInputs = set()
        urlResults = {}

        for url, requests in urls.items():
            urlCount = 0
            urlInputs = set()

            for request in requests:
                # wallTime is in seconds, start/stop time in milliseconds
                if "wallTime" in request:
                    timestamp = request["wallTime"] * 1000
                    # The upper bound was computed before but never applied, so
                    # requests long after the fill period were counted as well.
                    if startTime <= timestamp <= stopTime:
                        urlCount += 1

                # A missing or null body cannot contain any input
                postData = request.get("postData")
                if postData:
                    urlInputs.update(value for value in filledInputs if value in postData)

            if urlCount or urlInputs:
                urlResults[url] = {"count": urlCount, "inputsPresent": urlInputs}
                domainCount += urlCount
                domainInputs |= urlInputs

        if domainCount or domainInputs:
            result[domain] = {"count": domainCount, "inputsPresent": domainInputs, "urls": urlResults}

    return result


In [16]:
def CheckRequestsMouse(requestData, filledData):
    """Count requests sent during the interaction period and detect leaked coordinates.

    ``requestData`` maps domain -> script url -> list of request dicts, each
    optionally carrying ``wallTime`` (seconds) and ``postData``.
    ``filledData`` provides ``startTime`` and ``stopTime`` (milliseconds).

    Returns
        {domain: {"count", "coordinatesPresent", "urls": {url: {"count", "coordinatesPresent"}}}}
    where ``count`` is the number of requests inside
    ``[startTime, stopTime + STOP_TIME_WINDOW_REQUESTS]`` and
    ``coordinatesPresent`` is the number of (request, coordinate pair) matches:
    a pair matches when both its MOVEWIDTHS and MOVEHEIGTHS value occur as text
    in the request body. Urls and domains with neither are left out.
    """
    startTime = filledData["startTime"]
    stopTime = filledData["stopTime"] + STOP_TIME_WINDOW_REQUESTS

    # The searched texts do not depend on the request, so build them once
    coordinatePairs = [(str(width), str(height)) for width, height in zip(MOVEWIDTHS, MOVEHEIGTHS)]

    result = {}

    for domain, urls in requestData.items():
        domainCount = 0
        domainCoordinates = 0
        urlResults = {}

        for url, requests in urls.items():
            urlCount = 0
            urlCoordinates = 0

            for request in requests:
                # wallTime is in seconds, start/stop time in milliseconds
                if "wallTime" in request:
                    timestamp = request["wallTime"] * 1000
                    # The upper bound was computed before but never applied, so
                    # requests long after the interaction were counted as well.
                    if startTime <= timestamp <= stopTime:
                        urlCount += 1

                # A missing or null body cannot contain any coordinates
                postData = request.get("postData")
                if postData:
                    urlCoordinates += sum(
                        1 for width, height in coordinatePairs
                        if width in postData and height in postData
                    )

            if urlCount or urlCoordinates:
                urlResults[url] = {"count": urlCount, "coordinatesPresent": urlCoordinates}
                domainCount += urlCount
                domainCoordinates += urlCoordinates

        if domainCount or domainCoordinates:
            result[domain] = {"count": domainCount, "coordinatesPresent": domainCoordinates, "urls": urlResults}

    return result

In [17]:
def CheckRequests(requestData, filledData, filename):
    """Analyse the requests of a page.

    When at least one field was filled, the requests are searched for the
    filled inputs (CheckRequestsFields); otherwise they are searched for the
    mouse coordinates (CheckRequestsMouse).
    """
    if filledData["filledFields"] != 0:
        return CheckRequestsFields(requestData, filledData, filename)

    return CheckRequestsMouse(requestData, filledData)


In [18]:
# Check which requests were sent while the crawler interacted with the page
requestData = allData["requests"]
filledData = allData["filled"]

# Keep only the file name, whichever path separator was used. The earlier
# slice looked for "SortedOutput\\" and added a stray +1: it only worked for
# "/" paths because rfind's -1 cancelled the +1, and it would drop the first
# character of the name for "\\" paths.
filename = filePath.replace("\\", "/").rsplit("/", 1)[-1]

result = CheckRequests(requestData, filledData, filename)
print(result)
FinalResult["RequestResults"] = result

{'id90travel': {'count': 24, 'inputsPresent': set(), 'urls': {'https://www.id90travel.com/main.18fa24010628e45f.js': {'count': 10, 'inputsPresent': set()}, 'https://www.id90travel.com/polyfills.c946be37bc478fc7.js': {'count': 14, 'inputsPresent': set()}}}, 'instana': {'count': 14, 'inputsPresent': set(), 'urls': {'https://eum.instana.io/eum.min.js': {'count': 14, 'inputsPresent': set()}}}, 'smartlook': {'count': 15, 'inputsPresent': set(), 'urls': {'https://web-sdk.smartlook.com/es6/bundle.92f91e7a4f112b8cd064.js': {'count': 6, 'inputsPresent': set()}, 'https://web-sdk.smartlook.com/es6/init.832b29f132c1628fc0e1.js': {'count': 9, 'inputsPresent': set()}}}}
