In [1]:
import json
import csv
import glob
from tld import get_tld
from tld import get_fld

In [2]:
# Fixed leading and trailing parts of an e-mail address. Within this notebook only
# EMAIL_SUFFIX is consulted (to recognise an e-mail value among detected inputs).
EMAIL_PREFIX = "timthesis40+"
EMAIL_SUFFIX = "@gmail.com"

# Value associated with each field name (the keys look like HTML autocomplete
# tokens -- NOTE(review): confirm against the crawler that produced the data).
OG_FIELD_INPUTS = {
    'tel': "497413093",
    'organization': "Sample Company",
    'street-address': "Rue Jean Lorette 169",
    'address-line1': "number 89",
    'address-line2': "box 2",
    'address-line3': "Thuin",
    'address-level2': "Thuin",
    'address-level1': "Thuin",
    'postal-code': "6530",
    'country': "Belgium",
    'cc-name': "Harold Gonzalez",
    'name': "Harold",
    'given-name': "Harry",
    'additional-name': "Jerry",
    'family-name': "Gonzalez",
    'cc-number': "4929846523784508",
    'cc-exp-month': "02",
    'cc-exp-year': "25",
    'cc-exp': "02/25",
    'cc-type': "Mastercard",
    'username': "cosicadam",
    'password': "myPwd1111111111111=",
    'default': "Adam",
}

# Fields of OG_FIELD_INPUTS that are left out of FIELD_INPUTS_NO_NUMBERS.
_FIELDS_LEFT_OUT = ('postal-code', 'cc-exp-month', 'cc-exp-year', 'cc-exp')

# Same mapping as OG_FIELD_INPUTS, in the same order, minus the fields listed above.
FIELD_INPUTS_NO_NUMBERS = {
    field: value
    for field, value in OG_FIELD_INPUTS.items()
    if field not in _FIELDS_LEFT_OUT
}

# Value -> field lookup. Several address fields share the value "Thuin"; the last one
# in declaration order ('address-level1') is the one that ends up in the lookup.
FIELD_INPUTS_NO_NUMBERS_INVERSE = {
    value: field for field, value in FIELD_INPUTS_NO_NUMBERS.items()
}

# Initiators whose "count" is below this value are skipped by StatusRequests.
REQUEST_TRESHHOLD = 1

In [3]:
# Human-readable label for each numeric result code.
# NOTE(review): "compbination" in label 1 looks like a typo for "combination"; it is
# left untouched here because the label is a runtime string that may be matched elsewhere.
OUTPUT_MAPPING = {
    0: "Not present",
    1: "No single compbination",
    2: "Combined presence, no input",
    3: "Combined presence, input",
    4: "Single domain, no input",
    5: "Single domain, input",
}

In [4]:
def StatusEventListeners(eventListenersResult):
    """Grade the event-listener findings of one page.

    Args:
        eventListenersResult: mapping with the keys "pageContainsAllEvents" (truthy
            or falsy) and "Domains" (mapping of domain -> mapping that holds a
            "scriptsContainingAllEvents" list).

    Returns:
        A tuple ``(status, domains, scripts)`` where status is
          0 -- "pageContainsAllEvents" is falsy,
          1 -- the page has all events but "Domains" is empty,
          2 -- domains are listed but none of them lists a script containing all events,
          3 -- at least one script containing all events was listed.
        ``domains`` is the list of keys of "Domains" (status 2 and 3 only) and
        ``scripts`` the concatenated "scriptsContainingAllEvents" lists (status 3 only).
    """
    # Required listeners are missing from the page: presence is unlikely.
    if not eventListenersResult["pageContainsAllEvents"]:
        return 0, [], []

    perDomain = eventListenersResult["Domains"]

    # Listeners exist on the page but no individual domain is recorded.
    if len(perDomain) == 0:
        return 1, [], []

    domainNames = list(perDomain.keys())
    completeScripts = [
        scriptUrl
        for domainInfo in perDomain.values()
        for scriptUrl in domainInfo["scriptsContainingAllEvents"]
    ]

    # Status 2: no single script holds every listener (possible, but less likely).
    if len(completeScripts) == 0:
        return 2, domainNames, []

    return 3, domainNames, completeScripts

In [5]:
def StatusAPI(apiResult):
    """Summarise the API findings of one page.

    Args:
        apiResult: mapping of domain -> mapping of url -> indexable entry. Only the
            item at index 1 of each entry is read; it is compared as a number
            (presumably a detection level -- confirm against the crawler output).

    Returns:
        ``(-1, [], [])`` when apiResult is empty (no fields filled). Otherwise
        ``(highest, domains, urls)`` where ``highest`` is the largest index-1 value
        seen (never below 0) and ``domains`` / ``urls`` list every domain and url
        whose value is at least 2. Both lists are built from sets, so their order
        is not defined.
    """
    if len(apiResult) == 0:
        # No fields filled
        return -1, [], []

    highest = 0
    flaggedDomains = set()
    flaggedUrls = set()

    for domain, perUrl in apiResult.items():
        for url, entry in perUrl.items():
            level = entry[1]
            if level > highest:
                highest = level
            # Entries below 2 still raise `highest` but are not collected.
            if level >= 2:
                flaggedDomains.add(domain)
                flaggedUrls.add(url)

    return highest, list(flaggedDomains), list(flaggedUrls)

In [6]:
def StatusRequests(requestResult, eventDomains, eventScripts):
    """Match request initiators against the domains/scripts that own the event listeners.

    Args:
        requestResult: sequence of initiator dicts. Each one is read for "count",
            "domains", "urls" and, when present, "inputsPresent".
        eventDomains: domains reported by the event-listener analysis.
        eventScripts: scripts reported by the event-listener analysis.

    Returns:
        ``(0, [], [])`` when requestResult is empty;
        ``(1, [], inputs)`` when no initiator overlaps with the event listeners;
        ``(2, matched, inputs)`` otherwise, where ``matched`` holds every matching
        initiator dict exactly once, in input order.
        ``inputs`` is the de-duplicated union of all "inputsPresent" values (also
        from initiators that are below the request threshold).
    """
    if len(requestResult) == 0:
        return 0, [], []

    scriptMatched = []
    inputsPresent = set()

    for initiatorResult in requestResult:
        if "inputsPresent" in initiatorResult:
            inputsPresent.update(initiatorResult["inputsPresent"])

        # Ignore initiators that fired fewer requests than the threshold.
        if initiatorResult["count"] < REQUEST_TRESHHOLD:
            continue

        # An initiator matches when it shares at least one domain AND at least one
        # script with the event listeners. It is recorded once, however many
        # domain/script pairs overlap (the previous nested loops appended it once
        # per overlapping pair, producing duplicates).
        if any(domain in initiatorResult["domains"] for domain in eventDomains) and any(
            script in initiatorResult["urls"] for script in eventScripts
        ):
            scriptMatched.append(initiatorResult)

    if len(scriptMatched) == 0:
        return 1, [], list(inputsPresent)

    return 2, scriptMatched, list(inputsPresent)

In [7]:
def CombineResults(eventStatus, eventDomains, eventScripts, apiStatus, apiDomains, apiScripts, requestStatus, requestScripts):
    """Merge the event-listener, API and request findings into one result code.

    Args:
        eventStatus, eventDomains: status and domains from StatusEventListeners.
        apiStatus, apiDomains: status and domains from StatusAPI; an apiStatus of -1
            means that no inputs were filled.
        requestStatus, requestScripts: status and matched initiators from
            StatusRequests; each initiator is read for its "urls" list.
        eventScripts, apiScripts: accepted but not used.

    Returns:
        ``(result, domains, scripts, single_domains, single_scripts)``.
        ``result`` is 1 when requests were found but none matched; otherwise it is
        3 (inputs filled) or 2 (no inputs), raised to 5 / 4 as soon as one sending
        script's domain is also an event-listener domain (and, when inputs were
        filled, an API domain too). The four lists are built from sets, so their
        order is not defined.
    """
    # Requests found, but no match
    if requestStatus == 1 and eventStatus > 1:
        return 1, [], [], [], []

    inputsFilled = apiStatus != -1
    result = 3 if inputsFilled else 2
    singleDomainResult = 5 if inputsFilled else 4

    allScripts, allDomains = set(), set()
    singleScripts, singleDomains = set(), set()

    for initiatorResult in requestScripts:
        chain = initiatorResult["urls"]

        # The probable sender is the last entry of the initiator chain; step back one
        # entry when that last entry is the puppeteer evaluation script.
        sender = chain[-1]
        if sender == "__puppeteer_evaluation_script__" and len(chain) > 1:
            sender = chain[-2]

        allScripts.add(sender)
        parsed = get_tld(sender, as_object=True, fail_silently=True)
        if parsed is None:
            continue

        allDomains.add(parsed.domain)

        # Check if the final sending domain is the one with the events (and, when
        # inputs were filled, also one of the API domains).
        if parsed.domain not in eventDomains:
            continue
        if inputsFilled and parsed.domain not in apiDomains:
            continue

        result = singleDomainResult
        singleScripts.add(sender)
        singleDomains.add(parsed.domain)

    return result, list(allDomains), list(allScripts), list(singleDomains), list(singleScripts)

In [8]:
def CheckSRSLikelyhood(siteData):
    """Run the three status checks on one site and combine them.

    Args:
        siteData: mapping with the keys "EventListenerResults", "RequestResults"
            and "APIResults" (the latter two are only read when needed).

    Returns:
        ``(result, domains, scripts, eventStatus, inputsPresent, single_domains,
        single_scripts)``. ``result`` is 0 when the page does not reach event
        status 3 or when no requests were recorded; otherwise it is whatever
        CombineResults returns. ``eventStatus`` is always the status computed by
        StatusEventListeners (previously the first early return reported a
        hard-coded 0, hiding statuses 1 and 2).
    """
    eventStatus, eventDomains, eventScripts = StatusEventListeners(siteData["EventListenerResults"])
    if eventStatus != 3:
        # Not all listeners live in a single script: stop here, but still report
        # the actual event-listener status to the caller.
        return 0, [], [], eventStatus, [], [], []

    requestStatus, requestScripts, inputsPresent = StatusRequests(siteData["RequestResults"], eventDomains, eventScripts)
    if requestStatus == 0:
        # No requests recorded at all.
        return 0, [], [], eventStatus, [], [], []

    apiStatus, apiDomains, apiScripts = StatusAPI(siteData["APIResults"])

    result, domains, scripts, single_domains, single_scripts = CombineResults(
        eventStatus, eventDomains, eventScripts,
        apiStatus, apiDomains, apiScripts,
        requestStatus, requestScripts,
    )
    return result, domains, scripts, eventStatus, inputsPresent, single_domains, single_scripts

In [9]:
def HasInput(apiData):
    """Return 1 when apiData contains at least one entry, otherwise 0."""
    return 1 if len(apiData) != 0 else 0

In [10]:
# Locations of the crawl results and of the files this notebook writes.
#
# Alternative data sets (swap the directory names below to use them):
#   local run  : input "SortedOutput2\\", outputs under "Output/" (no results root)
#   POST crawl : input "SortedOutput_POST\\", outputs under "Output_POST/"
#
# The input directory ends with a backslash on purpose: the site name is later cut
# out of each globbed path at the first backslash.
resultsRoot = "../../Thesis-srs-crawl-results/"
outputDir = resultsRoot + "Output/"

filePath = resultsRoot + "SortedOutput\\"
outputPath = outputDir + "results.csv"
outputPathCounts = outputDir + "resultsCounts.json"
outputPathDomains = outputDir + "resultsDomains.csv"
failedPath = outputDir + "failedSites.json"

# Prefix of the per-result-code domain files; "<code>.csv" is appended when writing.
outputPathDomainsSeperate = outputDir + "resultsSeperate"

# One JSON file per crawled site, directly inside the input directory.
files = glob.glob(filePath + '*.json', recursive=False)

In [11]:
output = {}  # not populated by the loop below; kept so the name stays defined
failedSites = []  # paths of the site files whose processing raised an exception

headers = ["siteName", "result", "singleDomains", "singleScripts", "domains", "scripts", "eventStatus", "inputsPresent"]
httpsPrefix = "https://"

# Aggregate counters, filled while iterating over the site files.
count_total = 0
count_with = 0
count_without = 0
count_2 = 0
count_3 = 0
count_4 = 0
count_5 = 0
count_events = 0
count_inputs = 0
domain_counts = {}

# Per-result-code domain tallies. The x.5 keys hold the "single domain" tallies
# that belong to result codes 4 and 5.
domain_separate = {2: {}, 3: {}, 4: {}, 5: {}, 4.5: {}, 5.5: {}}


def _dropFirstParty(scriptUrls, firstPartyDomain):
    """Remove first-party scripts and collect the domains of the remaining ones.

    A script is first party when the `.domain` of its parsed URL equals
    firstPartyDomain. Every other script is kept (in its original order) and, when
    get_fld can parse it, contributes its first-level domain.

    Returns:
        ``(remaining_scripts, domains)`` -- ``domains`` comes from a set, so its
        order is not defined.
    """
    remaining = []
    foundDomains = set()
    for scriptUrl in scriptUrls:
        parsed = get_tld(scriptUrl, as_object=True, fail_silently=True)
        firstLevelDomain = get_fld(scriptUrl, fail_silently=True)
        if parsed is not None and parsed.domain == firstPartyDomain:
            continue
        remaining.append(scriptUrl)
        if firstLevelDomain is not None:
            foundDomains.add(firstLevelDomain)
    return remaining, list(foundDomains)


# newline='' lets the csv module control line endings itself; without it every row
# is followed by a blank line on Windows.
with open(outputPath, 'w', newline='') as resultsFile:
    writer = csv.DictWriter(resultsFile, fieldnames=headers)
    writer.writeheader()
    for file in files:
        try:
            if file == filePath + "failedSites.json":
                continue

            # Site name = path without extension, cut after the first backslash
            # (the input directory in filePath ends with a backslash).
            siteName = file[0:file.rfind(".")]
            siteName = siteName[siteName.find("\\")+1:]

            siteDomain = None
            res = get_tld(httpsPrefix + siteName, as_object=True, fail_silently=True)
            if res is not None:
                siteDomain = res.domain

            # Load the data from the .json file; the handle gets its own name so it
            # does not shadow the CSV file, and it is closed even if parsing fails.
            with open(file, encoding="utf-8") as siteFile:
                allData = json.load(siteFile)

            result, domains, scripts, eventStatus, inputsPresent, single_domains, single_scripts = CheckSRSLikelyhood(allData)

            # Remove first party scripts; the domain lists are rebuilt from the
            # scripts that remain (the lists returned above are discarded).
            scripts, domains = _dropFirstParty(scripts, siteDomain)
            single_scripts, single_domains = _dropFirstParty(single_scripts, siteDomain)

            # If only single domain is first party, switch result to combined
            if (result == 4 or result == 5) and single_domains == []:
                result = result - 2

            row = {"siteName": siteName, "result": result, "singleDomains": str(single_domains), "singleScripts": str(single_scripts), "domains": str(domains), "scripts": str(scripts), "eventStatus": eventStatus, "inputsPresent": inputsPresent}
            writer.writerow(row)

            if result == 2:
                count_2 += 1
            elif result == 3:
                count_3 += 1
            elif result == 4:
                count_4 += 1
            elif result == 5:
                count_5 += 1

            if eventStatus == 3:
                count_events += 1

            # Count the site once if any detected value is a known form input or
            # contains the e-mail suffix.
            if any(value in FIELD_INPUTS_NO_NUMBERS_INVERSE or EMAIL_SUFFIX in value for value in inputsPresent):
                count_inputs += 1

            count_total += 1
            count_with += HasInput(allData["APIResults"])

            for domain in domains:
                domain_counts[domain] = domain_counts.get(domain, 0) + 1
                resultTally = domain_separate[result]
                resultTally[domain] = resultTally.get(domain, 0) + 1

            if result >= 4:
                singleTally = domain_separate[result + 0.5]
                for domain in single_domains:
                    singleTally[domain] = singleTally.get(domain, 0) + 1
        except Exception:
            # Best effort: remember the file and carry on with the next site.
            # `Exception` (instead of a bare except) lets Ctrl-C / kernel interrupts
            # stop the run.
            failedSites.append(file)
# with open(outputPath, 'w') as fp:
#         json.dump(output, fp, default=tuple, sort_keys=True, indent=4)
#         fp.close()

count_without = count_total - count_with

# Summary counters, written as pretty-printed JSON.
outputCounts = {"Combined presence, no input": count_2, "Combined presence, input": count_3, "Single domain, no input": count_4, "Single domain, input": count_5, "Contains all listeners": count_events, "Total": count_total, "Inputs Detected in plaintext": count_inputs, "Sites with Input": count_with, "Sites without Input": count_without}
with open(outputPathCounts, 'w') as fp:
    json.dump(outputCounts, fp, default=tuple, sort_keys=True, indent=4)

# Overall domain tally, one "domain,count" row per domain.
# newline='' keeps the csv module from emitting blank lines between rows on Windows.
with open(outputPathDomains, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(domain_counts.items())

# The failed-sites file is only written when at least one site failed.
if len(failedSites) != 0:
    with open(failedPath, 'w') as fp:
        json.dump(failedSites, fp, default=tuple, sort_keys=True, indent=4)

# One CSV per key of domain_separate, named "<prefix><key>.csv".
for number, tally in domain_separate.items():
    with open(outputPathDomainsSeperate + str(number) + ".csv", 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(tally.items())