In [None]:
import plotly.graph_objects as go
import json
import os
import argparse
import re
import logging
import  util.util as util
import requests
import pandas as pd
import importlib

from util.domainUtil import getSanitzedApps, generateTotalCountMapping, getDomainOrIP
from util.domainCategorizationUtil import isAdCategory, isCdnCategory, isSocialNetworkCategory, isAdCategoryNew, isCdnCategoryNew, isSocialNetworkCategoryNew
#importlib.reload(util.util)

In [None]:
def getAllDataWithoutRequests(app):
    """Merge the unique domains of every non-request data source of one app.

    For each of the app's ``amqp``, ``coap``, ``endpoints``, ``mqtt``,
    ``xmpp``, ``udp`` and ``webview`` attributes the value is handed to
    ``util.getUniqueDomainsFromJson`` and the returned domains are added to
    a single set, which is returned.
    """
    sourceNames = ("amqp", "coap", "endpoints", "mqtt", "xmpp", "udp", "webview")

    domains = set()
    for sourceName in sourceNames:
        domains.update(util.getUniqueDomainsFromJson(getattr(app, sourceName)))
    return domains


def getAllDataFromDataset(dataset):
    """Return a list with one domain set per app, in the order of ``dataset``.

    Each entry is produced by ``getAllDataWithoutRequests``.
    """
    return [getAllDataWithoutRequests(entry) for entry in dataset]

In [None]:
# TODO: set the paths to load your data ("/" is only a placeholder).
# The comparison probably only makes sense with the newest dataset; all others are less comparable.
# NOTE(review): the second argument of util.loadAllData presumably selects the general-app
# dataset instead of the IoT one - confirm against util.loadAllData.
input_dir_general = util.loadAllData("/", True)
input_dir_iot = util.loadAllData("/")

# One set of unique domains per loaded app.
generalDataset = getAllDataFromDataset(input_dir_general)
iotDataset = getAllDataFromDataset(input_dir_iot)


In [None]:
%store generalDataset
%store iotDataset

In [None]:
%store -r generalDataset
%store -r iotDataset

In [None]:
from util.domainUtil import getSanitzedApps, generateTotalCountMapping, getDomainOrIP
from util.domainCategorizationUtil import isAdCategory, isCdnCategory, isSocialNetworkCategory


def getDomainsFromSubDomainSet(subdomainSet):
    """Pass every entry through ``getDomainOrIP`` and return the distinct results as a set."""
    return {getDomainOrIP(entry) for entry in subdomainSet}


def getDomainsFromBoth(iotSet, generalSet):
    """Return ``getDomainOrIP`` of every entry of ``iotSet`` that is also in ``generalSet``."""
    return {getDomainOrIP(entry) for entry in iotSet if entry in generalSet}

def getAllDomainsFromApps(listApps):
    """Flatten a list of per-app collections into one set of all their entries."""
    return {entry for appEntries in listApps for entry in appEntries}


def getNameForSubDomain(subDomain):
    """Return the category label of ``subDomain``, or ``subDomain`` itself if none applies.

    The checks run in a fixed order (advertisement, CDN, social network) and
    the first one that accepts the name decides the label.
    """
    categoryChecks = (
        (isAdCategory, "Advertisement and Trackers"),
        (isCdnCategory, "Content Distribution Networks"),
        (isSocialNetworkCategory, "Social Networks"),
    )
    for belongsToCategory, label in categoryChecks:
        if belongsToCategory(subDomain):
            return label
    return subDomain





In [None]:
# TODO: maybe change the matching to a per-subdomain basis

In [None]:
# Restore the per-app domain sets persisted by an earlier cell.
%store -r generalDataset
%store -r iotDataset

# Flatten all apps of a dataset into one set and reduce every entry with getDomainOrIP.
iotSet = getDomainsFromSubDomainSet(getAllDomainsFromApps(iotDataset))
generalSet = getDomainsFromSubDomainSet(getAllDomainsFromApps(generalDataset))

# Entries that occur in both datasets.
both = getDomainsFromBoth(iotSet, generalSet)

# domain -> category label (or the name itself when no category matches).
# This first version uses getNameForSubDomain; a later cell rebuilds the map with getClassification.
domainCategory = {}

for d in iotSet.union(generalSet):
    domain = getDomainOrIP(d)
    domainCategory[domain] = getNameForSubDomain(d)

%store domainCategory

In [None]:
domainCategory

In [None]:
# Sanitized per-app data for both datasets.
# NOTE(review): the flags (False, True) presumably select domain-level keys; the
# subdomain-level variants further down are built with (True, False) - confirm
# against util.domainUtil.getSanitzedApps.
iotSetSanatized = getSanitzedApps(iotDataset, False, True)
generalSetSanitized = getSanitzedApps(generalDataset, False, True)

%store iotSetSanatized
%store generalSetSanitized
# Earlier variant that accumulated both datasets into one mapping (kept for reference):
#counts = generateTotalCountMapping(iotSetSanatized, {}, domainCategory)
#counts = generateTotalCountMapping(generalSetSanitized, counts, domainCategory)

In [None]:
def getExodusCategories(url="https://reports.exodus-privacy.eu.org/api/trackers", timeout=30):
    """Download the Exodus Privacy tracker list and group its network signatures by category.

    Each tracker's ``network_signature`` is a ``|``-separated list of regex
    fragments. Every fragment is reduced to a plain suffix by removing ``.*``
    and backslashes and dropping one leading dot.

    Args:
        url: Endpoint that returns JSON with a top-level ``trackers`` mapping.
        timeout: Seconds to wait for the server; without it a stalled
            connection would block the notebook indefinitely.

    Returns:
        dict mapping a category name to the set of suffixes of all trackers in
        that category. Trackers without any category are collected under
        ``'otherExodus'``; trackers without a usable signature are skipped.

    Raises:
        requests.RequestException: on connection errors, timeouts, or an HTTP
            error status (instead of trying to parse an error page).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    trackers = response.json()['trackers']

    result = {}
    for item in trackers.values():
        suffixes = set()
        for fragment in item['network_signature'].split('|'):
            suffix = fragment.replace('.*', '').replace('\\', '')
            if suffix.startswith('.'):
                suffix = suffix[1:]
            # A fragment such as ".*" cleans to an empty string, which is not a usable pattern.
            if suffix:
                suffixes.add(suffix)
        if not suffixes:
            continue

        # Uncategorised trackers are kept under a synthetic category.
        for category in item['categories'] or ['otherExodus']:
            result.setdefault(category, set()).update(suffixes)

    return result

In [None]:
exodusCategories = getExodusCategories()

In [None]:
iotRelated = {"mob.com", "earthcam.com", "ikonkekit.com", "eye4.cn", "lgsmartplatform.com", "besmart-home.com", "everhome.cloud", "keenhome.io", "myskybell.com", "smarter.am", "hager-iot.com", "plc-smarthome.de", "leaksmart.com", "airpatrol.eu", "tp-link.com", "vesync.com", "airpatrol.eu", "samsungheartwise.com", "myfoscam.cn", "runkeeper.com", "iruniversalremote.com", "apsrvd.io", "remote-app-tv.com", "reolink.com", "neur.io", "futlight.com", "haikuhome.com", "ute-tech.com.cn", "xwemo.com", "myhager.com", "sec-smartswitch.com", "kalay.tw", "lgeaircon.com", "harmankardon.com", "getkuna.com", "withings.com", "prolink2u.com", "anymote.io", "asus-aicam.com", "tplinknbu.com", "lgmobiletv.com", "gemmy.com", "videoloft.com", "epson.net", "totwoo.com", "ambiclimate.com", "ecamzone.cc", "hostedcloudvideo.com", "chuango.cn", "swann.com", "hubbleconnected.com", "lgecloud.com", "yalelock.com", "apexisalarm.com", "alarm.com", "my-gogogate.com", "doorguard.com.au", "devismart.com", "tih.tw", "al8.co", "nucleuslife.io", "vphband.com", "vstarcam.cn", "subli-med.com", "zmodo.com", "wisenetlife.com", "libratone.com", "ihconfig.com", "rabootec.com", "miui.com", "nightowlconnect.com", "cloudwarm.net", "broadlink.com.cn", "keeprapid.com", "reolink.us", "lifesense.com", "simpledesign.ltd", "hager.fr", "simplisafe.co.uk", "iotdreamcatcher.net", "amazonalexa.com", "omguard.com", "xfinity.com", "beonhome.com", "yunjktech.com", "verizonwireless.com", "keeprapid.com", "commax.com", "audiopro.com", "abhijitvalluri.com", "net2point.com", "jimicloud.com", "zositech.com", "alcidae.com", "asuscomm.cn", "meari.com.cn", "kptncook.com", "augustint.com", "connectedfamilyhome.com", "yunyis.com", "getawair.com", "libratone.com.cn", "kalay.net.cn", "sengledcanada.com", "wifly-city.com", "sensicomfort.com", "blazeautomation.com", "neatorobotics.com", "ichano.com", "perimetersafe.com", "nightowlsp.com", "candy-hoover.com", "e-seenet.com", "ipcent.com", "wifiplug.co.uk", "mansaa.io", "asante.com", 
"mearicloud.com", "cloudant.com", "tendinsights.com", "incardoc.com", "shadeconnector.com", "tiktime.net", "bonlink.com.cn", "vstarcam.com","bose.com", "tuyaeu.com", "divoom-gz.com", "huahuacaocao.com", "103.235.46.40", "nutspace.com", "wizconnected.com", "jusonsmart.com", "umsns.com", "wiz.world", "wizconnected.com", "bose.io", "lifx.api.kustomerapp.com", "gulaike.com", "mi-img.com",'puwell.com', 'getqardio.com','gumplay.jp', 'chuango.com','manything.com','two-commas.com','203.195.160.110', 'tiqiaa.com', 'xiaomi.com',  'huawei.com', 'asus.com','logitech.com', 'samsungapps.com','lifx.com', 'philips.com','sony.com','mi.com','breezometer.com','palmerperformance.com', 'oppodigital.com','xiaoyi.com','ihaus.de','linquet.com','netvue.com','simplisafe.com','ikonke.com', 'mipcm.com', 'sony.co.jp', 'scinan.com', 'airtouch.com.au', 'pindora.fi', 'tado.com', 'tplinkmifi.net', 'hipcam.org','openhab.org','ttlock.com.cn',  'grundfos.com','mytenvis.com','huawei.health', 'goyourlife.com.cn',   'air-stream.com.au', 'pyronixcloud.com', 'dongha.cn',  'intesishome.com','gm.com',  'connected.baby', 'tutk.com', 'winkapp.com','gatelabs.co',  'yolanda.hk','wuudis.com','samsungsmartappliance.com','aylanetworks.com', 'heclouds.com','miwifi.com', 'almando.com','simplehuman.com',  'fitbit.com', 'home-connect.com','asuscomm.com', 'magichue.net','dropcam.com', 'domoticz.com','discovergy.com', 'clearblade.com','dronelogbook.com','ulikespk.com','longitude-watch.com','thieye.com','hom.ee','yalereallivingconnect.com', 'ihomeaudio.com',  'eero.com', 'mobihealthplus.com', 'sosocam.com','hpsmart.com','lokly.com', 'zmote.io','asuscloud.com', 'hpsmartstage.com',  'lge.com','awair.is', 'revolar.net', 'sensornetworks.co.za','allterco.com','thekeywe.com', 'm2mbackup.com', 'xm030.cn',  'traffictechservices.com','roku.com',  'oruibo.com', 'thingsview.net','hetangsmart.com', 'nightowldvr04.com','creative.com','polar.com', 'strava.com', 'nvdvr.cn','readyforsky.com', 'alarmdealer.com',  
'ustream.tv','hp.com','philips-healthsuite.com.cn','myharmony.com','ifttt.com','amazfit.com','ogemray-server.com','umeye.com', 'xingcon.com','parrot.com', 'i-sens.co.kr','cloudrail.com',  'prestigio.com', 'amazon.com', 'sensiapi.io','bluedriver.com','birdytone.com', 'dev-myqnapcloud.com', 'doorbird.com',  'ifavine.com', 'usmeew.com', 'iotcplatform.com',  'doorbird.net', 'mindolife.com','actitens.com','triggi.com', 'hpsmartpie.com',  'eyez-on.com', 'linakcloud.com','mobvoi.com','dvr163.com', 'airdata.com', 'nightowlx.com', 'digixvideo.com',  'harman.com','tookok.cn', 'ezvizru.com', 'mydlink.com', 'kef.com','qnap.com.cn','huami.com','ichano.cn','beewi-home.com', 'cosa.com.tr', 'smarthome.com','linksys.com',  'autonat.com','alula.net', 'zipato.com', 'petcube.com','whistle.com', 'filtrete.com', 'dinsafer.com','ebikemotion.com', 'iquarius.com', 'nest.com', 'ictun.com','elinkthings.com','mddcloud.com','netesee.com','ikea.com', 'remoteble.com','skycentrics.com','chschs.com', 'mymili.com',  'y-cam.com','orvibo.com','sciener.com','egardia.com','amazon.co.uk', 'gazeeka.com.au',  'thingsview.co', 'bryant.com', 'goabode.com', 'vimtag.com', 'agaveha.com','myqnapcloud.com','trustsmartcloud.com','air.properties','zhiduodev.com','getdoorbird.com','aicare.net.cn','dooya.com', 'sengled.com','heyitech.com','allegrosmart.com', 'amazon.cn','action.new', 'hover1-app.com','routerlogin.net',  'routethis.co', 'childrenview.net', 'boschtt-documents.com','insteon.com', 'amazon.fr', 'gardena.com', 'vineconnected.com', 'asus.com.cn','flic.io',  'kiwi.ki','ibroadlink.com', 'sony.net', 'sfty.com', 'reco4life.com','enaikoon.de','yitechnology.com','midea.com','homescenario.com',  'sentrolcloud.com','hicloud.com',  'hicling.com','ikea.net','linksyssmartwifi.com','meethue.com', 'mypump.info', 'sonos.com', 'amazon.in','lifx.co','netgear.com', 'ipcam.so','mimoto.it','resideo.com','honeywell.com','tocaboca.com',  'yamaha.com',  'goolink.org', 'earin.com','doorbell.io', 'castify.tv', 'qnap.com', 
'smart-me.com',  'm2mservices.com','nuki.io',   'b1automation.com', 'kankun-smartplug.com','ihomeaudiointl.com', 'mynuheat.com', 'wallflower.io', 'revogi.com',  'dlink.com',  'aztech.com','alarm.com', 'chipolo.net','eco-plugs.net',  'ora.vn','garagedoorbuddy.com','snsr.net',  'mobiteka.pl', 'iotworkshop.com', 'linkalock.com', 'myspotcam.com', 'wattwatchers.com.au', 'ecobee.com','commax.co.kr',  'sciener.cn',  'loco.hk', 'august.com', 'wattio.com', 'tivo.com', 'aplombtechbd.com',   'lgthinq.com', 'wink.com', 'ipc365.com', 'fivasim.com', 'eufylife.com',    'ticwear.com',   'alarmnet.com', 'guardingvision.com', 'rcti.es','xiaomi.net',  'amazon.co.jp','goyourlife.com','routethis.com',  'ablecloud.cn','skyward.io','ipcamlive.com', 'bluecats.com','jellyfishtur.cn','ezvizlife.com', 'remotexy.com', 'idevicesinc.com', 'linkplay.com',  'opple.com', 'koogeek.com','iwhop.com', 'logi.com', 'meross.com',  'appnimator.com', 'electricimp.com', 'getblueshift.com', 'mein-henry.de', 'ihomecontrol.de','tomtom.com','bn-link.com','artik.cloud', 'tenvis.com', 'samsung.com','miot-spec.org', 'philips-digital.com',  'ween.fr','chipolo.com', 'roc-connect.com', 'dy1000.com', 'securesmart.cn', 'smanos.com', 'mangocam.com', 'appmifile.com',  'sleepace.net', 'cranesportsconnect.com','trafficland.com',  'rialtocomfort.com',   'myfieldnet.com', 'energy-aware.com', 'wificam.org',  'ieiworld.com','myedimax.com', 'muzzley.com', 'u-blox.com','carrier.com','neolock.vn', 'securemote.com', 'sense-u.com','yeelight.com', 'epson.com', 'feetguider.com', 'netatmo.net', 'monitoreverywhere.com',  'netpie.io',  'inatronic.com',    'fitdigits.com', 'riversongapp.net', 'orbitbhyve.com',  'utc.com', 'videoexpertsgroup.com', 'cloudlinks.cn', 'sleepace.com', 'veepai.com',  'vicohome.io', 'mygeostar.com', 'netatmo.com'  }

In [None]:
def isIotRelatedSubdomain(subDomain):
    """Return True if ``subDomain`` equals, or ends label-wise with, an entry of ``iotRelated``.

    Names are compared per dot-separated label from the right, so
    ``api.example.com`` matches ``example.com`` but ``notexample.com`` does not.
    """
    labels = subDomain.split(".")
    for candidate in iotRelated:
        candidateLabels = candidate.split(".")
        # A name with fewer labels than the candidate can never end with it.
        if len(candidateLabels) <= len(labels) and labels[-len(candidateLabels):] == candidateLabels:
            return True
    return False

In [None]:
def getClassification(subDomain):
    """Return the category label for ``subDomain``, or ``subDomain`` itself if unclassified.

    Names ending with a dot are returned unchanged. Otherwise the Exodus
    categories in ``exodusCategories`` are tried first (label-wise suffix
    match against each pattern), followed by the CDN, advertisement and
    social-network checks, in that order.
    """
    if subDomain.endswith("."):
        return subDomain

    labels = subDomain.split(".")
    for category, patterns in exodusCategories.items():
        # Normalise the Exodus category names to the labels used in this notebook.
        if category in ('otherExodus', 'Advertisement'):
            label = "Advertisement and Trackers"
        elif category == "Crash reporting":
            label = 'Crash Reporting'
        else:
            label = category

        for pattern in patterns:
            patternLabels = pattern.split(".")
            if len(labels) >= len(patternLabels) and labels[-len(patternLabels):] == patternLabels:
                return label

    if isCdnCategoryNew(subDomain):
        return "Content Distribution Networks"
    if isAdCategoryNew(subDomain):
        return "Advertisement and Trackers"
    if isSocialNetworkCategoryNew(subDomain):
        return "Social Networks"
    # todo check: an "IoT Related" label via isIotRelatedSubdomain is currently disabled.

    return subDomain

In [None]:
domainCategory = {}

for d in iotSet.union(generalSet):
    domain = getDomainOrIP(d)
    domainCategory[domain] = getClassification(d)

%store domainCategory

In [None]:
%store -r iotSetSanatized
%store -r generalSetSanitized
%store -r domainCategory

# Per-dataset totals, each started from an empty accumulator and keyed via domainCategory.
countsIoT = generateTotalCountMapping(iotSetSanatized, {}, domainCategory)
countsGeneral = generateTotalCountMapping(generalSetSanitized, {}, domainCategory)

%store countsIoT
%store countsGeneral

In [None]:
%store -r countsIoT
%store -r countsGeneral

In [None]:
%store -r domainCategory

%store -r iotSetSanatized
%store -r generalSetSanitized
%store -r countsIoT
%store -r countsGeneral

In [None]:

# Create a map from node (category / domain) to Sankey node id.
#
# NOTE: this cell mutates countsIoT and countsGeneral in place (entries are deleted
# and folded into 'Other'), so it is not idempotent. Restore both with `%store -r`
# before running it again.

# The totals taken from the maps are a bit misleading because they do not represent the number of apps analyzed.
cutOff = 30
allDomainMap = {}
other = set()
# Iterate over a snapshot of the keys because entries are deleted inside the loop.
for x in set(countsIoT.keys()):
    if x == 'found.at':
        del countsIoT[x]
        continue
    #if countsIoT.get(x,0) < cutOff and countsGeneral.get(x,0) ==0:
    #    countsIoT['Other IoT'] = countsIoT.get('Other IoT', 0) + countsIoT.get(x,0)
    #    del countsIoT[x]
    # Keys that still contain a dot, or that stay below the cut-off in both
    # datasets, are folded into 'Other'.
    if '.' in x or (countsIoT.get(x,0) < cutOff and countsGeneral.get(x,0) < cutOff):
        other.add(x)
        countsIoT['Other'] = countsIoT.get('Other', 0) + countsIoT.get(x,0)
        del countsIoT[x]


# Same folding for the general dataset. countsIoT has already been reduced at this
# point, so keys folded above read as 0 here.
for x in set(countsGeneral.keys()):
    if x == 'found.at':
        del countsGeneral[x]
        continue
    #if x in other:
    #    countsGeneral['Other'] = countsGeneral.get('Other', 0) + countsGeneral.get(x,0)
    #    del countsGeneral[x]
    if '.' in x or (countsGeneral.get(x,0) < cutOff and countsIoT.get(x,0) < cutOff):
        countsGeneral['Other'] = countsGeneral.get('Other', 0) + countsGeneral.get(x,0)
        del countsGeneral[x]

# Combined total per remaining key; used only to order the nodes.
for x in countsIoT.keys():
    allDomainMap[x] = countsIoT.get(x,0) + countsGeneral.get(x,0)


for x in countsGeneral.keys():
    allDomainMap[x] = countsIoT.get(x,0) + countsGeneral.get(x,0)

#both = getDomainsFromBoth(countsIoT, countsGeneral)

# Assign ids in descending order of the combined total. Ids start at 2; the
# plotting cell uses 0 and 1 for the two dataset nodes.
sortedDomains = sorted(allDomainMap.items(), key=lambda kv: kv[1], reverse=True)
domainId = {}
idDomain = {}
i = 2
for domain in sortedDomains:
    domain = domain[0]
    # Strip a trailing dot from the name used as node label.
    if domain.endswith("."):
        domain = domain[0:len(domain)-1]

    domainId[domain] = i
    idDomain[i] = domain
    i = i + 1

print(domainId)

In [None]:
# Build the link and node lists for the IoT vs. general-app Sankey diagram.
# Requires domainId, idDomain and i from the node-id cell.
source = []
target = []
values = []
labels = []
# Node positions; the first two entries belong to the dataset nodes 0 and 1.
y= [0.5,0.5]
x= [0, 1]
# Links from the IoT dataset node (0) to each category node.
for key, value in countsIoT.items():
    #if value < 25:
    #    continue
    source.append(0)
    target.append(domainId[key])
    values.append(value)


# Links from the general dataset node (1) to each category node.
for key, value in countsGeneral.items():
    #if value < 25:
    #    continue
    target.append(domainId[key])
    source.append(1)
    values.append(value)



labels.append("IoT-2022")
labels.append("GP-2022")
usedSize = 0
for l in range(2,i):
    labels.append(idDomain[l])
    # Horizontal position: shared categories in the middle, IoT-only at 0.25, the rest at 0.75.
    if countsGeneral.get(idDomain[l], 0) > 0 and countsIoT.get(idDomain[l], 0) >0:
        x.append(0.5)
    elif countsIoT.get(idDomain[l], 0) > 0:
        x.append(0.25)
    else:
        x.append(0.75)
    # Vertical position grows with the node id and with the flow of all previous nodes.
    # NOTE(review): 0.05 and 30000 look like hand-tuned layout constants - confirm.
    y.append(0.05*(l) + usedSize/30000 )
    usedSize = usedSize + countsGeneral.get(idDomain[l], 0) + countsIoT.get(idDomain[l], 0)




In [None]:
import plotly.graph_objects as go

# Assemble the Sankey trace from the lists built in the previous cell.
link = dict(source = source, target = target, value = values)
node = dict(label = labels,
        x= x,
        y= y)
data = go.Sankey(link = link, node=node)
# plot
fig = go.Figure(data)
fig.update_layout(font_size=16)

fig.show()

In [None]:
import time
# The figure is written twice to the same file with a short pause in between.
# NOTE(review): presumably a workaround for an artifact in the first PDF export - confirm.
fig.write_image("sankey_comparison.pdf")
time.sleep(2)
fig.write_image("sankey_comparison.pdf")

In [None]:
countsGeneral

In [None]:
# Variant of the comparison Sankey: IoT apps -> category -> General apps.
# Reuses domainId, idDomain, i and the node positions x / y from the earlier cells.
source = []
target = []
values = []
labels = []

# IoT links start at the IoT node (0); links with a value below 20 are dropped.
for key, value in countsIoT.items():
    if value < 20:
        continue
    source.append(0)
    target.append(domainId[key])
    values.append(value)


# General-app links run in the opposite direction: from the category node into node 1.
for key, value in countsGeneral.items():
    if value < 20:
        continue
    target.append(1)
    source.append(domainId[key])
    values.append(value)



labels.append("IoT apps")
labels.append("General apps")
# Every node keeps its label, including nodes whose links were dropped above.
for l in range(2,i):
    labels.append(idDomain[l])



link = dict(source = source, target = target, value = values)
node = dict(label = labels, x= x,
        y= y)
data = go.Sankey(link = link, node=node)
# plot
fig = go.Figure(data)
fig.show()

In [None]:
sorted_x = sorted(countsIoT.items(), key=lambda kv: kv[1], reverse=True)

In [None]:
for x in sorted_x:
    print(x[0])


In [None]:
countsIoT['Other']

In [None]:
countsGeneral

In [None]:
other

In [None]:
exodusCategories

In [None]:
%store -r domainCategory



In [None]:
domainCategory

In [None]:
%store -r countsIoT
%store -r countsGeneral

In [None]:
countsGeneral

In [None]:
sorted(countsIoT.items(), key=lambda x: x[1], reverse=True)

In [None]:
sorted(countsGeneral.items(), key=lambda x: x[1], reverse=True)

In [None]:
len(countsIoT)

In [None]:
len(countsGeneral)

In [None]:
# All keys that occur in either count mapping.
allDomains = set(countsIoT.keys()) | set(countsGeneral.keys())

In [None]:
def getAllAdAndTracker(appList, categories):
    """Sum the per-app values of all keys categorised as 'Advertisement and Trackers'.

    Args:
        appList: iterable of dicts mapping a key to a numeric value.
        categories: dict mapping a key to its category label.

    Returns:
        dict mapping every advertisement/tracker key to its total over all apps.
    """
    totals = {}
    for app in appList:
        for name, amount in app.items():
            if categories.get(name, '') != 'Advertisement and Trackers':
                continue
            totals[name] = totals.get(name, 0) + amount
    return totals

In [None]:
# TODO: set the path to your result folder
# First argument: base path of the result folder. Second argument: the folder name.
# Third argument: a mapping file that tells, for apps contained in multiple sub-datasets, which app to include.
verified = util.get_verified_dataset("/", "/2023_04_06/", "/verified_dataset/result.json")
# Same processing as for the IoT and general datasets.
verifiedDataset = getAllDataFromDataset(verified)
verifiedSet = getDomainsFromSubDomainSet(getAllDomainsFromApps(verifiedDataset))
verifiedSetSanatizedSubdomains = getSanitzedApps(verifiedDataset, True, False)
verifiedSetSanatized = getSanitzedApps(verifiedDataset, False, True)

# Requires domainCategory from the classification cell.
verifiedIoT = generateTotalCountMapping(verifiedSetSanatized, {}, domainCategory)
adAndTrackerVerified = getAllAdAndTracker(verifiedSetSanatized, domainCategory)

%store verifiedDataset
%store verifiedSet
%store verifiedSetSanatizedSubdomains
%store verifiedSetSanatized
%store verifiedIoT
%store adAndTrackerVerified


In [None]:
%store -r iotSetSanatized
%store -r generalSetSanitized

%store -r countsIoT
%store -r countsGeneral

%store -r domainCategory

%store -r iotSetSanatized
%store -r generalSetSanitized
%store -r countsIoT
%store -r countsGeneral

In [None]:
%store -r iotSetSanatized
%store -r generalSetSanitized

In [None]:
%store -r verifiedDataset
%store -r verifiedSet
%store -r verifiedSetSanatizedSubdomains
%store -r verifiedSetSanatized
%store -r verifiedIoT
%store -r adAndTrackerVerified

In [None]:
# Totals of all advertisement/tracker keys, per dataset.
adAndTrackerIoT = getAllAdAndTracker(iotSetSanatized, domainCategory)
adAndTrackerGeneral = getAllAdAndTracker(generalSetSanitized, domainCategory)

In [None]:
def countTotalAd(adMap):
    """Return the sum of all values in ``adMap`` (0 for an empty mapping)."""
    return sum(adMap.values())

In [None]:
countTotalAd(adAndTrackerIoT)

In [None]:
countTotalAd(adAndTrackerGeneral)

In [None]:
len(adAndTrackerIoT.keys() | (adAndTrackerGeneral.keys()))

In [None]:
allDomains = adAndTrackerIoT.keys() | (adAndTrackerGeneral.keys())

In [None]:
print(len(allDomains))

In [None]:
def numberContained(allAd, iot, general):
    """Split ``allAd`` into entries found in both collections, only in ``iot``, or elsewhere.

    Returns:
        Tuple ``(count, onlyIoT, onlyGeneral, both)`` where ``count`` is the
        number of entries of ``allAd`` contained in both ``iot`` and
        ``general``. IoT-only entries containing ``"google.com"`` are left
        out of ``onlyIoT``; entries that are not in ``iot`` end up in
        ``onlyGeneral``.
    """
    sharedCount = 0
    shared, iotOnly, generalOnly = set(), set(), set()
    for name in allAd:
        inIot = name in iot
        if inIot and name in general:
            sharedCount += 1
            shared.add(name)
        elif not inIot:
            generalOnly.add(name)
        elif "google.com" not in name:
            iotOnly.add(name)
    return sharedCount, iotOnly, generalOnly, shared

In [None]:
countBoth, iotDomains, generalDomains, bothDomains = (numberContained(allDomains, adAndTrackerIoT.keys(),  adAndTrackerGeneral.keys()))

In [None]:
countBoth

In [None]:
len(iotDomains)

In [None]:
len(generalDomains)

In [None]:
len(bothDomains)

In [None]:
def countValues(mapToCount, subset):
    """Return the sum of the values of ``mapToCount`` whose keys are contained in ``subset``."""
    return sum(amount for name, amount in mapToCount.items() if name in subset)

In [None]:
countValues(adAndTrackerIoT, iotDomains)

In [None]:
countValues(adAndTrackerIoT, bothDomains)

In [None]:
countValues(adAndTrackerGeneral, bothDomains)

In [None]:
countValues(adAndTrackerGeneral, generalDomains)

In [None]:
%store -r iotSetSanatized
%store -r generalSetSanitized
%store -r domainCategory


%store -r countsIoT
%store -r countsGeneral

In [None]:
iotSetSanatized

In [None]:
countsIoT

In [None]:
countsGeneral

In [None]:
import plotly.graph_objects as go

# Small hand-made Sankey used to try out the diagram layout; not based on the datasets.
fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 15,
      thickness = 20,
      line = dict(color = "black", width = 0.5),
      label = ["dynamic", "static", "Ad", "Ad both", "other", "other both"],
      color = "blue"
    ),
    link = dict(
      source = [0, 1, 2, 0, 1, 4], # indices into the node label list above
      target = [2, 2, 3,4,4,5 ],
      value = [2, 4, 3, 4,3,2]
  ))])

fig.update_layout(title_text="Basic Sankey Diagram", font_size=10)
fig.show()

In [None]:
import plotly.graph_objects as go

# Second hand-made Sankey: two source nodes with parallel links into one target node.
fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 15,
      thickness = 20,
      line = dict(color = "black", width = 0.5),
      label = ["dynamic", "static", "Ad"],
      color = "blue"
    ),
    link = dict(
      source = [0, 0, 1, 1], # indices into the node label list above
      target = [2, 2, 2, 2],
      value = [2, 4, 3, 1]
  ))])

fig.update_layout(title_text="Basic Sankey Diagram", font_size=10)
fig.show()

In [None]:
######## new

In [None]:
iotSetSanatizedSubdomains = getSanitzedApps(iotDataset, True, False)
generalSetSanitizedSubdomains = getSanitzedApps(generalDataset, True, False)

In [None]:
%store iotSetSanatizedSubdomains
%store generalSetSanitizedSubdomains


In [None]:
%store -r iotSetSanatizedSubdomains
%store -r generalSetSanitizedSubdomains


In [None]:
def getDomainSet(dataset):
    """Return the set of all entries that occur in any app of ``dataset``."""
    return set().union(*dataset)

In [None]:
allSubDomains = getDomainSet(generalSetSanitizedSubdomains).union(getDomainSet(iotSetSanatizedSubdomains)).union(getDomainSet(verifiedSetSanatizedSubdomains))

In [None]:
# FQDN -> category label (or the FQDN itself when getClassification finds no category).
classifiedMap = {name: getClassification(name) for name in allSubDomains}

In [None]:
def generateCount(dataset):
    """Count, for every entry, in how many iterations over the apps of ``dataset`` it appears."""
    occurrences = {}
    for app in dataset:
        for name in app:
            if name in occurrences:
                occurrences[name] += 1
            else:
                occurrences[name] = 1
    return occurrences

In [None]:
# Give every category label (a classification without a dot) a Sankey node id.
# Ids start at 4 because the plotting cells use 0-3 for their fixed nodes.
categoryId = {}
idCategory = {}
current = 4
for label in classifiedMap.values():
    if "." in label or label in categoryId:
        continue
    categoryId[label] = current
    idCategory[current] = label
    current += 1

In [None]:
verified_subdomainSet = getDomainSet(verifiedSetSanatizedSubdomains)


In [None]:
gp_subdomainSet = getDomainSet(generalSetSanitizedSubdomains)
iot_subdomainSet = getDomainSet(iotSetSanatizedSubdomains)

In [None]:
countIoT = generateCount(iotSetSanatizedSubdomains)
countGP = generateCount(generalSetSanitizedSubdomains)

In [None]:
# Sankey of distinct FQDNs: origin (GP only / shared / IoT only) -> category.
# Every FQDN contributes 1 to exactly one link.
source = []
target = []
values = []
labels = []

# (source node id, target node id) -> number of distinct FQDNs on that link.
# Integer tuple keys keep the node indices numeric, as plotly expects.
valueDict = {}

for d in allSubDomains:
    # A classification that still contains a dot is the unclassified name itself -> "Other" (node 3).
    if "." in classifiedMap[d]:
        targetId = 3
    else:
        targetId = categoryId[classifiedMap[d]]

    if d in gp_subdomainSet and d in iot_subdomainSet:
        sourceId = 1
    elif d in gp_subdomainSet:
        sourceId = 0
    elif d in iot_subdomainSet:
        sourceId = 2
    else:
        # allSubDomains also contains FQDNs that occur only in the verified
        # dataset; they belong to none of the three origin nodes.
        continue

    key = (sourceId, targetId)
    valueDict[key] = valueDict.get(key, 0) + 1

for (sourceId, targetId), count in valueDict.items():
    source.append(sourceId)
    target.append(targetId)
    values.append(count)


# Fixed nodes 0-3; the category nodes follow in id order.
labels.append("GP-2022")
labels.append("GP-2022 & IoT-2022")
labels.append("IoT-2022")
labels.append("Other")

for l in range(4,current):
    labels.append(idCategory[l])



link = dict(source = source, target = target, value = values)
node = dict(label = labels)
data = go.Sankey(link = link, node=node)
# plot
fig = go.Figure(data)
fig.show()

In [None]:
# Sankey of FQDN occurrences: origin (GP only / shared / IoT only) -> category.
# Every FQDN is weighted by its counts in countGP / countIoT.
source = []
target = []
values = []
labels = []

# (source node id, target node id) -> summed occurrence count on that link.
# Integer tuple keys keep the node indices numeric, as plotly expects.
valueDict = {}

for d in allSubDomains:
    # A classification that still contains a dot is the unclassified name itself -> "Other" (node 3).
    if "." in classifiedMap[d]:
        targetId = 3
    else:
        targetId = categoryId[classifiedMap[d]]

    if d in gp_subdomainSet and d in iot_subdomainSet:
        key = (1, targetId)
        weight = countIoT[d] + countGP[d]
    elif d in gp_subdomainSet:
        key = (0, targetId)
        weight = countGP[d]
    elif d in iot_subdomainSet:
        key = (2, targetId)
        weight = countIoT[d]
    else:
        # allSubDomains also contains FQDNs that occur only in the verified
        # dataset; they have no entry in countIoT or countGP.
        continue

    valueDict[key] = valueDict.get(key, 0) + weight

for (sourceId, targetId), total in valueDict.items():
    source.append(sourceId)
    target.append(targetId)
    values.append(total)


# Fixed nodes 0-3; the category nodes follow in id order.
labels.append("GP-2022")
labels.append("GP-2022 & IoT-2022")
labels.append("IoT-2022")
labels.append("Other")

for l in range(4,current):
    labels.append(idCategory[l])



link = dict(source = source, target = target, value = values)
node = dict(label = labels)
data = go.Sankey(link = link, node=node)
# plot
fig = go.Figure(data)
fig.show()

In [None]:
import time
# The figure is written twice to the same file with a short pause in between.
# NOTE(review): presumably a workaround for an artifact in the first PDF export - confirm.
fig.write_image("sankey_iot_gp_new.pdf")
time.sleep(2)
fig.write_image("sankey_iot_gp_new.pdf")


In [None]:
## not overlapping kind of right

In [None]:
# Sankey of FQDN occurrences without a separate "shared" origin: an FQDN seen in
# both datasets contributes to the GP link and to the IoT link.
source = []
target = []
values = []
labels = []

# (source node id, target node id) -> summed occurrence count on that link.
# Integer tuple keys keep the node indices numeric, as plotly expects.
valueDict = {}

for d in allSubDomains:
    # A classification that still contains a dot is the unclassified name itself -> "Other" (node 3).
    if "." in classifiedMap[d]:
        targetId = 3
    else:
        targetId = categoryId[classifiedMap[d]]

    if d in gp_subdomainSet:
        key = (0, targetId)
        valueDict[key] = valueDict.get(key, 0) + countGP[d]

    if d in iot_subdomainSet:
        key = (2, targetId)
        valueDict[key] = valueDict.get(key, 0) + countIoT[d]

for (sourceId, targetId), total in valueDict.items():
    source.append(sourceId)
    target.append(targetId)
    values.append(total)


# Node 1 receives no links in this variant; its label stays so that the
# indices of the remaining nodes match the other Sankey cells.
labels.append("GP-2022")
labels.append("GP-2022 & IoT-2022")
labels.append("IoT-2022")
labels.append("Other")

for l in range(4,current):
    labels.append(idCategory[l])



link = dict(source = source, target = target, value = values)
node = dict(label = labels)
data = go.Sankey(link = link, node=node)
# plot
fig = go.Figure(data)
fig.show()

In [None]:
for k,v in classifiedMap.items():
    if '.' in v:
        print(v)

In [None]:

def getAppClassificationDataset(dataset, classification=None):
    """Count, per app, how many of its FQDNs fall into each reporting category.

    The raw classification is mapped to the reporting categories with the same
    rules as the FQDN summary table: names that still contain a dot and
    "Location" become "Other"; "Profiling" and any label containing
    "Identification" become "Advertisement and Trackers"; every other label
    is kept.

    Args:
        dataset: iterable of apps, each an iterable of FQDNs (e.g. a dict keyed by FQDN).
        classification: mapping FQDN -> raw label. Defaults to the notebook's
            ``classifiedMap``.

    Returns:
        list with one ``{category: number of FQDNs}`` dict per app, in dataset order.

    Raises:
        KeyError: if an FQDN is missing from ``classification``.
    """
    if classification is None:
        classification = classifiedMap

    result = []
    for app in dataset:
        perCategory = {}
        for fqdn in app:
            label = classification[fqdn]
            if "." in label or label == "Location":
                key = "Other"
            elif label == "Profiling" or "Identification" in label:
                # Kept in sync with the summary table, which also folds
                # "Identification" into the advertisement/tracker category.
                key = "Advertisement and Trackers"
            else:
                key = label
            perCategory[key] = perCategory.get(key, 0) + 1
        result.append(perCategory)
    return result



In [None]:
iot_app_classification = getAppClassificationDataset(iotSetSanatizedSubdomains)
gp_app_classifictaion = getAppClassificationDataset(generalSetSanitizedSubdomains)


In [None]:
def countApps(dataset):
    """Return, per key, the number of app dicts in ``dataset`` that contain that key."""
    appsPerKey = {}
    for app in dataset:
        for name in app.keys():
            appsPerKey[name] = appsPerKey.get(name, 0) + 1
    return appsPerKey

In [None]:
app_count_iot = countApps(iot_app_classification)
app_count_gp = countApps(gp_app_classifictaion)

In [None]:
app_count_gp

In [None]:
# Number of distinct FQDNs per reporting category, for each dataset and for the overlaps.
# NOTE: `verified` is rebound here to a dict of counts; the cell that loads the
# verified dataset uses the same name for the loaded apps, so re-run that cell
# before using `verified` as a dataset again.
gp = {}
iot = {}
shared_domains = {}
verified = {}
shared_domains_verified = {}


for d in allSubDomains:
    # Map the raw classification to the reporting category.
    key = ""
    if "." in classifiedMap[d]:
        # Unclassified names are returned unchanged by the classifier and still contain a dot.
        key = "Other"
    elif "Location" == classifiedMap[d]:
        key = "Other"
    elif "Profiling" == classifiedMap[d] or "Identification" in classifiedMap[d]:
        key = "Advertisement and Trackers"
    else:
        key = classifiedMap[d]


    if d in verified_subdomainSet:
        verified[key] = verified.get(key, 0) + 1


    # FQDNs shared between the general and the verified dataset.
    if d in gp_subdomainSet and d in verified_subdomainSet:
        shared_domains_verified[key] = shared_domains_verified.get(key, 0) + 1 #countIoT[d]


    # FQDNs shared between the general and the IoT dataset.
    if d in gp_subdomainSet and d in iot_subdomainSet:
        shared_domains[key] = shared_domains.get(key, 0) + 1 #countIoT[d]
    if d in gp_subdomainSet:
        gp[key] = gp.get(key, 0) + 1 #countGP[d]
        #source.append(0)
        #target.append(targetId)
        #values.append(1)
    if d in iot_subdomainSet:
        iot[key] = iot.get(key, 0) + 1 #countIoT[d]

In [None]:
# Every reporting category that occurs in at least one of the summary dicts.
allKeys = set()
for categoryCounts in (gp, iot, shared_domains, shared_domains_verified, verified):
    allKeys.update(categoryCounts)

In [None]:
def get_classification_per_app_dataset(dataset, keys=None):
    """Pivot per-app category counts into one list of counts per category.

    Args:
        dataset: iterable of per-app mappings (category -> count); each item
            must provide ``.get``.
        keys: categories to report. When omitted, the notebook-level
            ``allKeys`` is used (looked up at call time), which is the
            original behaviour.

    Returns:
        dict mapping each category to a list with one entry per app, in
        dataset order; an app that lacks the category contributes 0.
    """
    if keys is None:
        keys = allKeys  # notebook global built by the preceding cell
    # Snapshot and de-duplicate so one-shot iterables and repeated labels
    # cannot produce double-counted columns.
    keys = list(dict.fromkeys(keys))

    result = {key: [] for key in keys}
    for app in dataset:
        for key in keys:
            result[key].append(app.get(key, 0))
    return result

In [None]:
# Import statistics Library
import statistics

In [None]:
# Pivot the per-app dicts into {category: [count per app, ...]} so that
# per-category statistics (mean, stdev, max) can be computed below.
iot_app_classified_prepared = get_classification_per_app_dataset(iot_app_classification)
gp_app_classified_prepared = get_classification_per_app_dataset(gp_app_classifictaion)

In [None]:
# Same two steps for the verified dataset: classify per app, then pivot to
# {category: [count per app, ...]}.
verified_app_classifictaion = getAppClassificationDataset(verifiedSetSanatizedSubdomains)

verified_app_classified_prepared = get_classification_per_app_dataset(verified_app_classifictaion)

In [None]:
# Per category: number of verified per-app dicts that contain it.
app_count_verified = countApps(verified_app_classifictaion)

In [None]:
# One column per category; the order of each value list must match
# index_verified below.
# NOTE: statistics.stdev raises StatisticsError when a list holds fewer than
# two values, so each dataset needs at least two apps.
tableDict_verified = {}
for key in allKeys:
    ver_counts = verified_app_classified_prepared[key]
    gp_counts = gp_app_classified_prepared[key]
    iot_counts = iot_app_classified_prepared[key]

    # "mean (stdev)" label per dataset, both rounded to two decimals.
    verified_string, gp_string, iot_string = (
        f"{round(statistics.mean(counts), 2)} ({round(statistics.stdev(counts), 2)})"
        for counts in (ver_counts, gp_counts, iot_counts)
    )

    tableDict_verified[key] = [
        # number of apps per dataset that mention the category
        app_count_verified.get(key, 0),
        app_count_gp.get(key, 0),
        app_count_iot.get(key, 0),
        # tallies from the subdomain counting loop
        verified.get(key, 0),
        gp.get(key, 0),
        iot.get(key, 0),
        shared_domains.get(key, 0),
        shared_domains_verified.get(key, 0),
        # largest per-app count
        max(ver_counts),
        max(gp_counts),
        max(iot_counts),
        # mean (stdev) labels
        verified_string,
        gp_string,
        iot_string,
    ]

index_verified = [
    "IoT-VER",
    "GP-2022",
    "IoT-2022",
    "IoT-VER FQDNs",
    "GP-2022 FQDNs",
    "IoT-2022 FQDNs",
    "Shared Domains",
    "Shared Domains-VER",
    "max-IoT-VER",
    "max-GP",
    "max-IoT",
    "Avg (Std) VER",
    "Avg (Std) GP",
    "Avg (Std) IoT",
]

In [None]:
# Columns are the category keys of tableDict_verified; rows are labelled by
# index_verified.
df_verified = pd.DataFrame(tableDict_verified, index = index_verified)

# Emit the raw (unformatted) table as LaTeX source.
print(df_verified.to_latex())

In [None]:
# Display the table for inspection.
df_verified

In [None]:
# Put the category columns into sorted (alphabetical) order.
df_verified = df_verified.reindex(columns=sorted(df_verified.columns))

In [None]:
# Sorted list of column labels; only the commented-out cells below refer to it
# within this part of the notebook.
sortedVerifiedColumns = sorted(df_verified.columns)


In [None]:
#sortedVerifiedColumns[4] = 'Social Networks'
#sortedVerifiedColumns[5] = 'Other'


In [None]:
#df_verified = df_verified.reindex(columns=sortedVerifiedColumns)
#df_verified.iloc[1]

In [None]:
# Turn the first three rows into "count (percent%)" strings.
# Row positions follow the order in which the frame was built:
# 0 = "IoT-VER", 1 = "GP-2022", 2 = "IoT-2022".
# NOTE(review): 947 / 1260 / 9889 are presumably the total number of apps in
# the GP / IoT / verified datasets - TODO confirm and move to named constants.
# NOTE: this cell rewrites rows 0-2 in place, so it is meant to run once per
# freshly built df_verified.
_as_percent = "{0:.2f}%".format

formatted_GP = (df_verified.iloc[1] * 100 / 947).apply(_as_percent)
formatted_IoT = (df_verified.iloc[2] * 100 / 1260).apply(_as_percent)
formatted_verified = (df_verified.iloc[0] * 100 / 9889).apply(_as_percent)

formatted_verified_row = [
    f"{value} ({formatted_verified[name]})"
    for name, value in df_verified.iloc[0].items()
]

formatted_iot_row = [
    f"{value} ({formatted_IoT[name]})"
    for name, value in df_verified.iloc[2].items()
]

formatted_gp_row = [
    f"{value} ({formatted_GP[name]})"
    for name, value in df_verified.iloc[1].items()
]

# All three rows are computed from the numeric values before any is replaced.
df_verified.iloc[0] = formatted_verified_row
df_verified.iloc[2] = formatted_iot_row
df_verified.iloc[1] = formatted_gp_row

# Result is discarded here: this is not the last expression of the cell.
df_verified.transpose()

print(df_verified.to_latex())
print(df_verified.transpose().to_latex())

# Remove the per-app maxima rows, then fix the final row order of the table.
df_verified = df_verified.drop(['max-GP', 'max-IoT', "max-IoT-VER"])

df_verified = df_verified.reindex(
    [
        "IoT-VER FQDNs",
        "GP-2022 FQDNs",
        "IoT-2022 FQDNs",
        "Shared Domains",
        "Shared Domains-VER",
        "IoT-VER",
        "GP-2022",
        "IoT-2022",
        "Avg (Std) VER",
        "Avg (Std) GP",
        "Avg (Std) IoT",
    ]
)




In [None]:
# Emit the final table (categories as rows) as LaTeX source.
print(df_verified.transpose().to_latex())

In [None]:
# Display the final table with categories as rows.
df_verified.transpose()

In [None]:
# GP vs. IoT table without the verified dataset: one column per category; the
# order of each value list must match `index` below.
# NOTE: statistics.stdev raises StatisticsError when a list holds fewer than
# two values, so each dataset needs at least two apps.
tableDict = {}
for key in allKeys:
    gp_counts = gp_app_classified_prepared[key]
    iot_counts = iot_app_classified_prepared[key]

    # "mean (stdev)" label per dataset, both rounded to two decimals.
    gp_string, iot_string = (
        f"{round(statistics.mean(counts), 2)} ({round(statistics.stdev(counts), 2)})"
        for counts in (gp_counts, iot_counts)
    )

    tableDict[key] = [
        app_count_gp.get(key, 0),
        app_count_iot.get(key, 0),
        gp.get(key, 0),
        iot.get(key, 0),
        shared_domains.get(key, 0),
        max(gp_counts),
        max(iot_counts),
        gp_string,
        iot_string,
    ]

index = [
    "GP-2022",
    "IoT-2022",
    "GP-2022 FQDNs",
    "IoT-2022 FQDNs",
    "Shared Domains",
    "max-GP",
    "max-IoT",
    "Avg (Std) GP",
    "Avg (Std) IoT",
]

In [None]:
# Columns are the category keys of tableDict; rows are labelled by `index`.
df = pd.DataFrame(tableDict, index = index)

# Emit the raw (unformatted) table as LaTeX source.
print (df.to_latex())

In [None]:
# Display the raw table for inspection.
df

In [None]:
# Put the category columns into sorted (alphabetical) order.
df = df.reindex(columns=sorted(df.columns))

In [None]:
# Mutable copy of the sorted column labels; the next cell edits it by position.
sortedColumns = sorted(df.columns)

In [None]:
# Overwrite positions 4 and 5 so that 'Other' comes after 'Social Networks'.
# NOTE(review): this assumes the sorted list has at least six labels and that
# slots 4 and 5 currently hold exactly 'Other' and 'Social Networks' - TODO
# confirm. With any other category set this would duplicate one label and drop
# another (or raise IndexError).
sortedColumns[4] = 'Social Networks'
sortedColumns[5] = 'Other'

In [None]:
# Apply the manually adjusted column order. reindex fills any label that is
# not an existing column with NaN instead of raising.
df = df.reindex(columns=sortedColumns)

In [None]:
# Inspect the second row (labelled "IoT-2022" when the frame was built).
df.iloc[1]

In [None]:
# Display the table after column reordering.
df

In [None]:
# Row 0 ("GP-2022" counts) as a percentage string with two decimals.
# NOTE(review): 947 is presumably the total number of GP apps - TODO confirm.
formatted_GP = (df.iloc[0]*100/947).apply("{0:.2f}%".format)

In [None]:
# Row 1 ("IoT-2022" counts) as a percentage string with two decimals.
# NOTE(review): 1260 is presumably the total number of IoT apps - TODO confirm.
formatted_IoT = (df.iloc[1]*100/1260).apply("{0:.2f}%".format)

In [None]:
# "count (percent%)" label for every category in row 1.
formatted_iot_row = [
    f"{value} ({formatted_IoT[name]})"
    for name, value in df.iloc[1].items()
]

In [None]:
# "count (percent%)" label for every category in row 0.
formatted_gp_row = [
    f"{value} ({formatted_GP[name]})"
    for name, value in df.iloc[0].items()
]

In [None]:
# Replace the numeric row 1 with its formatted labels (in place; re-running the
# formatting cells afterwards would operate on strings).
df.iloc[1] = formatted_iot_row

In [None]:
# Replace the numeric row 0 with its formatted labels (in place).
df.iloc[0] = formatted_gp_row

In [None]:
# Display the formatted table with categories as rows.
df.transpose()

In [None]:
# Emit the formatted table as LaTeX source (categories as columns).
print(df.to_latex())

In [None]:
# Emit the formatted table as LaTeX source (categories as rows).
print(df.transpose().to_latex())

In [None]:
# Remove the per-app maxima rows. Not re-runnable: drop raises KeyError once
# the labels are gone.
df = df.drop(['max-GP', 'max-IoT'])

In [None]:
# Display the table without the maxima rows.
df

In [None]:
# Emit the table without the maxima rows as LaTeX source (categories as rows).
print(df.transpose().to_latex())

In [None]:
# Final row order: FQDN tallies first, then app counts, then mean (stdev).
df = df.reindex(["GP-2022 FQDNs","IoT-2022 FQDNs", "Shared Domains","GP-2022","IoT-2022", "Avg (Std) GP","Avg (Std) IoT"])

In [None]:
# Emit the final table as LaTeX source (categories as rows).
print(df.transpose().to_latex())

In [None]:
# Display the final table (categories as columns).
df

In [None]:
# Display the final table with categories as rows.
df.transpose()