In [1]:
import pandas as pd
import glob
import os
import json
import numpy as np
from  cycler import cycler
import fastplot
from collections import Counter, defaultdict
from statistics import mode
import statsmodels.stats.proportion
import re
# Name of the crawl to analyse. For the USA only "us-ohio" is used (names
# starting with "us" are mapped to "USA" when the inputs are loaded).
COUNTRY_NAME = "UK"


def _color_linestyle_cycler():
    """Build a fresh colour + line-style cycler (five entries each)."""
    return (cycler('color', ['r', 'b', 'g', 'purple', 'c']) +
            cycler('linestyle', ['-', '--', '-.', ':', (0, (3, 1, 1, 1))]))


def _latex_plot_args(figsize):
    """Build the keyword arguments shared by the fastplot calls for a given figure size."""
    return {
        "style": "latex",
        "grid": True,
        "figsize": figsize,
        "rcParams": {'text.latex.preamble': '\\usepackage{libertine}\n\\usepackage{setspace}'},
        "legend_args": {"columnspacing": 1, "handlelength": 1.5},
    }


# Property cyclers: lines with markers, and lines only.
CYCLER_LINESPOINTS = _color_linestyle_cycler() + cycler('marker', ['o', 's', 'v', 'd', '^'])
CYCLER_LINES = _color_linestyle_cycler()

# Plot arguments for a regular and for a wide figure.
PLOT_ARGS = _latex_plot_args((4.25, 2.5))
PLOT_ARGS_LARGE = _latex_plot_args((9, 2.75))

# LaTeX labels of the three measurement stages shown in the plots.
BEFORE_STR = "\\emph{Before-Accept}"
AFTER_STR = "\\emph{After-Accept}"
ADDITIONAL_STR = "\\emph{Additional-Visits}"

# Shorter display names for some long category names.
TRANSLATE_CATEGORIES = {
    "Computers Electronics and Technology": "Computers and Technology",
    "Business and Consumer Services": "Business and Services",
    "Heavy Industry and Engineering": "Industry and Engineering",
}



In [2]:
# Load the tracker list, the crawl output and the SimilarWeb site list.
# Files are read inside `with` blocks so the handles are closed right away.
with open("trackers-list.txt", "r") as trackers_file:
    trackers = set(trackers_file.read().splitlines())

df = pd.read_csv(f"../data/{COUNTRY_NAME}_output.csv")
similarweb_websites = defaultdict(set)

# Keep the original crawl name for plot file names and titles; names starting
# with "us" are canonicalised to "USA" for the SimilarWeb lookups below.
# NOTE: this rebinds COUNTRY_NAME, so re-running this cell without re-running
# the configuration cell reads "../data/USA_output.csv" instead.
plot_country_name = COUNTRY_NAME
if COUNTRY_NAME.startswith("us"):
    COUNTRY_NAME = "USA"

with open(f"similarweb/{COUNTRY_NAME}.txt", "r") as websites_file:
    similarweb_websites[COUNTRY_NAME] = set(websites_file.read().splitlines())
    

In [3]:
# Rows whose three cookie columns are all the literal empty JSON list "[]"
# carry no cookie information and are removed. The mask is computed on whole
# columns instead of iterating row by row.
empty_cookies = (
    (df["cookie_first"] == "[]")
    & (df["cookie_click"] == "[]")
    & (df["cookie_internal"] == "[]")
)
rows_to_delete = df.index[empty_cookies].tolist()

# Drop in place (later cells keep using the name `df`) and renumber the index.
df.drop(rows_to_delete, inplace=True)
df.reset_index(drop=True, inplace=True)
print(f"Number of rows deleted: {len(rows_to_delete)}")

Number of rows deleted: 206


In [4]:
# Two-label public suffixes: when a host name ends with one of these, the
# registrable domain is made of the last THREE labels instead of the last two.
# Entries are matched against ".".join(labels[-2:]), so they must not start
# with a dot (the former ".co.in" entry could never match and was dropped;
# "co.in" is still present).
bad_domains = set(
    # co.<country>
    "co.hu co.id co.il co.in co.jp co.kr co.nz co.th co.uk co.ve co.za "
    # com.<country>
    "com.ar com.au com.bd com.br com.co com.com com.do com.ec com.eg "
    "com.kw com.ly com.mx com.my com.ng com.pe com.ph com.pk com.sa "
    "com.sg com.tr com.tw com.ua com.vn "
    # others
    "net.gr org.uk".split()
)

def getGood2LD(fqdn):
    """Return the registrable domain of a host name.

    The result is the last two dot-separated labels of ``fqdn``, or the last
    three (via ``get3LD``) when the last two labels form an entry of
    ``bad_domains`` (e.g. ``co.uk``). One trailing dot is ignored.

    An empty string yields an empty string (it used to raise ``IndexError``).
    """
    # endswith() is safe on "", unlike indexing fqdn[-1].
    if fqdn.endswith("."):
        fqdn = fqdn[:-1]
    names = fqdn.split(".")
    if ".".join(names[-2:]) in bad_domains:
        return get3LD(fqdn)
    return ".".join(names[-2:])

def get3LD(fqdn):
    """Return the last three dot-separated labels of a host name.

    One trailing dot is ignored. Names with fewer than three labels are
    returned unchanged, and an empty string yields an empty string (it used
    to raise ``IndexError``).
    """
    # endswith() is safe on "", unlike indexing fqdn[-1].
    if fqdn.endswith("."):
        fqdn = fqdn[:-1]
    return ".".join(fqdn.split(".")[-3:])

In [5]:
# Sanity check: show the first remaining row of the crawl output.
print(df.iloc[0])

url                                            http://campaign.gov.uk
landing_page                                      https://www.gov.uk/
has_found_banner                                                False
cookie_first        [{"domain": "www.gov.uk", "expires": 174115728...
cookie_click        [{"domain": "www.gov.uk", "expires": 174115728...
cookie_internal     [{"domain": "www.gov.uk", "expires": 174115728...
domains_first                                          ["www.gov.uk"]
domains_click                                          ["www.gov.uk"]
Name: 0, dtype: object


In [6]:
# Host part of each URL: the third "/"-separated piece of "scheme://host/...".
hosts = df["url"].apply(lambda u: u.split("/")[2])

# "sld" holds the registrable domain; "tld" holds the last three labels of the
# host (despite its name it is not a top-level domain).
df["sld"] = hosts.apply(lambda host: getGood2LD(host).strip())
df["tld"] = hosts.apply(lambda host: get3LD(host).strip())

In [7]:
# Sanity check: the first row should now carry the "sld" and "tld" columns.
print(df.iloc[0])

url                                            http://campaign.gov.uk
landing_page                                      https://www.gov.uk/
has_found_banner                                                False
cookie_first        [{"domain": "www.gov.uk", "expires": 174115728...
cookie_click        [{"domain": "www.gov.uk", "expires": 174115728...
cookie_internal     [{"domain": "www.gov.uk", "expires": 174115728...
domains_first                                          ["www.gov.uk"]
domains_click                                          ["www.gov.uk"]
sld                                                            gov.uk
tld                                                   campaign.gov.uk
Name: 0, dtype: object


In [8]:
# Flag every row as belonging to the analysed country. website_stats() reads
# this column for each key of similarweb_websites, and the plotting cells
# filter on it.
df[COUNTRY_NAME] = True
print(df.iloc[0])

url                                            http://campaign.gov.uk
landing_page                                      https://www.gov.uk/
has_found_banner                                                False
cookie_first        [{"domain": "www.gov.uk", "expires": 174115728...
cookie_click        [{"domain": "www.gov.uk", "expires": 174115728...
cookie_internal     [{"domain": "www.gov.uk", "expires": 174115728...
domains_first                                          ["www.gov.uk"]
domains_click                                          ["www.gov.uk"]
sld                                                            gov.uk
tld                                                   campaign.gov.uk
UK                                                               True
Name: 0, dtype: object


In [9]:
# SimilarWeb listing; the next cell reads its "category", "country" and
# "website" columns.
similarweb_csv = pd.read_csv("similarweb/together.csv")

In [10]:
# Add one boolean column per SimilarWeb category to the dataframe.

# Shorten the long category names first.
similarweb_csv["category"] = similarweb_csv["category"].apply(
    lambda name: TRANSLATE_CATEGORIES.get(name, name)
)

# Country label to look for in the listing ("HongKong" is written with a space there).
category_name = "Hong Kong" if COUNTRY_NAME == "HongKong" else COUNTRY_NAME

# category -> set of websites listed for this country.
# NOTE(review): `in` is a substring test on the country label, not an
# equality test -- confirm this is intended.
categories = defaultdict(set)
for entry in similarweb_csv.itertuples():
    if entry.category == "ALL" or entry.country not in category_name:
        continue
    categories[entry.category].add(entry.website)

# A row belongs to a category when either its "sld" or its "tld" value is listed.
for category, websites in categories.items():
    df[category] = df.sld.isin(websites) | df.tld.isin(websites)
print(df[df["Adult"]==True])

                            url                  landing_page  \
9     http://xmegaxvideoxxx.com       https://bigsport.today/   
40             http://erome.com        https://www.erome.com/   
43            http://fansly.com           https://fansly.com/   
62       http://boyfriendtv.com  https://www.boyfriendtv.com/   
102              http://dsg.bio         https://xhamster.com/   
...                         ...                           ...   
1824        http://pornpics.com     https://www.pornpics.com/   
1844         http://thisvid.com          https://thisvid.com/   
1884    http://sexfamilysim.net  https://en.simfamilysex.net/   
1982         http://tnaflix.com      https://www.tnaflix.com/   
1984            http://txxx.com             https://txxx.com/   

      has_found_banner                                       cookie_first  \
9                False  [{"domain": "bigsport.today", "expires": 17096...   
40               False  [{"domain": "www.erome.com", "expires": 1

In [11]:
# Extract the tracker domains from a JSON cookie list (used for the
# before-accept, after-accept and internal-visit cookie columns).
def get_trackers(s):
    """Return the tracker domains found in a JSON-encoded list of cookies.

    A cookie counts when its ``domain`` itself, its ``getGood2LD`` form or its
    ``get3LD`` form is in the global ``trackers`` set AND its ``expires`` value
    is greater than 0 (presumably this excludes session cookies -- confirm
    against the crawler's cookie format).

    Returns a list of distinct ``getGood2LD`` domains, in no particular order.
    """
    found = set()
    for cookie in json.loads(s):
        domain = cookie["domain"]
        is_tracker = (
            domain in trackers
            or getGood2LD(domain) in trackers
            or get3LD(domain) in trackers
        )
        if is_tracker and cookie["expires"] > 0:
            found.add(getGood2LD(domain))
    return list(found)

# One tracker-list column per cookie column.
for tracker_column, cookie_column in (
    ("trackers_first", "cookie_first"),
    ("trackers_click", "cookie_click"),
    ("trackers_internal", "cookie_internal"),
):
    df[tracker_column] = df[cookie_column].apply(get_trackers)

In [12]:

def website_stats(this_df):
    """Aggregate all rows of one website into a single summary Series.

    Intended to be called through ``groupby(...).apply`` with the rows that
    share one ``sld`` value.

    Returned fields:
      count                    number of rows in ``this_df``
      trackers_first/click/internal
                               union of the row-level tracker lists
      trackers_only_click      trackers in the click union but not in the first union
      trackers_*_all           trackers present in EVERY row (intersection)
      trackers_incremental_nb  list with, after each row, the number of distinct
                               click trackers seen so far
      tp_first / tp_click      union of getGood2LD() of the non-empty domains in
                               the JSON ``domains_first`` / ``domains_click`` columns
      has_found_banner         True if any row has a truthy ``has_found_banner``
      <country>, <category>    one flag per key of ``similarweb_websites`` and
                               ``categories``, OR-ed over the rows

    Raises IndexError when ``this_df`` has no rows (the intersections are
    seeded from the first row).
    """
    num = len(this_df.index)
    # Unions across rows start empty.
    trackers_first = set()
    trackers_click = set()
    trackers_internal = set()
    
    # Intersections across rows start from the first row's values.
    trackers_first_all = set(this_df.iloc[0].trackers_first)
    trackers_click_all = set(this_df.iloc[0].trackers_click)
    trackers_internal_all = set(this_df.iloc[0].trackers_internal)
    # Per-row click tracker sets, in row order (for the cumulative counts).
    trackers_details = []

    has_found_banner = False
    
    # Domains contacted before / after the click, reduced with getGood2LD().
    tp_first = set()
    tp_click = set()
    
    # One flag per country and per category, all initially False.
    countries = { c:False for c in similarweb_websites.keys()}
    cats = { c:False for c in categories.keys()}
    
    
    for i, row in this_df.iterrows():

        trackers_first |= set(row.trackers_first)
        trackers_click |= set(row.trackers_click)
        trackers_internal |= set(row.trackers_internal)
        
        trackers_first_all &= set(row.trackers_first)
        trackers_click_all &= set(row.trackers_click)
        trackers_internal_all &= set(row.trackers_internal)
        
        trackers_details.append(set(row.trackers_click))
        
        # Empty domain strings are skipped before calling getGood2LD().
        tp_first |= set([ getGood2LD(d) for d in json.loads(row.domains_first) if d != ""] )
        tp_click |= set([ getGood2LD(d) for d in json.loads(row.domains_click) if d != ""])      
        
        # A flag becomes true as soon as one row has it set.
        for c in list(similarweb_websites.keys()):
            countries[c] |= row[c]

        for c in list(categories.keys()):
            cats[c] |= row[c]
            
        if row.has_found_banner:
            has_found_banner=True
        
    trackers_only_click = trackers_click - trackers_first
    
    # Running count of distinct click trackers after each row.
    trackers_acc = set()
    trackers_incremental = []
    for s in trackers_details:
        trackers_acc |= s
        trackers_incremental.append(len(trackers_acc))
    
    return pd.Series({"count": num,
                      "trackers_first": trackers_first,
                      "trackers_click": trackers_click,
                      "trackers_only_click": trackers_only_click,
                      "trackers_internal": trackers_internal,
                      
                      "trackers_first_all": trackers_first_all,
                      "trackers_click_all": trackers_click_all,
                      "trackers_internal_all": trackers_internal_all,
                      
                      "trackers_incremental_nb": trackers_incremental,
                    
                      "tp_first": tp_first,
                      "tp_click": tp_click,
                      "has_found_banner": has_found_banner,
                      **countries, **cats
                     })
# One summary row per website ("sld"). include_groups=False is passed to
# groupby.apply so that website_stats() is called without the grouping column;
# reset_index() then turns "sld" back into a regular column.
grouped = df.groupby(["sld"]).apply(website_stats,include_groups=False).reset_index()
grouped

Unnamed: 0,sld,count,trackers_first,trackers_click,trackers_only_click,trackers_internal,trackers_first_all,trackers_click_all,trackers_internal_all,trackers_incremental_nb,...,Jobs_and_Career,Law_and_Government,News_and_Media,Pets_and_Animals,Reference_Materials,Science_and_Education,Sports,Travel_and_Tourism,Vehicles,Lifestyle
0,123rf.com,1,"{twitter.com, tiktok.com, quantserve.com, doub...","{twitter.com, tiktok.com, quantserve.com, doub...",{},"{twitter.com, tiktok.com, quantserve.com, doub...","{quantserve.com, tiktok.com, twitter.com, doub...","{quantserve.com, tiktok.com, twitter.com, doub...","{quantserve.com, tiktok.com, twitter.com, doub...",[4],...,False,False,False,False,False,False,False,False,False,False
1,16personalities.com,1,{},{},{},{},{},{},{},[0],...,True,False,False,False,False,False,False,False,False,False
2,1hd.to,1,{rtmark.net},{rtmark.net},{},{rtmark.net},{rtmark.net},{rtmark.net},{rtmark.net},[1],...,False,False,False,False,False,False,False,False,False,False
3,23andme.com,1,{},"{semasio.net, adform.net, bidswitch.net, bluek...","{semasio.net, adform.net, bidswitch.net, bluek...","{semasio.net, adform.net, bidswitch.net, bluek...",{},"{semasio.net, adform.net, bidswitch.net, bluek...","{semasio.net, adform.net, bidswitch.net, bluek...",[15],...,False,False,False,False,False,False,False,False,False,False
4,24timezones.com,1,{doubleclick.net},{doubleclick.net},{},{doubleclick.net},{doubleclick.net},{doubleclick.net},{doubleclick.net},[1],...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1866,zoe.com,1,{},"{twitter.com, bing.com, tiktok.com, clarity.ms}","{twitter.com, bing.com, tiktok.com, clarity.ms}","{twitter.com, bing.com, tiktok.com, clarity.ms}",{},"{bing.com, tiktok.com, twitter.com, clarity.ms}","{bing.com, tiktok.com, twitter.com, clarity.ms}",[4],...,False,False,False,False,False,False,False,False,False,False
1867,zoom.us,1,{},"{zoominfo.com, g2crowd.com, pubmatic.com, bing...","{zoominfo.com, g2crowd.com, pubmatic.com, bing...","{zoominfo.com, g2crowd.com, pubmatic.com, bing...",{},"{zoominfo.com, g2crowd.com, pubmatic.com, bing...","{zoominfo.com, g2crowd.com, pubmatic.com, bing...",[14],...,False,False,False,False,False,False,False,False,False,False
1868,zoopla.co.uk,1,{},{},{},{},{},{},{},[0],...,False,False,False,False,False,False,False,False,False,False
1869,zooplus.co.uk,1,{},"{demdex.net, criteo.com, doubleclick.net}","{criteo.com, doubleclick.net, demdex.net}","{demdex.net, criteo.com, bing.com, doubleclick...",{},"{criteo.com, doubleclick.net, demdex.net}","{criteo.com, doubleclick.net, bing.com, demdex...",[3],...,False,False,False,True,False,False,False,False,False,False


In [13]:
# Sanity check: countries for which a SimilarWeb website list was loaded.
print(similarweb_websites.keys())

dict_keys(['UK'])


In [14]:
# Average number of trackers per website in each category at the three
# measurement stages, with an interval around each mean drawn as an error bar.

# Multiplier of the standard error used for the interval half width.
# NOTE(review): 1.96 is the usual value for a 95% interval -- confirm that
# 1.95 is intended.
_CI_Z = 1.95


def _mean_interval(counts):
    """Return (lower, upper) = mean -/+ _CI_Z * std / sqrt(n), rounded to 2 decimals."""
    center = np.mean(counts)
    half_width = _CI_Z * np.std(counts) / np.sqrt(len(counts))
    return (round(center - half_width, 2), round(center + half_width, 2))


plot_data = []
plot_data_error = []

for category in categories.keys():
    this_grouped = grouped[grouped[category] == True]
    this_grouped = this_grouped[this_grouped[[COUNTRY_NAME]].any(axis=1)]

    # Number of trackers per website at each stage. Every website of the
    # category is included: the former `len(s) >= 0` filters were always true.
    # NOTE(review): if only websites with at least one tracker were meant to
    # be counted, filter here.
    tracker_first = this_grouped["trackers_first"].apply(len)
    tracker_click = this_grouped["trackers_click"].apply(len)
    tracker_internal = this_grouped["trackers_internal"].apply(len)

    plot_data.append({"category": category,
                      BEFORE_STR: tracker_first.mean(),
                      AFTER_STR: tracker_click.mean(),
                      ADDITIONAL_STR: tracker_internal.mean(),
                     })

    # Each interval uses the spread of its OWN series (the Additional-Visits
    # interval previously used the standard deviation of the click series).
    plot_data_error.append({"category": category,
                            BEFORE_STR: _mean_interval(tracker_first),
                            AFTER_STR: _mean_interval(tracker_click),
                            ADDITIONAL_STR: _mean_interval(tracker_internal),
                           })

plot_df = pd.DataFrame(plot_data)
plot_df.index = plot_df["category"]
plot_df = plot_df.sort_values(by = BEFORE_STR, ascending=False)

del plot_df["category"]
plot = fastplot.plot( plot_df[[BEFORE_STR, AFTER_STR, ADDITIONAL_STR] ] ,None, 
               mode='bars_multi', ylabel = '\\begin{center}Average Trackers\\\\Per Website\\end{center}', xticks_rotate = 30,  xticks_fontsize = "small",
               legend = True, legend_loc='upper right', legend_ncol=3,
               grid_axis="y",yticks = (np.arange(0,55,5), None),
               ylim = (0,50), **PLOT_ARGS_LARGE)

# Restyle the bars: the first len(plot_df) patches belong to the first series,
# the next len(plot_df) to the second, and the remaining ones to the third.
# The matching legend patch gets the same style.
ax = plot.gca()
legend_patches = ax.get_legend().get_patches()
n_bars = len(plot_df)
bar_styles = (
    (slice(None, n_bars), "\\\\\\\\\\", "red"),
    (slice(n_bars, 2 * n_bars), "/////", "blue"),
    (slice(2 * n_bars, None), "----", "green"),
)
for series_idx, (bar_slice, hatch, edge_color) in enumerate(bar_styles):
    for p in list(ax.patches[bar_slice]) + [legend_patches[series_idx]]:
        p.set_hatch(hatch)
        p.set_edgecolor(edge_color)
        p.set_facecolor("white")

# Error bars: one vertical segment per bar, at x offsets -0.2 / 0 / +0.2
# around the category position.
errors_by_category = {d["category"]: d for d in plot_data_error}
for i, tup in enumerate(plot_df.itertuples()):
    this_d = errors_by_category[tup.Index]
    ax.plot((i-0.2, i-0.2), this_d[BEFORE_STR], linestyle="-", color="black")
    ax.plot((i, i), this_d[AFTER_STR], linestyle="-", color="black")
    ax.plot((i+0.2, i+0.2), this_d[ADDITIONAL_STR], linestyle="-", color="black")
plot.savefig(f"plots/{plot_country_name}_all_trackers_per_website_category.pdf")
plot.show()
# Absolute increase of the average after accepting the banner.
plot_df["Increase"] = plot_df[AFTER_STR] - plot_df[BEFORE_STR]



  plot.show()


In [15]:
# Average number of trackers per website in each category for the
# Additional-Visits stage only.
plot_data = []
plot_data_error = []

for category in categories.keys():
    this_grouped = grouped[grouped[category] == True]
    this_grouped = this_grouped[this_grouped[[COUNTRY_NAME]].any(axis=1)]

    # Number of trackers per website. Every website of the category is
    # included: the former `len(s) >= 0` filter was always true.
    tracker_internal = this_grouped["trackers_internal"].apply(len)

    plot_data.append({"category": category,
                      ADDITIONAL_STR: tracker_internal.mean(),
                     })

    # Interval around the mean, computed from this series only. It previously
    # used `tracker_click`, a variable left over from another cell, which made
    # the interval wrong and the cell fail on a fresh kernel when run alone.
    center = np.mean(tracker_internal)
    half_width = 1.95 * np.std(tracker_internal) / np.sqrt(len(tracker_internal))
    plot_data_error.append({"category": category,
                            ADDITIONAL_STR: (round(center - half_width, 2),
                                             round(center + half_width, 2)),
                           })

plot_df = pd.DataFrame(plot_data)
plot_df.index = plot_df["category"]
plot_df = plot_df.sort_values(by = ADDITIONAL_STR, ascending=False)

del plot_df["category"]
plot = fastplot.plot( plot_df[[ADDITIONAL_STR] ] ,None, 
               mode='bars_multi', ylabel = '\\begin{center}Average Trackers\\\\Per Website\\end{center}', xticks_rotate = 30,  xticks_fontsize = "small",
               legend = True, legend_loc='upper right', legend_ncol=3,
               grid_axis="y",yticks = (np.arange(0,55,5), None),
               ylim = (0,50), **PLOT_ARGS_LARGE)

plot.suptitle(plot_country_name)

# Restyle the bars and the matching legend patch.
ax = plot.gca()
for p in list(ax.patches[:len(plot_df)]) + [ax.get_legend().get_patches()[0]]:
    p.set_hatch("/////")
    p.set_edgecolor ("red")
    p.set_facecolor ("white")

# NOTE(review): the interval of each category is looked up here but no error
# bar is drawn in this figure -- confirm whether that is intended.
for i, tup in enumerate(plot_df.itertuples()):
    for d in plot_data_error:
        if d["category"] == tup.Index:
            this_d = d
plot.savefig(f"plots/{plot_country_name}_trackers_per_website_category.pdf")
plot.show()



  plot.show()
