In [1]:
import pandas as pd
import glob
import os
import json
import numpy as np
from  cycler import cycler
import fastplot
from collections import Counter, defaultdict
from statistics import mode
import statsmodels.stats.proportion
import re

CYCLER_LINESPOINTS=(cycler('color', ['r', 'b', 'g', 'purple', 'c']) +
                    cycler('linestyle', ['-', '--', '-.', ':', (0, (3, 1, 1, 1)) ]) +
                    cycler('marker', ['o', 's', 'v', 'd', '^' ]))

CYCLER_LINES=(cycler('color', ['r', 'b', 'g', 'purple', 'c']) +
                    cycler('linestyle', ['-', '--', '-.', ':', (0, (3, 1, 1, 1)) ]))

PLOT_ARGS = {"style":"latex", "grid":True, "figsize":(4.25,2.5), "rcParams":{'text.latex.preamble': '\\usepackage{libertine}\n\\usepackage{setspace}'}, "legend_args":{"columnspacing":1, "handlelength":1.5}}
PLOT_ARGS_LARGE = {"style":"latex", "grid":True, "figsize":(9,2.75), "rcParams":{'text.latex.preamble': '\\usepackage{libertine}\n\\usepackage{setspace}'}, "legend_args":{"columnspacing":1, "handlelength":1.5}}

BEFORE_STR = "\\emph{Before-Accept}"
AFTER_STR = "\\emph{After-Accept}"
ADDITIONAL_STR = "\\emph{Additional-Visits}"

TRANSLATE_CATEGORIES = {"Computers Electronics and Technology": "Computers and Technology",
                        "Business and Consumer Services": "Business and Services",
                        "Heavy Industry and Engineering": "Industry and Engineering"}

COUNTRY_TRAD={"it":"Italy", "fr":"France", "de":"Germany", "es":"Spain", "uk":"UK", "us": "US"}

In [2]:
trackers = set( open("trackers-list.txt", "r").read().splitlines() )

similarweb_websites=defaultdict(set)
similarweb_websites['us'] = set(open("similarweb/USA.txt", "r" ).read().splitlines()) 
    

In [3]:
bad_domains=set("co.uk co.jp co.hu co.il com.au co.ve .co.in com.ec com.pk co.th co.nz com.br com.sg com.sa \
com.do co.za com.hk com.mx com.ly com.ua com.eg com.pe com.tr co.kr com.ng com.pe com.pk co.th \
com.au com.ph com.my com.tw com.ec com.kw co.in co.id com.com com.vn com.bd com.ar \
com.co com.vn org.uk net.gr".split())

def getGood2LD(fqdn):
    if fqdn[-1] == ".":
        fqdn = fqdn[:-1]    
    names = fqdn.split(".")
    if ".".join(names[-2:]) in bad_domains:
        return get3LD(fqdn)
    tln_array = names[-2:]
    tln = ""
    for s in tln_array:
        tln = tln + "." + s
    return tln[1:]

def get3LD(fqdn):
    if fqdn[-1] == ".":
        fqdn = fqdn[:-1]
    names = fqdn.split(".")
    tln_array = names[-3:]
    tln = ""
    for s in tln_array:
        tln = tln + "." + s
    return tln[1:]

In [5]:
df = pd.read_csv("../output.csv")

In [6]:
print(df.iloc[0])

url                                               http://character.ai
landing_page                                    https://character.ai/
has_found_banner                                                False
cookie_first        [{'domain': 'www.google.com', 'expires': 17249...
cookie_click        [{'domain': 'www.google.com', 'expires': 17249...
cookie_internal     [{'domain': 'www.google.com', 'expires': 17249...
domains_first       ['static.cloudflareinsights.com', 'www.google....
domains_click       ['character.ai', 'o4504695552606208.ingest.sen...
Name: 0, dtype: object


In [7]:
df["sld"] = df["url"].apply(lambda u: getGood2LD(u.split("/")[2] ).strip() ) 
df["tld"] = df["url"].apply(lambda u: get3LD(u.split("/")[2] ).strip()  ) 

In [8]:
print(df.iloc[0])

url                                               http://character.ai
landing_page                                    https://character.ai/
has_found_banner                                                False
cookie_first        [{'domain': 'www.google.com', 'expires': 17249...
cookie_click        [{'domain': 'www.google.com', 'expires': 17249...
cookie_internal     [{'domain': 'www.google.com', 'expires': 17249...
domains_first       ['static.cloudflareinsights.com', 'www.google....
domains_click       ['character.ai', 'o4504695552606208.ingest.sen...
sld                                                      character.ai
tld                                                      character.ai
Name: 0, dtype: object


In [9]:
df['us'] = True
print(df.iloc[0])

url                                               http://character.ai
landing_page                                    https://character.ai/
has_found_banner                                                False
cookie_first        [{'domain': 'www.google.com', 'expires': 17249...
cookie_click        [{'domain': 'www.google.com', 'expires': 17249...
cookie_internal     [{'domain': 'www.google.com', 'expires': 17249...
domains_first       ['static.cloudflareinsights.com', 'www.google....
domains_click       ['character.ai', 'o4504695552606208.ingest.sen...
sld                                                      character.ai
tld                                                      character.ai
us                                                               True
Name: 0, dtype: object


In [14]:
similarweb_csv = pd.read_csv("similarweb/together.csv")

In [16]:
# adding categories to the dataframe
similarweb_csv["category"] = similarweb_csv["category"].apply(lambda s: TRANSLATE_CATEGORIES[s] if s in TRANSLATE_CATEGORIES else s)
categories = defaultdict(set)

for row in similarweb_csv.itertuples():
    if row.category != "ALL" and row.country in {"France", "USA", "UK"} :
        categories[row.category].add(row.website)
        
for category in categories:
    df[category] = (df.sld.isin(categories[category])) | (df.tld.isin(categories[category]))


In [17]:
print(df.iloc[0])

url                                                                   http://character.ai
landing_page                                                        https://character.ai/
has_found_banner                                                                    False
cookie_first                            [{'domain': 'www.google.com', 'expires': 17249...
cookie_click                            [{'domain': 'www.google.com', 'expires': 17249...
cookie_internal                         [{'domain': 'www.google.com', 'expires': 17249...
domains_first                           ['static.cloudflareinsights.com', 'www.google....
domains_click                           ['character.ai', 'o4504695552606208.ingest.sen...
sld                                                                          character.ai
tld                                                                          character.ai
us                                                                                   True
Adult     

In [18]:
# getting the trackers for before, after and internal banner.
# Note that only checking the tracker that is in the tracker list and not expried
def get_trackers(s):
    d = json.loads(s)
    this_trackers = set()
    for c in d:
        if (c["domain"] in trackers or getGood2LD(c["domain"]) in trackers or get3LD(c["domain"]) in trackers) and c["expires"] > 0:
            this_trackers.add(getGood2LD(c["domain"]))
    return list(this_trackers)

df["trackers_first"] = df["cookie_first"].apply(get_trackers)
df["trackers_click"] = df["cookie_click"].apply(get_trackers)
df["trackers_internal"] = df["cookie_internal"].apply(get_trackers)

JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 3 (char 2)