In [45]:
import numpy as np
import pandas as pd
import sqlite3
import sys
import os
import re
from urllib.parse import urlparse

In [46]:
# helper functions
def get_host_from_headers(x):
    temp = x.replace('"','').split('],[')
    for t in temp:
            pair = t.replace(']]','').replace('[[','').split(',')
            if "Host" in pair:
                host_value = pair[1]
    return host_value

def get_cookies_from_headers(x):
    ls = x.replace('"','').split('],[')
    for l in ls:
        ta = l.replace(']]','').split(',')
        if "Cookie" in ta:
            return ta[-1]
    return ''
def extract_host_from_url(url_ls):
    return list(map(lambda x: urlparse(x).netloc.split('.')[1] if len(urlparse(x).netloc.split('.')) > 1 else x, url_ls))

def get_site(site_url):
    return site_url.split(".")[1]

def is_third_party(origin, url_host):
    if origin in url_host:
        return False
    return True

def save_df_to_csv(df, file_name):
    df.to_csv(file_name+'.csv', index=False)

def extract_deep_clean_host(url):
    s = url.strip(".au").strip('.uk')
    return s.split('.')[-2] if len(s.split('.')) > 1 else s

In [47]:
data_path = 'exp-data/'
file_name = "nyt-t1"
sql_file = data_path + file_name + '.sqlite'

In [48]:
query = """SELECT sv.site_url, sv.visit_id,
        hr.headers, hr.referrer, hr.url
        FROM http_requests as hr LEFT JOIN site_visits as sv
        ON sv.visit_id = hr.visit_id
        """
# query = """SELECT sv.site_url, sv.visit_id,
#         hr.headers, hr.referrer, hr.url, hr.top_level_url
#         FROM http_requests as hr LEFT JOIN site_visits as sv
#         ON sv.visit_id = hr.visit_id
#         """

In [49]:
conn = sqlite3.connect(sql_file)
http_requests = pd.read_sql_query(query, conn)

In [50]:
http_requests["host"] = list(map(lambda x: get_host_from_headers(x), http_requests.headers.tolist()))
http_requests["cookies"] =  list(map(lambda x: get_cookies_from_headers(x), http_requests.headers.tolist()))
http_requests["origin_site"] = list(map(lambda x: get_site(x), http_requests.site_url))
http_requests["is_third_party"] = list(map(lambda a, b: is_third_party(a, b), http_requests.origin_site, http_requests.host))
http_requests['referrer'] = list(map(lambda x: urlparse(x).netloc, http_requests.referrer))
http_requests['referred_by_third'] = list(map(lambda a, b: is_third_party(a, b), http_requests.origin_site, http_requests.referrer))


In [55]:
data = http_requests[http_requests.is_third_party == True][["origin_site", \
                                                            "host", "referrer", "referred_by_third", "url", "cookies"]].reset_index(drop = True)

In [56]:
data

Unnamed: 0,origin_site,host,referrer,referred_by_third,url,cookies
0,webmd,assets.adobedtm.com,www.webmd.com,False,https://assets.adobedtm.com/2c8c1e17b98c/e6d47...,
1,webmd,s.flocdn.com,www.webmd.com,False,https://s.flocdn.com/@s1/embedded-search/embed...,
2,webmd,www.googleadservices.com,www.webmd.com,False,https://www.googleadservices.com/pagead/conver...,
3,webmd,fonts.googleapis.com,css.webmd.com,False,https://fonts.googleapis.com/css?family=Roboto...,
4,webmd,dyv1bugovvq1g.cloudfront.net,www.webmd.com,False,https://dyv1bugovvq1g.cloudfront.net/25/www.we...,
...,...,...,...,...,...,...
2571,nytimes,s0.2mdn.net,s0.2mdn.net,True,https://s0.2mdn.net/9847206/1606143779106/4151...,
2572,nytimes,s0.2mdn.net,s0.2mdn.net,True,https://s0.2mdn.net/9847206/1606143779106/4151...,
2573,nytimes,s0.2mdn.net,s0.2mdn.net,True,https://s0.2mdn.net/9847206/1606143779106/4151...,
2574,nytimes,dt.adsafeprotected.com,eb6611b34a41a87fee3f06cd0e100508.safeframe.goo...,True,https://dt.adsafeprotected.com/dt?advEntityId=...,


In [37]:
# all entries are referred by third party
# data[data.referred_by_third]

In [57]:
query2 = """SELECT sv.site_url, sv.visit_id,
        jsc.host, jsc.name, jsc.value
        FROM javascript_cookies as jsc LEFT JOIN site_visits as sv
        ON sv.visit_id = jsc.visit_id
        """
cookies = pd.read_sql_query(query2, conn)

In [58]:
cookies['origin_site'] = list(map(lambda x: get_site(x), cookies.site_url))
cookies['is_third_party'] = list(map(lambda a, b: is_third_party(a, b), cookies.origin_site, cookies.host))

In [59]:
cookies_data = cookies[["origin_site", "host", "name", "value", "is_third_party"]]
cookies_data

Unnamed: 0,origin_site,host,name,value,is_third_party
0,webmd,.www.webmd.com,__cfduid,de1f86f9e699cddfc6fcaeed623176bae1606506089,False
1,webmd,.js.webmd.com,__cfduid,d8d4ebd1e437ddcb310e31be0d82e1cdb1606506089,False
2,webmd,.img.webmd.com,__cfduid,d5b4e59d78d86d6c5ab29b6c3e948cd431606506089,False
3,webmd,.css.webmd.com,__cfduid,d74794a4d59fa87e1456c8b173f3ff20e1606506089,False
4,webmd,.webmd.com,test,cookie,False
...,...,...,...,...,...
2148,nytimes,.bluekai.com,bku,5LD99nOUyPI+cnRG,True
2149,nytimes,.bluekai.com,bku,5LD99nOUyPI+cnRG,True
2150,nytimes,.pubmatic.com,pi,156011:2,True
2151,nytimes,.ads.pubmatic.com,PMFREQ_ON,YES,True


In [60]:
cookie_fp = [re.escape(v) for v in set(cookies_data[cookies_data.is_third_party == False].value) if len(v) > 5]
cookie_tp = [re.escape(v) for v in set(cookies_data[cookies_data.is_third_party == True].value) if len(v) > 5]

cookie_fp_string = "|".join(cookie_fp)
cookie_tp_string = "|".join(cookie_tp)

In [61]:
cookie_values = [re.escape(v) for v in set(cookies.value) if len(v) > 5]
cookie_values_string = "|".join(cookie_values)

In [16]:
# len(cookie_values_string)

In [62]:
# http_requests_syncs = http_requests_third_party[http_requests_third_party.url.str.contains(fp_cookie_values, regex=True)]
# http_requests_syncs = http_requests_third_party[http_requests_third_party.url.str.contains(all_cookie_values, regex=True) | http_requests_third_party.referrer.str.contains(all_cookie_values, regex=True)]

cookie_syncs = data[data.url.str.contains(cookie_values_string, regex=True)]
cookie_syncs['share_fp_cookie'] = data.url.str.contains(cookie_fp_string, regex=True)
cookie_syncs['share_tp_cookie'] = data.url.str.contains(cookie_tp_string, regex=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [68]:
cookie_syncs.reset_index(drop = True)

Unnamed: 0,origin_site,host,referrer,referred_by_third,url,cookies,share_fp_cookie,share_tp_cookie
0,webmd,dyv1bugovvq1g.cloudfront.net,www.webmd.com,False,https://dyv1bugovvq1g.cloudfront.net/25/www.we...,,True,False
1,webmd,dpm.demdex.net,www.webmd.com,False,https://dpm.demdex.net/id?d_visid_ver=4.5.2&d_...,,False,True
2,webmd,sb.scorecardresearch.com,www.webmd.com,False,https://sb.scorecardresearch.com/b?c1=2&c2=603...,,True,True
3,webmd,s.tagsrvcs.com,www.webmd.com,False,https://s.tagsrvcs.com/2/587654/analytics.js?p...,,True,False
4,webmd,dpm.demdex.net,www.webmd.com,False,https://dpm.demdex.net/id/rd?d_visid_ver=4.5.2...,demdex=45429554768808693152241494766003155399,False,True
...,...,...,...,...,...,...,...,...
604,nytimes,cm.g.doubleclick.net,googleads.g.doubleclick.net,True,https://cm.g.doubleclick.net/pixel?google_nid=...,IDE=AHWqTUmi4RVj356DXmEaC4Hzv7dq2CVyPkwHafkI0q...,False,True
605,nytimes,image2.pubmatic.com,ads.pubmatic.com,True,https://image2.pubmatic.com/AdServer/Pug?vcode...,KTPCACOOKIE=YES; pi=156011:2; KADUSERCOOKIE=7F...,False,True
606,nytimes,pagead2.googlesyndication.com,eb6611b34a41a87fee3f06cd0e100508.safeframe.goo...,True,https://pagead2.googlesyndication.com/pagead/s...,,True,False
607,nytimes,p.adsymptotic.com,ads.pubmatic.com,True,https://p.adsymptotic.com/d/px?_pid=10291&_psi...,U=8404820d8b2b85b7e285c2b1d4449fa0,False,True


In [65]:
cookie_syncs[cookie_syncs.share_fp_cookie].reset_index(drop = True)

Unnamed: 0,origin_site,host,referrer,referred_by_third,url,cookies,share_fp_cookie,share_tp_cookie
0,webmd,dyv1bugovvq1g.cloudfront.net,www.webmd.com,False,https://dyv1bugovvq1g.cloudfront.net/25/www.we...,,True,False
1,webmd,sb.scorecardresearch.com,www.webmd.com,False,https://sb.scorecardresearch.com/b?c1=2&c2=603...,,True,True
2,webmd,s.tagsrvcs.com,www.webmd.com,False,https://s.tagsrvcs.com/2/587654/analytics.js?p...,,True,False
3,webmd,sb.scorecardresearch.com,www.webmd.com,False,https://sb.scorecardresearch.com/b2?c1=2&c2=60...,UID=16F18428a36a38a2808d1cc1606506090; UIDR=16...,True,True
4,webmd,mb.moatads.com,www.webmd.com,False,https://mb.moatads.com/yi.js?ud=1&qn=%604%7BZE...,,True,False
...,...,...,...,...,...,...,...,...
357,nytimes,securepubads.g.doubleclick.net,www.nytimes.com,False,https://securepubads.g.doubleclick.net/gampad/...,IDE=AHWqTUmi4RVj356DXmEaC4Hzv7dq2CVyPkwHafkI0q...,True,True
358,nytimes,securepubads.g.doubleclick.net,www.nytimes.com,False,https://securepubads.g.doubleclick.net/gampad/...,IDE=AHWqTUmi4RVj356DXmEaC4Hzv7dq2CVyPkwHafkI0q...,True,True
359,nytimes,cookiex.ngd.yahoo.com,eus.rubiconproject.com,True,https://cookiex.ngd.yahoo.com/ack?xid=4.FSGr1x...,A3=d=AQABBLVXwV8CEO2CuGwdUrBvVWgkVOVIuUIFEgEBA...,True,True
360,nytimes,p.adsymptotic.com,ads.pubmatic.com,True,https://p.adsymptotic.com/d/px?_pid=10291&_psi...,U=8404820d8b2b85b7e285c2b1d4449fa0,True,True


In [66]:
cookie_syncs[cookie_syncs.share_fp_cookie & cookie_syncs.referred_by_third].reset_index(drop = True)

Unnamed: 0,origin_site,host,referrer,referred_by_third,url,cookies,share_fp_cookie,share_tp_cookie
0,webmd,gum.criteo.com,gum.criteo.com,True,https://gum.criteo.com/sid/json?origin=rtus&do...,,True,False
1,webmd,p.adsymptotic.com,ads.pubmatic.com,True,https://p.adsymptotic.com/d/px?_pid=10291&_psi...,U=fbf29550bfb71972cda64a7d6535fdbb,True,True
2,webmd,pixel-sync.sitescout.com,bh.contextweb.com,True,https://pixel-sync.sitescout.com/dmp/pixelSync...,ssi=4df5df60-3216-4835-b3aa-8840de06bef1#16065...,True,False
3,webmd,match.prod.bidr.io,cdn.districtm.io,True,https://match.prod.bidr.io/cookie-sync/districtm,,True,False
4,webmd,match.prod.bidr.io,cdn.districtm.io,True,https://match.prod.bidr.io/cookie-sync/distric...,checkForPermission=ok,True,False
5,webmd,cookiex.ngd.yahoo.com,eus.rubiconproject.com,True,https://cookiex.ngd.yahoo.com/ack?xid=lejC.wIn...,A3=d=AQABBHFWwV8CEBavED6Nut--YHN9-9sAFrcFEgEBA...,True,True
6,webmd,em.licasd.com,i.liadm.com,True,https://em.licasd.com/s?l=c3615784-b77a-4d21-b...,,True,True
7,webmd,gum.criteo.com,gum.criteo.com,True,https://gum.criteo.com/sid/json?origin=rtus&do...,,True,False
8,webmd,gum.criteo.com,gum.criteo.com,True,https://gum.criteo.com/sid/json?origin=rtus&do...,,True,False
9,webmd,match.prod.bidr.io,i.liadm.com,True,https://match.prod.bidr.io/cookie-sync/liveintent,bito=AACsX06_gq0AAA9ws8SYJw; bitoIsSecure=ok,True,False


In [None]:
# http_requests_syncs[http_requests_syncs.share_tp_cookie].shape

In [None]:
# http_requests_syncs[http_requests_syncs.referred_by_third & http_requests_syncs.share_tp_cookie].shape

In [None]:
print(http_requests_syncs[http_requests_syncs.share_fp_cookie].shape[0])
print(http_requests_syncs[http_requests_syncs.share_fp_cookie & http_requests_syncs.referred_by_third].shape[0])
print(http_requests_syncs[http_requests_syncs.share_tp_cookie].shape[0])
print(http_requests_syncs[http_requests_syncs.share_tp_cookie & http_requests_syncs.referred_by_third].shape[0])

In [None]:
print(http_requests_syncs[http_requests_syncs.share_fp_cookie][http_requests_syncs.cookies==""].shape[0])
print(http_requests_syncs[http_requests_syncs.share_fp_cookie][http_requests_syncs.cookies==""][http_requests_syncs.referred_by_third].shape[0])
print(http_requests_syncs[http_requests_syncs.share_tp_cookie][http_requests_syncs.cookies==""].shape[0])
print(http_requests_syncs[http_requests_syncs.share_tp_cookie][http_requests_syncs.cookies==""][http_requests_syncs.referred_by_third].shape[0])

In [None]:
# http_requests_syncs[http_requests_syncs.is_diff_host & http_requests_syncs.share_tp_cookie].shape

In [None]:
# only index 3
print(http_requests_syncs[http_requests_syncs.visit_id==3][http_requests_syncs.share_fp_cookie].shape[0])
print(http_requests_syncs[http_requests_syncs.visit_id==3][http_requests_syncs.share_fp_cookie][http_requests_syncs.referred_by_third].shape[0])
print(http_requests_syncs[http_requests_syncs.visit_id==3][http_requests_syncs.share_tp_cookie].shape[0])
print(http_requests_syncs[http_requests_syncs.visit_id==3][http_requests_syncs.share_tp_cookie][http_requests_syncs.referred_by_third].shape[0])

In [None]:
# only index 3
print(http_requests_syncs[http_requests_syncs.visit_id==3][http_requests_syncs.cookies==""][http_requests_syncs.share_fp_cookie].shape[0])
print(http_requests_syncs[http_requests_syncs.visit_id==3][http_requests_syncs.cookies==""][http_requests_syncs.share_fp_cookie][http_requests_syncs.referred_by_third].shape[0])
print(http_requests_syncs[http_requests_syncs.visit_id==3][http_requests_syncs.cookies==""][http_requests_syncs.share_tp_cookie].shape[0])
print(http_requests_syncs[http_requests_syncs.visit_id==3][http_requests_syncs.cookies==""][http_requests_syncs.share_tp_cookie][http_requests_syncs.referred_by_third].shape[0])


In [None]:
# save_df_to_csv(http_requests_syncs, "cookie_syncs_" +file_name)