In [56]:
import numpy as np
import pandas as pd
import sqlite3
import sys
import os
import re
from urllib.parse import urlparse

In [57]:
# helper functions
def get_host_from_headers(x):
    """Return the value of the ``Host`` header from a serialized header string.

    Parameters
    ----------
    x : str
        Header pairs serialized as text; presumably a JSON-style list such as
        ``[["Host","example.com"],["Accept","*/*"]]`` (verify against the
        ``http_requests.headers`` column).

    Returns
    -------
    str
        The ``Host`` value. If the header occurs more than once the last
        occurrence wins; if it does not occur at all an empty string is
        returned instead of raising ``UnboundLocalError``.
    """
    host_value = ''  # default for requests that carry no Host header
    for t in x.replace('"', '').split('],['):
        # Drop the outer brackets, then split on the first comma only so the
        # header name is compared exactly and the value is kept whole.
        name, _, value = t.replace(']]', '').replace('[[', '').partition(',')
        if name == "Host":
            host_value = value
    return host_value

def get_cookies_from_headers(x):
    """Return the value of the ``Cookie`` header from a serialized header string.

    Parameters
    ----------
    x : str
        Header pairs serialized as text; presumably a JSON-style list such as
        ``[["Host","example.com"],["Cookie","a=1; b=2"]]`` (verify against the
        ``http_requests.headers`` column).

    Returns
    -------
    str
        The full value of the first ``Cookie`` header, or ``''`` when the
        request carries none.
    """
    for l in x.replace('"', '').split('],['):
        # Strip both the opening and closing brackets so a Cookie header in
        # first position is recognised, and split on the first comma only so
        # cookie values that themselves contain commas are not truncated.
        name, _, value = l.replace(']]', '').replace('[[', '').partition(',')
        if name == "Cookie":
            return value
    return ''
def extract_host_from_url(url_ls):
    """Map each URL to the second dot-separated label of its network location.

    A URL whose netloc contains no dot (including URLs with an empty netloc)
    is passed through unchanged. Returns a new list in input order.
    """
    hosts = []
    for url in url_ls:
        labels = urlparse(url).netloc.split('.')
        if len(labels) > 1:
            hosts.append(labels[1])
        else:
            hosts.append(url)
    return hosts

def get_site(site_url):
    """Return the second dot-separated piece of ``site_url``.

    For example ``"http://www.webmd.com"`` gives ``"webmd"``. Raises
    ``IndexError`` when ``site_url`` contains no dot.
    """
    pieces = site_url.split(".")
    return pieces[1]

def is_third_party(origin, url_host):
    """Return True when ``origin`` does not occur as a substring of ``url_host``."""
    return origin not in url_host

def save_df_to_csv(df, file_name):
    """Write ``df`` to ``<file_name>.csv`` without the index column."""
    target = file_name + '.csv'
    df.to_csv(target, index=False)

def extract_deep_clean_host(url):
    """Return the label just before the final one of a host name.

    A trailing ``.au`` and then a trailing ``.uk`` are removed first, so
    ``"abc.net.au"`` gives ``"abc"`` and ``"bbc.co.uk"`` gives ``"bbc"``.
    Leading and trailing dots are ignored. A host with no dot left after the
    clean-up is returned as is.

    Note: ``str.strip(".au")`` removes any of the characters ``.``, ``a`` and
    ``u`` from both ends rather than the literal suffix, which mangled hosts
    such as ``"uber.com"``; the suffixes are therefore removed explicitly.
    """
    s = url.strip('.')
    for suffix in ('.au', '.uk'):
        if s.endswith(suffix):
            s = s[:-len(suffix)]
    parts = s.split('.')
    return parts[-2] if len(parts) > 1 else s

In [59]:
# One row per HTTP request, joined to the site visit it belongs to.
# LEFT JOIN keeps requests that have no matching site_visits row; for those
# rows site_url and sv.visit_id come back as NULL.
query = """SELECT sv.site_url, sv.visit_id,
        hr.headers, hr.referrer, hr.url
        FROM http_requests as hr LEFT JOIN site_visits as sv
        ON sv.visit_id = hr.visit_id
        """
# query = """SELECT sv.site_url, sv.visit_id,
#         hr.headers, hr.referrer, hr.url, hr.top_level_url
#         FROM http_requests as hr LEFT JOIN site_visits as sv
#         ON sv.visit_id = hr.visit_id
#         """

In [60]:
# Open the SQLite database and load the joined request rows into a DataFrame.
# NOTE(review): `sql_file` is not defined in any cell shown here, so it must
# already exist in the kernel; confirm where it is set so the notebook
# survives Restart & Run All.
conn = sqlite3.connect(sql_file)
http_requests = pd.read_sql_query(sql=query, con=conn)

In [61]:
# Derive per-request features from the raw header string and the visited site.
http_requests["host"] = [get_host_from_headers(h) for h in http_requests["headers"]]
http_requests["cookies"] = [get_cookies_from_headers(h) for h in http_requests["headers"]]
http_requests["origin_site"] = [get_site(u) for u in http_requests["site_url"]]
http_requests["is_third_party"] = [
    is_third_party(origin, host)
    for origin, host in zip(http_requests["origin_site"], http_requests["host"])
]

# Keep the raw referrer URL in its own column the first time this cell runs.
# Overwriting `referrer` with its netloc directly made the cell non-idempotent:
# on a second run urlparse() received a bare netloc, returned '' and every
# request was then flagged as referred by a third party.
if "referrer_url" not in http_requests.columns:
    http_requests["referrer_url"] = http_requests["referrer"]
http_requests["referrer"] = [urlparse(u).netloc for u in http_requests["referrer_url"]]

# NOTE: a request with an empty referrer is flagged True here, because the
# origin name cannot be a substring of ''.
http_requests["referred_by_third"] = [
    is_third_party(origin, referrer)
    for origin, referrer in zip(http_requests["origin_site"], http_requests["referrer"])
]


In [62]:
# Third-party requests only, reduced to the columns used by the analysis below.
data = (
    http_requests
    .loc[
        http_requests["is_third_party"].eq(True),
        ["origin_site", "host", "referrer", "referred_by_third", "url", "cookies"],
    ]
    .reset_index(drop=True)
)

In [63]:
# Display the third-party request table as the cell output.
data

Unnamed: 0,origin_site,host,referrer,referred_by_third,url,cookies
0,webmd,assets.adobedtm.com,www.webmd.com,False,https://assets.adobedtm.com/2c8c1e17b98c/e6d47...,
1,webmd,www.googleadservices.com,www.webmd.com,False,https://www.googleadservices.com/pagead/conver...,
2,webmd,fonts.googleapis.com,css.webmd.com,False,https://fonts.googleapis.com/css?family=Roboto...,
3,webmd,fonts.googleapis.com,css.webmd.com,False,https://fonts.googleapis.com/css?family=Roboto...,
4,webmd,dyv1bugovvq1g.cloudfront.net,www.webmd.com,False,https://dyv1bugovvq1g.cloudfront.net/25/www.we...,
...,...,...,...,...,...,...
2048,nytimes,cdn.ampproject.org,www.nytimes.com,False,https://cdn.ampproject.org/rtv/012010270040000...,
2049,nytimes,cdn.ampproject.org,www.nytimes.com,False,https://cdn.ampproject.org/rtv/012010270040000...,
2050,nytimes,cdn.ampproject.org,www.nytimes.com,False,https://cdn.ampproject.org/rtv/012010270040000...,
2051,nytimes,cdn.ampproject.org,www.nytimes.com,False,https://cdn.ampproject.org/rtv/012010270040000...,


In [64]:
# all entries are referred by third party
# data[data.referred_by_third]

In [65]:
# Load the rows of the javascript_cookies table together with the site visit
# each one belongs to (LEFT JOIN keeps cookies without a matching visit).
query2 = """SELECT sv.site_url, sv.visit_id,
        jsc.host, jsc.name, jsc.value
        FROM javascript_cookies as jsc LEFT JOIN site_visits as sv
        ON sv.visit_id = jsc.visit_id
        """
cookies = pd.read_sql_query(sql=query2, con=conn)

In [66]:
# Tag every cookie with the visited site and whether its host belongs to it.
cookies['origin_site'] = [get_site(site_url) for site_url in cookies['site_url']]
cookies['is_third_party'] = [
    is_third_party(origin, host)
    for origin, host in zip(cookies['origin_site'], cookies['host'])
]

In [67]:
# Narrow the cookie table to the columns needed for matching, then show it.
cookies_data = cookies.loc[:, ["origin_site", "host", "name", "value", "is_third_party"]]
cookies_data

Unnamed: 0,origin_site,host,name,value,is_third_party
0,webmd,.www.webmd.com,__cfduid,dd673b2700baa134208e3794c02eec1f41606506583,False
1,webmd,.webmd.com,lrt_wrk,lrt1_cached_k8_worker_1_37S_21O_2020-11-18_17:...,False
2,webmd,.webmd.com,gtinfo,"{""ct"":""Columbus"",""c"":""Franklin"",""cc"":""39049"",""...",False
3,webmd,.webmd.com,VisitorId,9831e05f-086b-4123-5987-f9b5df875967,False
4,webmd,.css.webmd.com,__cfduid,dc560e081e9789ea111079dc63ad819771606506583,False
...,...,...,...,...,...
1391,nytimes,.yahoo.com,GUC,AQEBAQFfwqtfy0IcvQQZ,True
1392,nytimes,.pubmatic.com,KTPCACOOKIE,YES,True
1393,nytimes,.analytics.yahoo.com,IDSYNC,18y3~1v0j,True
1394,nytimes,.ads.pubmatic.com,KCCH,YES,True


In [68]:
# Build one alternation regex per party from the distinct cookie values.
# Values of 5 characters or fewer are skipped; each value is regex-escaped so
# it is matched literally.
cookie_fp = [
    re.escape(value)
    for value in set(cookies_data.loc[cookies_data["is_third_party"].eq(False), "value"])
    if len(value) > 5
]
cookie_tp = [
    re.escape(value)
    for value in set(cookies_data.loc[cookies_data["is_third_party"].eq(True), "value"])
    if len(value) > 5
]

cookie_fp_string = "|".join(cookie_fp)
cookie_tp_string = "|".join(cookie_tp)

In [69]:
# Same pattern construction over every cookie value, regardless of party.
cookie_values = [
    re.escape(value)
    for value in set(cookies["value"])
    if len(value) > 5
]
cookie_values_string = "|".join(cookie_values)

In [70]:
# len(cookie_values_string)

In [71]:
# Third-party requests whose URL contains at least one known cookie value.
# .copy() makes cookie_syncs an independent frame, so adding the flag columns
# below no longer triggers SettingWithCopyWarning.
# NOTE: an empty pattern string matches every URL, so these filters are only
# meaningful when the corresponding cookie list is non-empty.
cookie_syncs = data[data["url"].str.contains(cookie_values_string, regex=True)].copy()

# Flag which kind of cookie value appears in each remaining URL. The flags are
# computed on cookie_syncs' own rows, which gives the same values as matching
# all of `data` and aligning on the index, with fewer regex searches.
cookie_syncs["share_fp_cookie"] = cookie_syncs["url"].str.contains(cookie_fp_string, regex=True)
cookie_syncs["share_tp_cookie"] = cookie_syncs["url"].str.contains(cookie_tp_string, regex=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [72]:
# Full table (rows sharing first-party and/or third-party cookie values),
# renumbered from 0 after the filtering above.
cookie_syncs = cookie_syncs.reset_index(drop = True)

In [73]:
# Requests whose URL contains a first-party cookie value.
cookie_syncs.loc[cookie_syncs["share_fp_cookie"]].reset_index(drop=True)

Unnamed: 0,origin_site,host,referrer,referred_by_third,url,cookies,share_fp_cookie,share_tp_cookie
0,webmd,dyv1bugovvq1g.cloudfront.net,www.webmd.com,False,https://dyv1bugovvq1g.cloudfront.net/25/www.we...,,True,False
1,webmd,sb.scorecardresearch.com,www.webmd.com,False,https://sb.scorecardresearch.com/b?c1=2&c2=603...,,True,True
2,webmd,s.tagsrvcs.com,www.webmd.com,False,https://s.tagsrvcs.com/2/587654/analytics.js?p...,,True,False
3,webmd,sb.scorecardresearch.com,www.webmd.com,False,https://sb.scorecardresearch.com/b2?c1=2&c2=60...,UID=12E18428a36a38a281b392c1606506584; UIDR=16...,True,True
4,webmd,mb.moatads.com,www.webmd.com,False,https://mb.moatads.com/yi.js?ud=1&qn=%604%7BZE...,,True,False
...,...,...,...,...,...,...,...,...
267,nytimes,stags.bluekai.com,www.nytimes.com,False,https://stags.bluekai.com/site/50136?limit=1&i...,bkdc=phx; bku=5LD99cNjyPI0ZERF,True,False
268,nytimes,adservice.google.com,5290727.fls.doubleclick.net,True,https://adservice.google.com/ddm/fls/z/src=529...,NID=204=yMh99bQ9AG11gArh5VZA32ANpUgvui-T9uZOxW...,True,False
269,nytimes,securepubads.g.doubleclick.net,www.nytimes.com,False,https://securepubads.g.doubleclick.net/gampad/...,IDE=AHWqTUli2Ug4RelkdVQsl6mdMiY18CRQy0CVBjgL0Z...,True,True
270,nytimes,securepubads.g.doubleclick.net,www.nytimes.com,False,https://securepubads.g.doubleclick.net/gampad/...,IDE=AHWqTUli2Ug4RelkdVQsl6mdMiY18CRQy0CVBjgL0Z...,True,True


In [74]:
# First-party cookie syncing activity (only requests referred by a third-party domain).
cookie_syncs.loc[
    cookie_syncs["share_fp_cookie"] & cookie_syncs["referred_by_third"]
].reset_index(drop=True)

Unnamed: 0,origin_site,host,referrer,referred_by_third,url,cookies,share_fp_cookie,share_tp_cookie
0,webmd,pixel-sync.sitescout.com,ssum-sec.casalemedia.com,True,https://pixel-sync.sitescout.com/dmp/pixelSync...,ssi=b88b91ef-ad81-4e06-8d7b-8209bc57e509#16065...,True,False
1,webmd,i.liadm.com,i.liadm.com,True,https://i.liadm.com/s/e/a-00xm/0/4658be9f3b884...,_li_ss=MgUIBhCPDjIFCAoQjw4yBQh6EI4OMgYIiwEQjw4...,True,True
2,webmd,cookiex.ngd.yahoo.com,eus.rubiconproject.com,True,https://cookiex.ngd.yahoo.com/ack?xid=hTHA3OJw...,A3=d=AQABBF1YwV8CEJA2DW4Mr1L22fxCjYivhF8FEgEBA...,True,True
3,webmd,inv-nets.admixer.net,contextual.media.net,True,https://inv-nets.admixer.net/adxcm.aspx?ssp=D4...,,True,True
4,webmd,gum.criteo.com,gum.criteo.com,True,https://gum.criteo.com/sid/json?origin=rtus&do...,,True,False
5,webmd,www.facebook.com,www.facebook.com,True,https://www.facebook.com/platform/plugin/tab/r...,fr=0tq2xRdwGaoQ7d6ty..BfwVha...1.0.BfwVha.,True,False
6,webmd,p.adsymptotic.com,ads.pubmatic.com,True,https://p.adsymptotic.com/d/px?_pid=10291&_psi...,U=e1868f0bb69d8dc2e5b271a71a9cddbe,True,True
7,webmd,em.licasd.com,i.liadm.com,True,https://em.licasd.com/s?l=cc96dcdb-aca1-4198-a...,,True,True
8,webmd,gum.criteo.com,gum.criteo.com,True,https://gum.criteo.com/sid/json?origin=rtus&do...,,True,False
9,webmd,match.prod.bidr.io,i.liadm.com,True,https://match.prod.bidr.io/cookie-sync/liveintent,,True,False


In [75]:
# Requests whose URL contains a third-party cookie value.
cookie_syncs.loc[cookie_syncs["share_tp_cookie"]].reset_index(drop=True)

Unnamed: 0,origin_site,host,referrer,referred_by_third,url,cookies,share_fp_cookie,share_tp_cookie
0,webmd,dpm.demdex.net,www.webmd.com,False,https://dpm.demdex.net/id?d_visid_ver=4.5.2&d_...,,False,True
1,webmd,sb.scorecardresearch.com,www.webmd.com,False,https://sb.scorecardresearch.com/b?c1=2&c2=603...,,True,True
2,webmd,dpm.demdex.net,www.webmd.com,False,https://dpm.demdex.net/id/rd?d_visid_ver=4.5.2...,demdex=43866217718121268141481295148017842948,False,True
3,webmd,sb.scorecardresearch.com,www.webmd.com,False,https://sb.scorecardresearch.com/b2?c1=2&c2=60...,UID=12E18428a36a38a281b392c1606506584; UIDR=16...,True,True
4,webmd,px.moatads.com,www.webmd.com,False,https://px.moatads.com/pixel.gif?e=17&t=160650...,,False,True
...,...,...,...,...,...,...,...,...
213,nytimes,pixel.rubiconproject.com,eus.rubiconproject.com,True,https://pixel.rubiconproject.com/tap.php?v=898...,khaos=KI0OPTTN-1I-L1MQ; audit=1|BZZ8oNzhrJA1Lr...,False,True
214,nytimes,ads.yahoo.com,eus.rubiconproject.com,True,https://ads.yahoo.com/cms/v1?nwid=10000010181&...,,False,True
215,nytimes,pixel.rubiconproject.com,eus.rubiconproject.com,True,https://pixel.rubiconproject.com/tap.php?v=422...,khaos=KI0OPTTN-1I-L1MQ; audit=1|BZZ8oNzhrJA1Lr...,False,True
216,nytimes,s.amazon-adsystem.com,,True,https://s.amazon-adsystem.com/ecm3?id=34949062...,ad-id=A7Vtk8xGAE4RoZw-YGFBwXo; ad-privacy=0,False,True


In [76]:
# Third-party cookie syncing activity (only requests referred by a third-party domain).
cookie_syncs.loc[
    cookie_syncs["share_tp_cookie"] & cookie_syncs["referred_by_third"]
].reset_index(drop=True)

Unnamed: 0,origin_site,host,referrer,referred_by_third,url,cookies,share_fp_cookie,share_tp_cookie
0,webmd,cm.g.doubleclick.net,ssum-sec.casalemedia.com,True,https://cm.g.doubleclick.net/pixel?google_nid=...,IDE=AHWqTUk5UiBOU0d2HbTSKf5GKEa93JnaoAS65cMmKz...,False,True
1,webmd,match.adsrvr.org,ssum-sec.casalemedia.com,True,https://match.adsrvr.org/track/cmf/casale?cm_u...,TDID=a9fd6414-644c-43cc-a830-07f62ae7204d; TDC...,False,True
2,webmd,s.amazon-adsystem.com,ssum-sec.casalemedia.com,True,https://s.amazon-adsystem.com/dcm?pid=78af914c...,ad-id=A9y_eemhaE1wgn6djABN3so; ad-privacy=0,False,True
3,webmd,ups.analytics.yahoo.com,ssum-sec.casalemedia.com,True,https://ups.analytics.yahoo.com/ups/55940/sync...,A3=d=AQABBF1YwV8CEJA2DW4Mr1L22fxCjYivhF8FEgEBA...,False,True
4,webmd,s.amazon-adsystem.com,ssum-sec.casalemedia.com,True,https://s.amazon-adsystem.com/ecm3?ex=index.co...,ad-id=A9y_eemhaE1wgn6djABN3so; ad-privacy=0,False,True
...,...,...,...,...,...,...,...,...
125,nytimes,pixel.rubiconproject.com,eus.rubiconproject.com,True,https://pixel.rubiconproject.com/tap.php?v=898...,khaos=KI0OPTTN-1I-L1MQ; audit=1|BZZ8oNzhrJA1Lr...,False,True
126,nytimes,ads.yahoo.com,eus.rubiconproject.com,True,https://ads.yahoo.com/cms/v1?nwid=10000010181&...,,False,True
127,nytimes,pixel.rubiconproject.com,eus.rubiconproject.com,True,https://pixel.rubiconproject.com/tap.php?v=422...,khaos=KI0OPTTN-1I-L1MQ; audit=1|BZZ8oNzhrJA1Lr...,False,True
128,nytimes,s.amazon-adsystem.com,,True,https://s.amazon-adsystem.com/ecm3?id=34949062...,ad-id=A7Vtk8xGAE4RoZw-YGFBwXo; ad-privacy=0,False,True


In [77]:
# Rows that satisfy all three conditions:
#   - the URL contains a third-party cookie value (share_tp_cookie)
#   - the request was referred by a third-party domain (referred_by_third)
#   - the request's own cookie string is empty
cookie_syncs.loc[
    cookie_syncs["share_tp_cookie"]
    & cookie_syncs["referred_by_third"]
    & cookie_syncs["cookies"].eq("")
].reset_index(drop=True)

Unnamed: 0,origin_site,host,referrer,referred_by_third,url,cookies,share_fp_cookie,share_tp_cookie
0,webmd,cm.smadex.com,ssum-sec.casalemedia.com,True,https://cm.smadex.com/sync?sm_did=bds&bds_ssp_...,,False,True
1,webmd,x.dlx.addthis.com,i.liadm.com,True,https://x.dlx.addthis.com/e/live_intent_sync?n...,,False,True
2,webmd,ads.pubmatic.com,contextual.media.net,True,https://ads.pubmatic.com/AdServer/js/user_sync...,,False,True
3,webmd,ads.pubmatic.com,ads.pubmatic.com,True,https://ads.pubmatic.com/AdServer/js/showad.js...,,False,True
4,webmd,inv-nets.admixer.net,contextual.media.net,True,https://inv-nets.admixer.net/adxcm.aspx?ssp=D4...,,True,True
5,webmd,visitor.fiftyt.com,ads.pubmatic.com,True,https://visitor.fiftyt.com/p.gif?ev=sync&p=pm&...,,False,True
6,webmd,pixel.onaudience.com,ads.pubmatic.com,True,https://pixel.onaudience.com/?partner=214&mapp...,,False,True
7,webmd,pagead2.googlesyndication.com,9fa56b9f2db81251d99ed756d6eadcc5.safeframe.goo...,True,https://pagead2.googlesyndication.com/pcs/acti...,,False,True
8,webmd,pagead2.googlesyndication.com,9fa56b9f2db81251d99ed756d6eadcc5.safeframe.goo...,True,https://pagead2.googlesyndication.com/pcs/acti...,,False,True
9,webmd,pagead2.googlesyndication.com,9fa56b9f2db81251d99ed756d6eadcc5.safeframe.goo...,True,https://pagead2.googlesyndication.com/pcs/acti...,,False,True
