In [6]:
import numpy as np
import pandas as pd
import sqlite3
import sys
import os
import re
from urllib.parse import urlparse

In [16]:
# helper functions
def get_host_from_headers(x):
    temp = x.replace('"','').split('],[')
    for t in temp:
            pair = t.replace(']]','').replace('[[','').split(',')
            if "Host" in pair:
                host_value = pair[1]
    return host_value

def get_cookies_from_headers(x):
    ls = x.replace('"','').split('],[')
    for l in ls:
        ta = l.replace(']]','').split(',')
        if "Cookie" in ta:
            return ta[-1]
    return ''
def extract_host_from_url(url_ls):
    return list(map(lambda x: urlparse(x).netloc.split('.')[1] if len(urlparse(x).netloc.split('.')) > 1 else x, url_ls))

def get_site(site_url):
    return site_url.split(".")[1]

def is_third_party(origin, url_host):
    if origin in url_host:
        return False
    return True

def save_df_to_csv(df, file_name):
    df.to_csv(file_name+'.csv', index=False)

def extract_deep_clean_host(url):
    s = url.strip(".au").strip('.uk')
    return s.split('.')[-2] if len(s.split('.')) > 1 else s

In [7]:
data_path = 'exp-data/'
file_name = "nyt-t1"
sql_file = data_path + file_name + '.sqlite'

In [13]:
query = """SELECT sv.site_url, sv.visit_id,
        hr.headers, hr.referrer, hr.url
        FROM http_requests as hr LEFT JOIN site_visits as sv
        ON sv.visit_id = hr.visit_id
        """
# query = """SELECT sv.site_url, sv.visit_id,
#         hr.headers, hr.referrer, hr.url, hr.top_level_url
#         FROM http_requests as hr LEFT JOIN site_visits as sv
#         ON sv.visit_id = hr.visit_id
#         """

In [14]:
conn = sqlite3.connect(sql_file)
http_requests = pd.read_sql_query(query, conn)

In [17]:
http_requests["host"] = list(map(lambda x: get_host_from_headers(x), http_requests.headers.tolist()))
http_requests["cookies"] =  list(map(lambda x: get_cookies_from_headers(x), http_requests.headers.tolist()))
http_requests["origin_site"] = list(map(lambda x: get_site(x), http_requests.site_url))
http_requests["is_third_party"] = list(map(lambda a, b: is_third_party(a, b), http_requests.origin_site, http_requests.host))

In [38]:
http_requests.shape

(3290, 9)

In [41]:
data = http_requests[http_requests.is_third_party == True][["origin_site", "host", "referrer", "url", "cookies"]].reset_index(drop = True)

In [42]:
data

Unnamed: 0,origin_site,host,referrer,url,cookies
0,webmd,assets.adobedtm.com,https://www.webmd.com/search/search_results/de...,https://assets.adobedtm.com/2c8c1e17b98c/e6d47...,
1,webmd,s.flocdn.com,https://www.webmd.com/search/search_results/de...,https://s.flocdn.com/@s1/embedded-search/embed...,
2,webmd,www.googleadservices.com,https://www.webmd.com/search/search_results/de...,https://www.googleadservices.com/pagead/conver...,
3,webmd,fonts.googleapis.com,https://css.webmd.com/dtmcms/live/webmd/PageBu...,https://fonts.googleapis.com/css?family=Roboto...,
4,webmd,dyv1bugovvq1g.cloudfront.net,https://www.webmd.com/search/search_results/de...,https://dyv1bugovvq1g.cloudfront.net/25/www.we...,
...,...,...,...,...,...
2571,nytimes,s0.2mdn.net,https://s0.2mdn.net/9847206/1606143779106/4151...,https://s0.2mdn.net/9847206/1606143779106/4151...,
2572,nytimes,s0.2mdn.net,https://s0.2mdn.net/9847206/1606143779106/4151...,https://s0.2mdn.net/9847206/1606143779106/4151...,
2573,nytimes,s0.2mdn.net,https://s0.2mdn.net/9847206/1606143779106/4151...,https://s0.2mdn.net/9847206/1606143779106/4151...,
2574,nytimes,dt.adsafeprotected.com,https://eb6611b34a41a87fee3f06cd0e100508.safef...,https://dt.adsafeprotected.com/dt?advEntityId=...,


In [33]:
query2 = """SELECT sv.site_url, sv.visit_id,
        jsc.host, jsc.name, jsc.value
        FROM javascript_cookies as jsc LEFT JOIN site_visits as sv
        ON sv.visit_id = jsc.visit_id
        """
cookies = pd.read_sql_query(query2, conn)

In [34]:
cookies

Unnamed: 0,site_url,visit_id,host,name,value
0,https://www.webmd.com/search/search_results/de...,1,.www.webmd.com,__cfduid,de1f86f9e699cddfc6fcaeed623176bae1606506089
1,https://www.webmd.com/search/search_results/de...,1,.js.webmd.com,__cfduid,d8d4ebd1e437ddcb310e31be0d82e1cdb1606506089
2,https://www.webmd.com/search/search_results/de...,1,.img.webmd.com,__cfduid,d5b4e59d78d86d6c5ab29b6c3e948cd431606506089
3,https://www.webmd.com/search/search_results/de...,1,.css.webmd.com,__cfduid,d74794a4d59fa87e1456c8b173f3ff20e1606506089
4,https://www.webmd.com/search/search_results/de...,1,.webmd.com,test,cookie
...,...,...,...,...,...
2148,https://www.nytimes.com/,3,.bluekai.com,bku,5LD99nOUyPI+cnRG
2149,https://www.nytimes.com/,3,.bluekai.com,bku,5LD99nOUyPI+cnRG
2150,https://www.nytimes.com/,3,.pubmatic.com,pi,156011:2
2151,https://www.nytimes.com/,3,.ads.pubmatic.com,PMFREQ_ON,YES


In [10]:
cookies['origin_site'] = list(map(lambda x: get_site(x), cookies.site_url))
cookies['is_third_party'] = list(map(lambda a, b: is_third_party(a, b), cookies.origin_site, cookies.host))

In [11]:
cookies_first_party = cookies[cookies.is_third_party == False]
cookies_third_party = cookies[cookies.is_third_party == True]

In [12]:
cookie_fp_effective = [re.escape(v) for v in set(cookies_first_party.value) if len(v) > 5]
cookie_tp_effective = [re.escape(v) for v in set(cookies_third_party.value) if len(v) > 5]
cookie_effective_all = [re.escape(v) for v in set(cookies.value) if len(v) > 5]

In [13]:
fp_cookie_values = "|".join(cookie_fp_effective)
tp_cookie_values = "|".join(cookie_tp_effective)
all_cookie_values = "|".join(cookie_effective_all)

In [41]:
# len(tp_cookie_values)

In [18]:
http_requests_third_party.shape

(2576, 10)

In [16]:
# http_requests_syncs = http_requests_third_party[http_requests_third_party.url.str.contains(fp_cookie_values, regex=True)]
http_requests_syncs = http_requests_third_party[http_requests_third_party.url.str.contains(all_cookie_values, regex=True) | http_requests_third_party.referrer.str.contains(all_cookie_values, regex=True)]

In [17]:
http_requests_syncs.shape

(1236, 10)

In [112]:
# a = http_requests_third_party.url.str.contains(fp_cookie_values, regex=True).index
# b = http_requests_third_party[http_requests_third_party.url.str.contains(fp_cookie_values, regex=True)]

In [113]:
# b

In [114]:
# b

In [44]:
http_requests_syncs['share_fp_cookie'] = http_requests_third_party.url.str.contains(fp_cookie_values, regex=True)
http_requests_syncs['share_tp_cookie'] = http_requests_third_party.url.str.contains(tp_cookie_values, regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [45]:
# extract third party
http_requests_syncs['referrer_host'] = list(map(lambda x: urlparse(x).netloc, http_requests_syncs.referrer))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [46]:
# can tell wether referrer and url are different
http_requests_syncs['is_diff_host'] = list(map(lambda a, b: a != b, http_requests_syncs.host, http_requests_syncs.referrer_host))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [47]:
http_requests_syncs['referred_by_third'] = list(map(lambda a, b: is_third_party(a, b), http_requests_syncs.origin_site, http_requests_syncs.referrer_host))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [48]:
http_requests_syncs = http_requests_syncs[http_requests_syncs.is_diff_host]
# 609 -> 576

In [49]:
http_requests_syncs.drop(columns=['site_url', 'headers','visit_id', 'is_diff_host'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [50]:
http_requests_syncs

Unnamed: 0,referrer,url,top_level_url,host,cookies,origin_site,is_third_party,share_fp_cookie,share_tp_cookie,referrer_host,is_diff_host,referred_by_third
38,https://www.webmd.com/search/search_results/de...,https://dyv1bugovvq1g.cloudfront.net/25/www.we...,https://www.webmd.com/search/search_results/de...,dyv1bugovvq1g.cloudfront.net,,webmd,True,True,False,www.webmd.com,True,False
47,https://www.webmd.com/search/search_results/de...,https://dpm.demdex.net/id?d_visid_ver=4.5.2&d_...,https://www.webmd.com/search/search_results/de...,dpm.demdex.net,,webmd,True,False,True,www.webmd.com,True,False
51,https://www.webmd.com/search/search_results/de...,https://sb.scorecardresearch.com/b?c1=2&c2=603...,https://www.webmd.com/search/search_results/de...,sb.scorecardresearch.com,,webmd,True,True,True,www.webmd.com,True,False
53,https://www.webmd.com/search/search_results/de...,https://s.tagsrvcs.com/2/587654/analytics.js?p...,https://www.webmd.com/search/search_results/de...,s.tagsrvcs.com,,webmd,True,True,False,www.webmd.com,True,False
68,https://www.webmd.com/search/search_results/de...,https://dpm.demdex.net/id/rd?d_visid_ver=4.5.2...,https://www.webmd.com/search/search_results/de...,dpm.demdex.net,demdex=45429554768808693152241494766003155399,webmd,True,False,True,www.webmd.com,True,False
69,https://www.webmd.com/search/search_results/de...,https://sb.scorecardresearch.com/b2?c1=2&c2=60...,https://www.webmd.com/search/search_results/de...,sb.scorecardresearch.com,UID=16F18428a36a38a2808d1cc1606506090; UIDR=16...,webmd,True,True,True,www.webmd.com,True,False
72,https://www.webmd.com/search/search_results/de...,https://mb.moatads.com/yi.js?ud=1&qn=%604%7BZE...,https://www.webmd.com/search/search_results/de...,mb.moatads.com,,webmd,True,True,False,www.webmd.com,True,False
75,https://www.webmd.com/search/search_results/de...,https://contextual.media.net/mcx.js?&rt=2&call...,https://www.webmd.com/search/search_results/de...,contextual.media.net,visitor-id=2495076903613114000V10,webmd,True,True,False,www.webmd.com,True,False
81,https://www.webmd.com/search/search_results/de...,https://contextual.media.net/rtbsmpubs.php?&gd...,https://www.webmd.com/search/search_results/de...,contextual.media.net,visitor-id=2495076903613114000V10,webmd,True,True,False,www.webmd.com,True,False
82,https://www.webmd.com/search/search_results/de...,https://contextual.media.net/rtbsmpubs.php?&gd...,https://www.webmd.com/search/search_results/de...,contextual.media.net,visitor-id=2495076903613114000V10,webmd,True,True,False,www.webmd.com,True,False


In [121]:
# http_requests_syncs[http_requests_syncs.share_fp_cookie].shape

In [122]:
# http_requests_syncs[http_requests_syncs.share_fp_cookie & http_requests_syncs.referred_by_third].shape

In [123]:
# http_requests_syncs[http_requests_syncs.share_tp_cookie].shape

In [124]:
# http_requests_syncs[http_requests_syncs.referred_by_third & http_requests_syncs.share_tp_cookie].shape

In [125]:
print(http_requests_syncs[http_requests_syncs.share_fp_cookie].shape[0])
print(http_requests_syncs[http_requests_syncs.share_fp_cookie & http_requests_syncs.referred_by_third].shape[0])
print(http_requests_syncs[http_requests_syncs.share_tp_cookie].shape[0])
print(http_requests_syncs[http_requests_syncs.share_tp_cookie & http_requests_syncs.referred_by_third].shape[0])

356
36
317
201


In [126]:
print(http_requests_syncs[http_requests_syncs.share_fp_cookie][http_requests_syncs.cookies==""].shape[0])
print(http_requests_syncs[http_requests_syncs.share_fp_cookie][http_requests_syncs.cookies==""][http_requests_syncs.referred_by_third].shape[0])
print(http_requests_syncs[http_requests_syncs.share_tp_cookie][http_requests_syncs.cookies==""].shape[0])
print(http_requests_syncs[http_requests_syncs.share_tp_cookie][http_requests_syncs.cookies==""][http_requests_syncs.referred_by_third].shape[0])

178
12
94
40


  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [127]:
# http_requests_syncs[http_requests_syncs.is_diff_host & http_requests_syncs.share_tp_cookie].shape

In [128]:
# only index 3
print(http_requests_syncs[http_requests_syncs.visit_id==3][http_requests_syncs.share_fp_cookie].shape[0])
print(http_requests_syncs[http_requests_syncs.visit_id==3][http_requests_syncs.share_fp_cookie][http_requests_syncs.referred_by_third].shape[0])
print(http_requests_syncs[http_requests_syncs.visit_id==3][http_requests_syncs.share_tp_cookie].shape[0])
print(http_requests_syncs[http_requests_syncs.visit_id==3][http_requests_syncs.share_tp_cookie][http_requests_syncs.referred_by_third].shape[0])

10
4
44
35


  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """


In [129]:
# only index 3
print(http_requests_syncs[http_requests_syncs.visit_id==3][http_requests_syncs.cookies==""][http_requests_syncs.share_fp_cookie].shape[0])
print(http_requests_syncs[http_requests_syncs.visit_id==3][http_requests_syncs.cookies==""][http_requests_syncs.share_fp_cookie][http_requests_syncs.referred_by_third].shape[0])
print(http_requests_syncs[http_requests_syncs.visit_id==3][http_requests_syncs.cookies==""][http_requests_syncs.share_tp_cookie].shape[0])
print(http_requests_syncs[http_requests_syncs.visit_id==3][http_requests_syncs.cookies==""][http_requests_syncs.share_tp_cookie][http_requests_syncs.referred_by_third].shape[0])


4
1
9
6


  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """


In [130]:
save_df_to_csv(http_requests_syncs, "cookie_syncs_" +file_name)