### Libraries

In [3]:
from pathlib import Path
from bs4 import BeautifulSoup

import pandas as pd
import json
import tqdm
import re

In [4]:
import warnings
warnings.filterwarnings("ignore")

### Read

In [6]:
data_dir = Path("/home/ozzie/Downloads/OPP-115_v1_0/OPP-115")
html_paths = (data_dir / "sanitized_policies").glob("*.html")
html_paths_map = {html_path.stem: html_path for html_path in html_paths}
annot_paths = (data_dir / "annotations").glob("*.csv")
annot_paths_map = {annot_path.stem: annot_path for annot_path in annot_paths}
print(f"Html files: {len(html_paths_map)}, annotation files: {len(annot_paths_map)}")

Html files: 115, annotation files: 115


In [7]:
paths_map = {}
for name in html_paths_map:
    paths_map[name] = {
        "annotation": annot_paths_map[name],
        "html": html_paths_map[name] 
    }
print(f"Total matched annotations: {len(paths_map)}")

Total matched annotations: 115


### Category Analysis

In [8]:
unique_cats = {}
for _, lookup_dict in tqdm.tqdm(paths_map.items()):
    annot_df = pd.read_csv(lookup_dict["annotation"], header=None)
    for i in range(len(annot_df)):
        annot_cat = annot_df[[5, 6]].values[i][0]
        label_str = annot_df[[5, 6]].values[i][1]
        label_dict = json.loads(label_str)
        unique_sub_cats = set(label_dict.keys())
        if annot_cat not in unique_cats:
            unique_cats[annot_cat] = unique_sub_cats
        else:
            unique_cats[annot_cat].update(unique_sub_cats)

100%|██████████| 115/115 [00:09<00:00, 12.48it/s]


### Sentence-Level Annotation

In [9]:
unique_cats.keys()

dict_keys(['Third Party Sharing/Collection', 'Other', 'First Party Collection/Use', 'User Choice/Control', 'Policy Change', 'Data Retention', 'User Access, Edit and Deletion', 'Data Security', 'International and Specific Audiences', 'Do Not Track'])

In [10]:
n_gdpr = 7
opp_to_gdpr = {
    "Other": [],
    "First Party Collection/Use": [0, 1, 2],
    "Third Party Sharing/Collection": [0, 1, 2],
    "Data Security": [5],
    "Policy Change": [0],
    "User Choice/Control": [0],
    "Data Retention": [4],
    "International and Specific Audiences": [0],
    "User Access, Edit and Deletion": [0, 3],
    "Do Not Track": []
}

In [11]:
def preprocess(s: str) -> str:
    s = re.sub(r"www\.\w+\.((com)|(net))", "", s)  # remove links
    s = re.sub(r"[^a-zA-Z0-9_\.]+", " ", s)  # remove spaces and \t\n\s
    return s

preprocess("123 123     one one. tow tow. plus \t \t kek www.sus.net")

'123 123 one one. tow tow. plus kek '

In [12]:
data = []
for name, lookup_dict in tqdm.tqdm(paths_map.items()):
    
    # read policy text
    with open(lookup_dict["html"], "r") as f:
        html_content = f.read()
    soup = BeautifulSoup(html_content, "html.parser")
    text = soup.get_text()
    text = preprocess(s=text)
    # construct starts and ends of sentences
    sentence_borders = []
    prev_start = 0
    for match in re.finditer(r"\.", text):
        _, end = match.span()
        sentence_borders.append((prev_start, end))
        prev_start = end
    if len(text) != prev_start:
        sentence_borders.append((prev_start, len(text)))
    # create holder for sentence labels 
    sentence_labels = [[0]*n_gdpr for _ in range(len(sentence_borders))]
        
    # read annotations for policy
    annot_df = pd.read_csv(lookup_dict["annotation"], header=None)
    for cat, annot in annot_df[[5, 6]].values:  # annotation-level
        annot_dict = json.loads(annot)
        for segment in annot_dict.values():  # sub-categories level
            
            # retrieve the text segment of annotation
            if "selectedText" not in segment:
                continue
            segment_html = segment["selectedText"]
            found_ban_word = False
            for ban_text in ("Not selected", "null"):
                if segment_html == ban_text:
                    found_ban_word = True
            if found_ban_word:
                continue
            segment_soup = BeautifulSoup(segment_html, "html.parser")
            segment_text = segment_soup.get_text()
            segment_text = preprocess(s=segment_text)
            
            # find the segment in the original policy text
            segment_start = text.find(segment_text)
            if segment_start == -1:
                raise ValueError(f"policy: {text}\n\nsegment: {segment_text}\n\n")
            segment_end = segment_start + len(segment["selectedText"])
            for sentence_id, (sentence_start, sentence_end) in enumerate(sentence_borders):  # looking for intersection with sentence
                if segment_end > sentence_start and sentence_end > segment_start:
                    for gdpr_cat_id in opp_to_gdpr[cat]:
                        sentence_labels[sentence_id][gdpr_cat_id] = min(sentence_labels[sentence_id][gdpr_cat_id]+1, 1) 
    
    # construct sentence-level information
    for sentence_id, ((sent_start, sent_end), sent_labels) in enumerate(zip(sentence_borders, sentence_labels)):
        data.append({
            "policy_name": name,
            "sentence_id": sentence_id,
            "sentence_text": text[sent_start:sent_end],
            "sentence_labels": sent_labels
        })

100%|██████████| 115/115 [00:04<00:00, 23.27it/s]


### Dump

In [14]:
df = pd.DataFrame.from_records(data)
df["sentence_length"] = df["sentence_text"].apply(lambda s: len(s.split(" ")))
df["is_included"] = df["sentence_length"] >= 5
df

Unnamed: 0,policy_name,sentence_id,sentence_text,sentence_labels,sentence_length,is_included
0,898_uptodate.com,0,UpToDate online privacy policy UpToDate Inc.,"[0, 0, 0, 0, 0, 0, 0]",6,True
1,898_uptodate.com,1,is very sensitive to the privacy needs of its...,"[0, 0, 0, 0, 0, 0, 0]",25,True
2,898_uptodate.com,2,UpToDate does not sell or otherwise share sub...,"[1, 1, 1, 0, 0, 0, 0]",16,True
3,898_uptodate.com,3,To better understand UpToDate s online privac...,"[1, 1, 1, 0, 0, 0, 0]",15,True
4,898_uptodate.com,4,Subscriber Information UpToDate never automat...,"[1, 1, 1, 0, 0, 0, 0]",22,True
...,...,...,...,...,...,...
12274,414_washingtonian.com,102,Effective Date This Privacy Policy is effecti...,"[1, 0, 0, 0, 0, 0, 0]",13,True
12275,701_tangeroutlet.com,0,Privacy Policy TangerOutlets is committed to k...,"[0, 0, 0, 0, 0, 1, 0]",11,True
12276,701_tangeroutlet.com,1,Any and all personal identifiable information...,"[1, 1, 1, 0, 0, 1, 0]",28,True
12277,701_tangeroutlet.com,2,If at any time you want your email informatio...,"[1, 1, 1, 0, 0, 1, 0]",23,True


In [15]:
# Count the number of 1s at each position
n_gdpr = 7  # number of GDPR categories

# Initialize a list to keep count for each index
index_counts = [0] * n_gdpr

# Iterate over each row and each index in the sentence_labels
for labels in df['sentence_labels']:
    for i in range(n_gdpr):
        if labels[i] == 1:
            index_counts[i] += 1

# Display the results
for i, count in enumerate(index_counts):
    print(f"Index {i} (GDPR Category {i+1}) count of 1s: {count}")

Index 0 (GDPR Category 1) count of 1s: 9342
Index 1 (GDPR Category 2) count of 1s: 6682
Index 2 (GDPR Category 3) count of 1s: 6682
Index 3 (GDPR Category 4) count of 1s: 677
Index 4 (GDPR Category 5) count of 1s: 410
Index 5 (GDPR Category 6) count of 1s: 1084
Index 6 (GDPR Category 7) count of 1s: 0


In [189]:
df.to_csv("../data/sentences_gdpr_labels.csv", index=False)