### Libraries

In [1]:
from pathlib import Path
from bs4 import BeautifulSoup

import pandas as pd
import json
import tqdm
import re

In [2]:
import warnings
warnings.filterwarnings("ignore")

### Read

In [3]:
data_dir = Path("/home/ozzie/Downloads/OPP-115_v1_0 (1)/OPP-115")
html_paths = (data_dir / "sanitized_policies").glob("*.html")
html_paths_map = {html_path.stem: html_path for html_path in html_paths}
annot_paths = (data_dir / "annotations").glob("*.csv")
annot_paths_map = {annot_path.stem: annot_path for annot_path in annot_paths}
print(f"Html files: {len(html_paths_map)}, annotation files: {len(annot_paths_map)}")

Html files: 115, annotation files: 115


In [4]:
paths_map = {}
for name in html_paths_map:
    paths_map[name] = {
        "annotation": annot_paths_map[name],
        "html": html_paths_map[name] 
    }
print(f"Total matched annotations: {len(paths_map)}")

Total matched annotations: 115


### Category Analysis

In [5]:
unique_cats = {}
for _, lookup_dict in tqdm.tqdm(paths_map.items()):
    annot_df = pd.read_csv(lookup_dict["annotation"], header=None)
    for i in range(len(annot_df)):
        annot_cat = annot_df[[5, 6]].values[i][0]
        label_str = annot_df[[5, 6]].values[i][1]
        label_dict = json.loads(label_str)
        unique_sub_cats = set(label_dict.keys())
        if annot_cat not in unique_cats:
            unique_cats[annot_cat] = unique_sub_cats
        else:
            unique_cats[annot_cat].update(unique_sub_cats)

100%|██████████| 115/115 [00:08<00:00, 12.86it/s]


### Sentence-Level Annotation

In [6]:
unique_cats.keys()

dict_keys(['Third Party Sharing/Collection', 'Other', 'First Party Collection/Use', 'User Choice/Control', 'Policy Change', 'Data Retention', 'User Access, Edit and Deletion', 'Data Security', 'International and Specific Audiences', 'Do Not Track'])

In [7]:
n_gdpr = 7
opp_to_gdpr = {
    "Other": [],
    "First Party Collection/Use": [0, 1, 2],
    "Third Party Sharing/Collection": [0, 1, 2],
    "Data Security": [5],
    "Policy Change": [0],
    "User Choice/Control": [0],
    "Data Retention": [4],
    "International and Specific Audiences": [0],
    "User Access, Edit and Deletion": [0, 3],
    "Do Not Track": []
}

In [8]:
def preprocess(s: str) -> str:
    s = re.sub(r"www\.\w+\.((com)|(net))", "", s)  # remove links
    s = re.sub(r"[^a-zA-Z0-9_\.]+", " ", s)  # remove spaces and \t\n\s
    return s

preprocess("123 123     one one. tow tow. plus \t \t kek www.sus.net")

'123 123 one one. tow tow. plus kek '

In [15]:
# Define a specific GDPR principle index, for example, index 4 for "storage limitation"
principle_index = 4

data = []
for name, lookup_dict in tqdm.tqdm(paths_map.items()):
    # Read policy text
    with open(lookup_dict["html"], "r") as f:
        html_content = f.read()
    soup = BeautifulSoup(html_content, "html.parser")
    text = soup.get_text()
    text = preprocess(s=text)
    
    # Construct starts and ends of sentences
    sentence_borders = []
    prev_start = 0
    for match in re.finditer(r"\.", text):
        _, end = match.span()
        sentence_borders.append((prev_start, end))
        prev_start = end
    if len(text) != prev_start:
        sentence_borders.append((prev_start, len(text)))
    
    # Create holder for sentence labels 
    sentence_labels = [0 for _ in range(len(sentence_borders))]
        
    # Read annotations for policy
    annot_df = pd.read_csv(lookup_dict["annotation"], header=None)
    for cat, annot in annot_df[[5, 6]].values:
        annot_dict = json.loads(annot)
        for segment in annot_dict.values():
            if "selectedText" not in segment:
                continue
            segment_html = segment["selectedText"]
            if segment_html in ("Not selected", "null"):
                continue
            segment_soup = BeautifulSoup(segment_html, "html.parser")
            segment_text = segment_soup.get_text()
            segment_text = preprocess(s=segment_text)
            
            # Find the segment in the original policy text
            segment_start = text.find(segment_text)
            if segment_start == -1:
                continue
            segment_end = segment_start + len(segment_text)
            for sentence_id, (sentence_start, sentence_end) in enumerate(sentence_borders):
                if segment_end > sentence_start and sentence_end > segment_start:
                    if principle_index in opp_to_gdpr[cat]:
                        sentence_labels[sentence_id] = 1  # This sentence relates to the specific principle
    
    # Construct sentence-level information
    for sentence_id, ((sent_start, sent_end), label) in enumerate(zip(sentence_borders, sentence_labels)):
        data.append({
            "policy_name": name,
            "sentence_id": sentence_id,
            "sentence_text": text[sent_start:sent_end],
            "sentence_label": label  # Note this is now a single integer, not an array
        })

  0%|          | 0/115 [00:00<?, ?it/s]

100%|██████████| 115/115 [00:07<00:00, 16.18it/s]


### Dump

In [29]:
df = pd.DataFrame.from_records(data)
df["sentence_length"] = df["sentence_text"].apply(lambda s: len(s.split(" ")))
df["is_included"] = df["sentence_length"] >= 11
df

Unnamed: 0,policy_name,sentence_id,sentence_text,sentence_label,sentence_length,is_included
0,898_uptodate.com,0,UpToDate online privacy policy UpToDate Inc.,0,6,False
1,898_uptodate.com,1,is very sensitive to the privacy needs of its...,0,25,True
2,898_uptodate.com,2,UpToDate does not sell or otherwise share sub...,0,16,True
3,898_uptodate.com,3,To better understand UpToDate s online privac...,0,15,True
4,898_uptodate.com,4,Subscriber Information UpToDate never automat...,0,22,True
...,...,...,...,...,...,...
12274,414_washingtonian.com,102,Effective Date This Privacy Policy is effecti...,0,13,True
12275,701_tangeroutlet.com,0,Privacy Policy TangerOutlets is committed to k...,0,11,True
12276,701_tangeroutlet.com,1,Any and all personal identifiable information...,0,28,True
12277,701_tangeroutlet.com,2,If at any time you want your email informatio...,0,23,True


In [43]:
df.iloc[11000]["sentence_text"]

' We may also ask your permission to allow us to publicly post some of your information on the Website such as on a winner s page in the event you win a contest or sweepstakes.'

In [32]:
# Count the number of rows where 'sentence_label' is 1
count_label_1 = df['sentence_label'].sum()

# Print the result
print(f"Number of sentences that adhere to the specific GDPR principle: {count_label_1}")


Number of sentences that adhere to the specific GDPR principle: 349


In [19]:
df.to_csv("./sentences_gdpr_label_4.csv", index=False)