In [None]:
!unzip ./OPP-115_v1_0.zip -d ./OPP-115

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: ./OPP-115/OPP-115/original_policies/1673_www.tulsaworld.com_files/plusone.js  
  inflating: ./OPP-115/OPP-115/original_policies/1673_www.tulsaworld.com_files/postmessageRelay.html  
  inflating: ./OPP-115/OPP-115/original_policies/1673_www.tulsaworld.com_files/prum.min.js  
  inflating: ./OPP-115/OPP-115/original_policies/1673_www.tulsaworld.com_files/pubads_impl_65.js  
  inflating: ./OPP-115/OPP-115/original_policies/1673_www.tulsaworld.com_files/quant.js  
 extracting: ./OPP-115/OPP-115/original_policies/1673_www.tulsaworld.com_files/rd_log  
  inflating: ./OPP-115/OPP-115/original_policies/1673_www.tulsaworld.com_files/redirect  
  inflating: ./OPP-115/OPP-115/original_policies/1673_www.tulsaworld.com_files/require.min.js  
  inflating: ./OPP-115/OPP-115/original_policies/1673_www.tulsaworld.com_files/rt=ifr(1).html  
  inflating: ./OPP-115/OPP-115/original_policies/1673_www.tulsaworld.com_files/rt=ifr.ht

In [None]:
!pip install spacy beautifulsoup4
# !pip -m spacy download en_core_web_sm




In [None]:
import spacy
import os
from bs4 import BeautifulSoup

nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    soup = BeautifulSoup(text, "html.parser")
    clean_text = soup.get_text(separator=" ")

    doc = nlp(clean_text)

    # Use lemmatization and lowercasing
    tokens = [token.lemma_.lower() for token in doc if not token.is_punct]

    clean_text = " ".join(tokens)
    return clean_text


preprocessed_policies_dict = {}
directory = './OPP-115/OPP-115/sanitized_policies'

for filename in os.listdir(directory):
    if filename.endswith('.html'):
        base_name = '_'.join(filename.split('_')[1:]).rsplit('.', 1)[0]
        path = os.path.join(directory, filename)
        with open(path, 'r', encoding='utf-8') as file:
            html_content = file.read()
            preprocessed_text = preprocess_text(html_content)
            preprocessed_policies_dict[base_name] = preprocessed_text


In [None]:
import pandas as pd
import os

annotations_dir = './OPP-115/OPP-115/annotations/'

# A dictionary to hold all dataframes, key will be the base filename
dataframes = {}

for annotation_filename in os.listdir(annotations_dir):
    if annotation_filename.endswith('.csv'):
        base_name = '_'.join(annotation_filename.split('_')[1:]).rsplit('.', 1)[0]

        annotation_path = os.path.join(annotations_dir, annotation_filename)

        annotation_df = pd.read_csv(annotation_path, header=None)

        dataframes[base_name] = {'categories': annotation_df[5], 'preprocessed_text': preprocessed_policies_dict.get(base_name)}


yahoo_df = dataframes['yahoo.com']
yahoo_df['categories']


0              Other
1              Other
2              Other
3              Other
4              Other
           ...      
162            Other
163    Policy Change
164            Other
165            Other
166            Other
Name: 5, Length: 167, dtype: object

In [None]:
from collections import defaultdict
import pandas as pd

# Initialize a list to hold the final consolidated dataset
final_dataset = []

for base_name, data in dataframes.items():
    unique_categories_per_text = defaultdict(set)

    for category in data['categories']:
        unique_categories_per_text[data['preprocessed_text']].add(category)

    for text, categories_set in unique_categories_per_text.items():
        final_dataset.append({
            'source': base_name,
            'preprocessed_text': text,
            'categories': list(categories_set)
        })

final_df = pd.DataFrame(final_dataset)

# Display the shape of the DataFrame and the first few rows to verify the structure
print(final_df.shape)
final_df.head()


(115, 3)


Unnamed: 0,source,preprocessed_text,categories
0,dailyillini.com,privacy policy our commitment to pr...,"[First Party Collection/Use, User Choice/Contr..."
1,taylorswift.com,privacy policy last update and effect...,"[First Party Collection/Use, Policy Change, Th..."
2,instagram.com,privacy policy ||| important update ...,"[First Party Collection/Use, Policy Change, Us..."
3,fortune.com,privacy policy this be the privacy po...,"[First Party Collection/Use, Policy Change, Us..."
4,voxmedia.com,vox media privacy policy ||| update as o...,"[First Party Collection/Use, Policy Change, Th..."


In [None]:
final_df.iloc[0]['preprocessed_text']

"  privacy policy          our commitment to privacy        your privacy be important to we to well protect your privacy we provide this notice explain our online information practice and the choice you can make about the way your information be collect and use     |||   the information we collect        this notice apply to all information collect or submit on any of illini media 's website include dailyillini.com on some of our page you may have the option to create an account make request register for contest and/or submit content to our website     |||the personal information collect at these page be use to identify you as a user of the site to track your usage of interactive feature and to communicate with you regard site news and update you can opt out of all newsletter but you may still receive occasional notice regard your specific usage of the site we do not share this information with outside party unless otherwise note at the time of datum entry     |||illini media use non i

In [None]:
# First Party Collection/Use	- maps to -> Lawfulness, Fairness, Transparency | Purpose Limitation | Data Minimization

# Third Party Sharing/Collection - maps to ->	Lawfulness, Fairness, Transparency | Purpose Limitation | Data Minimization

# User Choice/Control	 - maps to -> Lawfulness, Fairness, Transparency

# User Access, Edit, and Deletion  - maps to ->	Lawfulness, Fairness, Transparency | Accuracy

# Data Retention  - maps to ->	Storage Limitation

# Data Security  - maps to -> Integrity and Confidentiality

# Policy Change	  - maps to ->	 Lawfulness, Fairness, Transparency

# Do Not Track

# International and Specific Audiences	  - maps to ->	 Lawfulness, Fairness, Transparency

# Other	(No Direct Mapping)

label_mapping = {
    "First Party Collection/Use": [1, 2, 3],
    "Third Party Sharing/Collection": [1, 2, 3],
    "User Choice/Control": [1],
    "User Access, Edit, and Deletion": [1, 4],
    "Data Retention": [5],
    "Data Security": [6],
    "Policy Change": [1],
    "Do Not Track": [],
    "International and Specific Audiences": [1],
    "Other": []
}

# where:

# 1 is lawfulness, fairness and transparency

# 2 is purpose limitation

# 3 is data minimization

# 4 is accuracy

# 5 is storage limitation

# 6 is integrity and confidentiality

# 7 is accountability

def transform_document_labels(document_labels):
    document_new_labels_set = set()
    for label in document_labels:
        # Add the new categories to the set if the label is in the mapping
        if label in label_mapping:
            document_new_labels_set.update(label_mapping[label])
    return sorted(list(document_new_labels_set))

final_df['gdpr_principles'] = final_df['categories'].apply(transform_document_labels)

total_principles = 7

final_df['compliance_percentage'] = final_df['gdpr_principles'].apply(lambda x: (len(x) / total_principles) * 100)

final_df.head()



Unnamed: 0,source,preprocessed_text,categories,gdpr_principles,compliance_percentage
0,dailyillini.com,privacy policy our commitment to pr...,"[First Party Collection/Use, User Choice/Contr...","[1, 2, 3, 6]",57.142857
1,taylorswift.com,privacy policy last update and effect...,"[First Party Collection/Use, Policy Change, Th...","[1, 2, 3, 5, 6]",71.428571
2,instagram.com,privacy policy ||| important update ...,"[First Party Collection/Use, Policy Change, Us...","[1, 2, 3, 5, 6]",71.428571
3,fortune.com,privacy policy this be the privacy po...,"[First Party Collection/Use, Policy Change, Us...","[1, 2, 3, 5, 6]",71.428571
4,voxmedia.com,vox media privacy policy ||| update as o...,"[First Party Collection/Use, Policy Change, Th...","[1, 2, 3, 5, 6]",71.428571


In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

encoded_labels = mlb.fit_transform(final_df['gdpr_principles'])


In [None]:
preprocessed_policies = final_df['preprocessed_text'].tolist()


In [None]:
!pip install -U sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/171.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/171.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (1

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import InputExample

class CustomDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# Convert labels to a tensor directly suitable for multi-label classification
labels_tensor = torch.tensor(encoded_labels, dtype=torch.float)

# Create the dataset
dataset = CustomDataset(preprocessed_policies, labels_tensor)

# Since we cannot use InputExample directly with encoded multi-labels, we skip it and handle batches manually
def collate_fn(batch):
    texts, labels = zip(*batch)
    return texts, torch.stack(labels)  # Ensure labels are properly formatted as tensors

dataloader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)


In [15]:
from sentence_transformers import SentenceTransformer, losses
from torch.optim import Adam
import torch
from sklearn.metrics import f1_score, precision_score, recall_score
from torch.utils.data import random_split

model = SentenceTransformer('all-MiniLM-L6-v2')
num_labels = len(mlb.classes_)

classifier = torch.nn.Linear(model.get_sentence_embedding_dimension(), num_labels)  # num_labels is the number of target classes
optimizer = Adam(list(model.parameters()) + list(classifier.parameters()), lr=1e-5)
loss_fn = torch.nn.BCEWithLogitsLoss()  # Suitable for multi-label classification

# Split dataset into training and validation
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

def evaluate_model(model, data_loader):
    model.eval()  # Set model to evaluation mode
    val_loss = 0
    true_labels = []
    pred_labels = []
    with torch.no_grad():
        for texts, labels in data_loader:
            embeddings = model.encode(texts, convert_to_tensor=True)
            predictions = classifier(embeddings)
            loss = loss_fn(predictions, labels)
            val_loss += loss.item()

            # Convert predictions to binary labels
            predicted_labels = (torch.sigmoid(predictions) > 0.5).int()
            true_labels.append(labels.int())
            pred_labels.append(predicted_labels)

    true_labels = torch.cat(true_labels).cpu().numpy()
    pred_labels = torch.cat(pred_labels).cpu().numpy()
    f1 = f1_score(true_labels, pred_labels, average='samples')  # 'samples' for multilabel classification
    precision = precision_score(true_labels, pred_labels, average='samples')
    recall = recall_score(true_labels, pred_labels, average='samples')
    exact_match = (true_labels == pred_labels).all(axis=1).mean()

    return val_loss / len(data_loader), f1, precision, recall, exact_match

# Training loop
for epoch in range(50):
    model.train()  # Set model to training mode
    total_loss = 0
    for texts, labels in train_loader:
        optimizer.zero_grad()
        embeddings = model.encode(texts, convert_to_tensor=True)
        predictions = classifier(embeddings)
        loss = loss_fn(predictions, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    val_loss, f1, precision, recall, exact_match = evaluate_model(model, val_loader)
    print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss}, Validation Loss: {val_loss}, F1: {f1}, Precision: {precision}, Recall: {recall}, Exact Match: {exact_match}")


model.save('./saved_model')


Epoch 1, Training Loss: 0.6985414524873098, Validation Loss: 0.6997028589248657, F1: 0.42701863354037284, Precision: 1.0, Recall: 0.2833333333333334, Exact Match: 0.0
Epoch 2, Training Loss: 0.6982496778170267, Validation Loss: 0.6994301974773407, F1: 0.42701863354037284, Precision: 1.0, Recall: 0.2833333333333334, Exact Match: 0.0
Epoch 3, Training Loss: 0.6977572937806448, Validation Loss: 0.6991569697856903, F1: 0.42701863354037284, Precision: 1.0, Recall: 0.2833333333333334, Exact Match: 0.0
Epoch 4, Training Loss: 0.6975514590740204, Validation Loss: 0.6988854110240936, F1: 0.42701863354037284, Precision: 1.0, Recall: 0.2833333333333334, Exact Match: 0.0
Epoch 5, Training Loss: 0.6972384055455526, Validation Loss: 0.6986139118671417, F1: 0.42701863354037284, Precision: 1.0, Recall: 0.2833333333333334, Exact Match: 0.0
Epoch 6, Training Loss: 0.6969164411226908, Validation Loss: 0.698340892791748, F1: 0.42701863354037284, Precision: 1.0, Recall: 0.2833333333333334, Exact Match: 0.0

In [16]:
model.save('./saved_model')

!zip -r saved_model.zip ./saved_model/

from google.colab import files
files.download('saved_model.zip')


  adding: saved_model/ (stored 0%)
  adding: saved_model/vocab.txt (deflated 53%)
  adding: saved_model/tokenizer_config.json (deflated 74%)
  adding: saved_model/sentence_bert_config.json (deflated 4%)
  adding: saved_model/tokenizer.json (deflated 71%)
  adding: saved_model/2_Normalize/ (stored 0%)
  adding: saved_model/config.json (deflated 47%)
  adding: saved_model/modules.json (deflated 62%)
  adding: saved_model/config_sentence_transformers.json (deflated 31%)
  adding: saved_model/1_Pooling/ (stored 0%)
  adding: saved_model/1_Pooling/config.json (deflated 57%)
  adding: saved_model/README.md (deflated 64%)
  adding: saved_model/model.safetensors (deflated 9%)
  adding: saved_model/special_tokens_map.json (deflated 80%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!zip -r logs.zip logs/
!zip -r results.zip results/



zip error: Nothing to do! (try: zip -r logs.zip . -i logs/)

zip error: Nothing to do! (try: zip -r results.zip . -i results/)
