In [None]:
import spacy
from sklearn.cluster import AgglomerativeClustering
import numpy as np

# Load the English NLP model
nlp = spacy.load('en_core_web_lg')

# OPP-115 Categories
opp_categories = [
    "First Party Collection/Use",
    "Third Party Sharing/Collection",
    "User Choice/Control",
    "User Access, Edit, and Deletion",
    "Data Retention",
    "Data Security",
    "Policy Change",
    "Do Not Track",
    "International and Specific Audiences",
    "Other"
]

# GDPR Article 5 Principles
gdpr_principles = [
    "processed lawfully, fairly and in a transparent manner (‘lawfulness, fairness and transparency’)",
    "collected for specified, explicit and legitimate purposes and not further processed in a manner that is incompatible with those purposes (‘purpose limitation’)",
    "adequate, relevant and limited to what is necessary (‘data minimisation’)",
    "accurate and, where necessary, kept up to date (‘accuracy’)",
    "kept in a form which permits identification of data subjects for no longer than necessary (‘storage limitation’)",
    "processed in a manner that ensures security of the data (‘integrity and confidentiality’)",
    "the controller shall be responsible for and be able to demonstrate compliance (‘accountability’)"
]

# Create one embedding (the spaCy document vector) per OPP-115 category and GDPR principle
opp_embeddings = np.array([nlp(text).vector for text in opp_categories])
gdpr_embeddings = np.array([nlp(text).vector for text in gdpr_principles])

# Combine embeddings: OPP-115 rows first, GDPR rows second, so the row order
# matches `opp_categories + gdpr_principles`
combined_embeddings = np.vstack((opp_embeddings, gdpr_embeddings))

# Perform clustering. With n_clusters=None the number of clusters is driven by
# distance_threshold instead of being fixed up front.
cluster_kwargs = dict(n_clusters=None, distance_threshold=0.5, linkage='complete')
try:
    # scikit-learn >= 1.2 calls the distance parameter `metric`; the old
    # `affinity` keyword was deprecated in 1.2 and removed in 1.4.
    clustering = AgglomerativeClustering(metric='cosine', **cluster_kwargs)
except TypeError:
    # Older scikit-learn releases only accept the legacy `affinity` keyword.
    clustering = AgglomerativeClustering(affinity='cosine', **cluster_kwargs)
clustering.fit(combined_embeddings)

# Assign clusters: one label per row of combined_embeddings
clusters = clustering.labels_

# Print mappings
for category, cluster in zip(opp_categories + gdpr_principles, clusters):
    print(f"{category} is mapped to cluster {cluster}")


In [None]:
import spacy
import numpy as np
from sklearn.cluster import AgglomerativeClustering
import networkx as nx
import matplotlib.pyplot as plt

# Load the English NLP model
nlp = spacy.load('en_core_web_lg')

# OPP-115 Categories
opp_categories = [
    "First Party Collection/Use",
    "Third Party Sharing/Collection",
    "User Choice/Control",
    "User Access, Edit, and Deletion",
    "Data Retention",
    "Data Security",
    "Policy Change",
    "Do Not Track",
    "International and Specific Audiences",
    "Other"
]

# GDPR Article 5 Principles (short names only)
gdpr_principles = [
    "lawfulness, fairness and transparency",
    "purpose limitation",
    "data minimisation",
    "accuracy",
    "storage limitation",
    "integrity and confidentiality",
    "accountability"
]

# Embeddings: one spaCy document vector per label
opp_embeddings = np.array([nlp(text).vector for text in opp_categories])
gdpr_embeddings = np.array([nlp(text).vector for text in gdpr_principles])

# Combine and cluster. OPP-115 rows come first, so label index
# len(opp_categories) + j belongs to gdpr_principles[j].
combined_embeddings = np.vstack((opp_embeddings, gdpr_embeddings))
cluster_kwargs = dict(n_clusters=None, distance_threshold=0.6, linkage='complete')
try:
    # scikit-learn >= 1.2 calls the distance parameter `metric`; the old
    # `affinity` keyword was deprecated in 1.2 and removed in 1.4.
    clustering = AgglomerativeClustering(metric='cosine', **cluster_kwargs)
except TypeError:
    # Older scikit-learn releases only accept the legacy `affinity` keyword.
    clustering = AgglomerativeClustering(affinity='cosine', **cluster_kwargs)
clustering.fit(combined_embeddings)

# Build the graph: every OPP-115 category and GDPR principle becomes a node
# whose `label` attribute is its own text.
G = nx.Graph()
G.add_nodes_from((name, {'label': name}) for name in opp_categories + gdpr_principles)

# Cluster labels are ordered OPP-115 first, GDPR second; split them accordingly.
n_opp = len(opp_categories)
opp_cluster_ids = clustering.labels_[:n_opp]
gdpr_cluster_ids = clustering.labels_[n_opp:]

# Connect an OPP-115 category to a GDPR principle when both share a cluster.
G.add_edges_from(
    (category, principle)
    for category, category_cluster in zip(opp_categories, opp_cluster_ids)
    for principle, principle_cluster in zip(gdpr_principles, gdpr_cluster_ids)
    if category_cluster == principle_cluster
)

# Lay out the nodes (seeded) and render nodes, edges and labels on one figure.
pos = nx.spring_layout(G, seed=42)
plt.figure(figsize=(12, 12))
nx.draw_networkx_nodes(G, pos, node_size=7000, node_color='skyblue')
nx.draw_networkx_edges(G, pos, width=2)
nx.draw_networkx_labels(G, pos, font_size=10)
plt.title("Mapping OPP-115 Categories to GDPR Principles")
plt.show()

# Group every label text under the cluster id it was assigned to.
all_labels = opp_categories + gdpr_principles  # same order as clustering.labels_
cluster_map = {}
for cluster_id, text in zip(clustering.labels_, all_labels):
    if cluster_id not in cluster_map:
        cluster_map[cluster_id] = []
    cluster_map[cluster_id].append(text)

# Report only the clusters that contain at least one item from each taxonomy.
for members in cluster_map.values():
    opps = [member for member in members if member in opp_categories]
    gdprs = [member for member in members if member in gdpr_principles]
    if not (opps and gdprs):
        continue
    print(f"{' and '.join(opps)} map to {', '.join(gdprs)}")


In [None]:
!pip install transformers


In [None]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

# The tokenizer (vocabulary) and the model weights are both loaded from the
# same pre-trained checkpoint name.
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

# Function to get BERT embeddings
def get_bert_embeddings(text):
    """Return the first-token ([CLS]) embedding of `text` as a NumPy array.

    The text is tokenized with the module-level `tokenizer` (padded, and
    truncated to at most 128 tokens) and passed through the module-level
    `model` with gradient tracking disabled.

    Args:
        text: The input passed straight to the tokenizer.

    Returns:
        A 2-D NumPy array: position 0 of the model's last hidden state for
        each sequence in the batch.
    """
    inputs = tokenizer(
        text,
        max_length=128,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )
    # Inference only: no gradients are needed
    with torch.no_grad():
        last_hidden = model(**inputs).last_hidden_state
    # Index 0 along the sequence axis is the [CLS] token
    cls_vectors = last_hidden[:, 0, :]
    return cls_vectors.numpy()

# OPP-115 Categories and GDPR Principles as lists of strings.
# Each entry has the form "<name>: <description>"; the full string is embedded,
# and the part before the first colon is used as the display name below.
opp_categories = [
    "First Party Collection/Use: how and why a service provider collects user information.",
    "Third Party Sharing/Collection: how user information may be shared with or collected by third parties.",
    "User Choice/Control: choices and control options available to users.",
    "User Access, Edit, & Deletion: if and how users may access, edit, or delete their information.",
    "Data Retention: how long user information is stored.",
    "Data Security: how user information is protected.",
    "Policy Change: if and how users will be informed about changes to the privacy policy.",
    "Do Not Track: if and how Do Not Track signals for online tracking and advertising are honored.",
    "International & Specific Audiences: practices that pertain only to a specific group of users (e.g., children, Europeans, or California residents).",
    "Other: additional sub-labels for introductory or general text, contact information, and practices not covered by the other categories."
]

gdpr_principles = [
    "Lawfulness, Fairness and Transparency: processed lawfully, fairly and in a transparent manner",
    "Purpose Limitation: collected for specified, explicit and legitimate purposes and not further processed in a manner that is incompatible with those purposes",
    "Data Minimisation: adequate, relevant and limited to what is necessary",
    "Accuracy: accurate and, where necessary, kept up to date",
    "Storage Limitation: kept in a form which permits identification of data subjects for no longer than necessary",
    "Integrity and Confidentiality: processed in a manner that ensures security of the data",
    "Accountability: the controller shall be responsible for and be able to demonstrate compliance"
]

# Get embeddings: one row per label, stacked in list order
opp_embeddings = np.vstack([get_bert_embeddings(cat) for cat in opp_categories])
gdpr_embeddings = np.vstack([get_bert_embeddings(prin) for prin in gdpr_principles])

# Compute cosine similarity between each category and principle
from sklearn.metrics.pairwise import cosine_similarity

# Rows follow opp_categories, columns follow gdpr_principles
similarity_matrix = cosine_similarity(opp_embeddings, gdpr_embeddings)

# Display the top matches for each OPP category based on a similarity threshold
similarity_threshold = 0.85  # Define your own threshold here
for category, similarities in zip(opp_categories, similarity_matrix):
    # Show the whole name (text before the first colon) instead of a
    # fixed-width slice, which cut names off mid-word.
    category_name = category.split(':', 1)[0]
    print(f"\n{category_name} is similar to the following GDPR principles with a similarity above {similarity_threshold}:")
    for principle, similarity in zip(gdpr_principles, similarities):
        if similarity > similarity_threshold:
            principle_name = principle.split(':', 1)[0]
            print(f"  {principle_name} (Similarity: {similarity:.2f})")
