In [None]:
# Reference text of GDPR Article 5 (paragraphs 1 and 2), kept as a single
# triple-quoted string. The leading indentation and line breaks are part of
# the literal and are passed on unchanged wherever this variable is used.
gdpr_article_5_text = """
                      1. Personal data shall be:
                      a) processed lawfully, fairly and in a transparent manner in relation to the data subject (‘lawfulness, fairness and transparency’);
                      b) collected for specified, explicit and legitimate purposes and not further processed in a manner that is incompatible with those purposes; further processing for archiving purposes in the public interest, scientific or historical research purposes or statistical purposes shall, in accordance with Article 89(1), not be considered to be incompatible with the initial purposes (‘purpose limitation’);
                      c) adequate, relevant and limited to what is necessary in relation to the purposes for which they are processed (‘data minimisation’);
                      d) accurate and, where necessary, kept up to date; every reasonable step must be taken to ensure that personal data that are inaccurate, having regard to the purposes for which they are processed, are erased or rectified without delay (‘accuracy’);
                      e) kept in a form which permits identification of data subjects for no longer than is necessary for the purposes for which the personal data are processed; personal data may be stored for longer periods insofar as the personal data will be processed solely for archiving purposes in the public interest, scientific or historical research purposes or statistical purposes in accordance with Article 89(1) subject to implementation of the appropriate technical and organisational measures required by this Regulation in order to safeguard the rights and freedoms of the data subject (‘storage limitation’);
                      f) processed in a manner that ensures appropriate security of the personal data, including protection against unauthorised or unlawful processing and against accidental loss, destruction or damage, using appropriate technical or organisational measures (‘integrity and confidentiality’).
                      2. The controller shall be responsible for, and be able to demonstrate compliance with, paragraph 1 (‘accountability’).
                      """

In [None]:
!unzip /content/acl-coling-2014-corpus.zip -d /content/acl_corpus

Archive:  /content/acl-coling-2014-corpus.zip
   creating: /content/acl_corpus/corpus/
  inflating: /content/acl_corpus/corpus/9gag.xml  
   creating: /content/acl_corpus/__MACOSX/
   creating: /content/acl_corpus/__MACOSX/corpus/
  inflating: /content/acl_corpus/__MACOSX/corpus/._9gag.xml  
  inflating: /content/acl_corpus/corpus/about_abc_net_au.xml  
  inflating: /content/acl_corpus/__MACOSX/corpus/._about_abc_net_au.xml  
  inflating: /content/acl_corpus/corpus/about_ask.xml  
  inflating: /content/acl_corpus/__MACOSX/corpus/._about_ask.xml  
  inflating: /content/acl_corpus/corpus/about_deviantart.xml  
  inflating: /content/acl_corpus/__MACOSX/corpus/._about_deviantart.xml  
  inflating: /content/acl_corpus/corpus/about_officemax.xml  
  inflating: /content/acl_corpus/__MACOSX/corpus/._about_officemax.xml  
  inflating: /content/acl_corpus/corpus/about_pinterest.xml  
  inflating: /content/acl_corpus/__MACOSX/corpus/._about_pinterest.xml  
  inflating: /content/acl_corpus/corpus/

In [None]:
import xml.etree.ElementTree as ET
import spacy
import os


# English spaCy pipeline, loaded once and shared by preprocess_text.
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """Normalise a raw text passage into a lowercase, lemmatised string.

    The text is lowercased and every run of whitespace (newlines included)
    is collapsed to a single space before it is handed to spaCy. The lemmas
    of all tokens not flagged as punctuation are then joined with single
    spaces and returned.
    """
    # str.split() with no argument splits on any whitespace, so newlines
    # are folded into single spaces here as well.
    normalised = ' '.join(text.lower().split())

    lemmas = (tok.lemma_ for tok in nlp(normalised) if not tok.is_punct)
    return ' '.join(lemmas)

# Collect the preprocessed text of every SECTION/SUBTEXT element found in the
# XML files of the corpus directory, one list entry per element.
preprocessed_policies = []
directory = './acl_corpus/corpus/'

# os.listdir() returns entries in arbitrary order; sorting makes the order of
# `preprocessed_policies` (and any slice taken from it) reproducible.
for filename in sorted(os.listdir(directory)):
    if filename.endswith('.xml'):
        path = os.path.join(directory, filename)
        tree = ET.parse(path)
        root = tree.getroot()

        # Extracting text from each SECTION/SUBTEXT; elements without text are skipped
        for section in root.findall('.//SECTION/SUBTEXT'):
            if section.text:
                processed_text = preprocess_text(section.text)
                preprocessed_policies.append(processed_text)


In [None]:
!pip3 install nltk

In [None]:
import gensim
import nltk
from gensim import corpora
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

# Topic modelling is run on the first 100 preprocessed passages only.
docs = preprocessed_policies[:100]

# Fetch the NLTK resources used below (stop-word list, WordNet data).
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Shared lookup objects used by clean().
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(document):
    """Return `document` lowercased, without stop words or punctuation, lemmatised.

    Stop words are filtered twice: once on the raw whitespace-separated
    tokens (so entries such as "don't" match the stop list while they still
    carry their apostrophe) and once more after punctuation has been
    stripped (so tokens such as "the," or "and." do not slip through as
    "the" / "and"). Tokens that consist only of punctuation are dropped.

    Relies on the module-level `stop`, `exclude` and `lemma` objects.
    """
    words = [word for word in document.lower().split() if word not in stop]
    # Strip punctuation characters from each token individually.
    stripped = (''.join(ch for ch in word if ch not in exclude) for word in words)
    # Second stop-word pass catches tokens that only became stop words
    # once their punctuation was removed; empty tokens are discarded.
    kept = (word for word in stripped if word and word not in stop)
    normalized = " ".join(lemma.lemmatize(word) for word in kept)
    return normalized

# Tokenise each cleaned document into a list of words.
doc_clean = [clean(doc).split() for doc in docs]

# Vocabulary mapping built from the tokenised documents.
dictionary = corpora.Dictionary(doc_clean)

# Bag-of-words representation of every document.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

# LDA training is stochastic; fixing random_state makes the learned topics
# reproducible from one run of the notebook to the next.
ldamodel = gensim.models.ldamodel.LdaModel(
    doc_term_matrix,
    num_topics=7,
    id2word=dictionary,
    passes=50,
    random_state=42,
)

print(ldamodel.print_topics(num_topics=7, num_words=7))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


[(0, '0.016*"send" + 0.016*"email" + 0.016*"statement" + 0.015*"member" + 0.015*"w3c" + 0.014*"information" + 0.011*"alternet"'), (1, '0.043*"information" + 0.014*"policy" + 0.014*"time" + 0.014*"personal" + 0.014*"may" + 0.014*"privacy" + 0.013*"provide"'), (2, '0.012*"may" + 0.011*"privacy" + 0.008*"information" + 0.008*"personal" + 0.007*"outside" + 0.007*"eea" + 0.007*"policy"'), (3, '0.036*"web" + 0.035*"site" + 0.035*"cookie" + 0.033*"information" + 0.032*"use" + 0.019*"may" + 0.016*"visit"'), (4, '0.030*"policy" + 0.028*"future" + 0.028*"website" + 0.026*"privacy" + 0.025*"central" + 0.022*"information" + 0.020*"biome"'), (5, '0.070*"information" + 0.018*"user" + 0.018*"privacy" + 0.017*"may" + 0.016*"use" + 0.015*"collect" + 0.015*"personal"'), (6, '0.049*"information" + 0.030*"service" + 0.026*"may" + 0.022*"use" + 0.014*"provide" + 0.013*"party" + 0.013*"third"')]


In [None]:
# Print each of the 7 topics with 15 words, numbered from 1 for readability
# (the tuple printed underneath still carries the model's 0-based topic id).
topics = ldamodel.print_topics(num_topics=7, num_words=15)

for number, topic in enumerate(topics, start=1):
    print(f"Topic #{number}:")
    print(topic)
    print("\n")


Topic #1:
(0, '0.016*"send" + 0.016*"email" + 0.016*"statement" + 0.015*"member" + 0.015*"w3c" + 0.014*"information" + 0.011*"alternet" + 0.010*"site" + 0.010*"research" + 0.010*"request" + 0.009*"interaction" + 0.009*"list" + 0.009*"contact" + 0.008*"purpose" + 0.008*"service"')


Topic #2:
(1, '0.043*"information" + 0.014*"policy" + 0.014*"time" + 0.014*"personal" + 0.014*"may" + 0.014*"privacy" + 0.013*"provide" + 0.013*"service" + 0.012*"use" + 0.011*"party" + 0.011*"irish" + 0.011*"website" + 0.011*"third" + 0.009*"share" + 0.009*"email"')


Topic #3:
(2, '0.012*"may" + 0.011*"privacy" + 0.008*"information" + 0.008*"personal" + 0.007*"outside" + 0.007*"eea" + 0.007*"policy" + 0.006*"detail" + 0.005*"keep" + 0.005*"store" + 0.004*"contact" + 0.004*"process" + 0.004*"concern" + 0.004*"treat" + 0.004*"operate"')


Topic #4:
(3, '0.036*"web" + 0.035*"site" + 0.035*"cookie" + 0.033*"information" + 0.032*"use" + 0.019*"may" + 0.016*"visit" + 0.016*"computer" + 0.016*"log" + 0.014*"addre

In [None]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.6.1-py3-none-any.whl (163 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/163.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.3/163.3 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-no

In [None]:
!unzip /content/saved_model.zip -d /content/saved_model

Archive:  /content/saved_model.zip
   creating: /content/saved_model/saved_model/
  inflating: /content/saved_model/saved_model/special_tokens_map.json  
  inflating: /content/saved_model/saved_model/model.safetensors  
  inflating: /content/saved_model/saved_model/tokenizer_config.json  
  inflating: /content/saved_model/saved_model/vocab.txt  
  inflating: /content/saved_model/saved_model/config.json  


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# Load the saved sequence-classification model and its tokenizer from the
# same directory.
checkpoint_dir = './saved_model/saved_model'
model = BertForSequenceClassification.from_pretrained(checkpoint_dir)
tokenizer = BertTokenizer.from_pretrained(checkpoint_dir)

# Ask the model to return hidden states, which are read later on
# to build text embeddings.
model.config.output_hidden_states = True


In [None]:
import torch
import numpy as np

def get_bert_embeddings(texts, model, tokenizer, max_length=512):
    """Embed each text as the masked mean of the model's second-to-last hidden layer.

    Args:
        texts: iterable of strings; each one is encoded on its own.
        model: callable model exposing `eval()` and returning an object with
            a `hidden_states` sequence when called with the tokenizer output.
        tokenizer: callable returning a mapping that contains at least
            `input_ids` and `attention_mask` tensors.
        max_length: inputs longer than this many tokens are truncated.

    Returns:
        A NumPy array with one row per text.

    Raises:
        ValueError: from `np.vstack` when `texts` is empty.
    """
    model.eval()
    embeddings = []

    with torch.no_grad():
        for text in texts:
            # Texts are encoded one at a time, so padding to max_length is
            # unnecessary; it only added padding positions to the average.
            inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length, add_special_tokens=True)
            outputs = model(**inputs)
            hidden_states = outputs.hidden_states[-2]  # Use the second-to-last layer

            # Mean pooling weighted by the attention mask: positions the
            # tokenizer marks as padding contribute nothing to the embedding.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
            token_sum = (hidden_states * mask).sum(dim=1)
            token_count = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
            text_embedding = token_sum / token_count
            embeddings.append(text_embedding.cpu().numpy())

    # Every pooled embedding is a single row of identical width, so the rows
    # can be stacked directly.
    return np.vstack(embeddings)


In [None]:
# Encode every preprocessed passage, then the GDPR Article 5 reference text.
# NOTE(review): the GDPR text is embedded as-is, while the passages went
# through preprocess_text() first — confirm that this asymmetry is intended.
policy_embeddings = get_bert_embeddings(texts=preprocessed_policies, model=model, tokenizer=tokenizer)
gdpr_embeddings = get_bert_embeddings(texts=[gdpr_article_5_text], model=model, tokenizer=tokenizer)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Cosine similarity of every passage embedding against the GDPR embedding
# (higher values mean more similarity).
gdpr_vector = gdpr_embeddings.reshape(1, -1)
similarities = cosine_similarity(policy_embeddings, gdpr_vector)

# Row indices of the passages whose similarity exceeds the 0.9 cut-off.
similar_policies_indices = np.where(similarities > 0.9)[0]

aligned_count = len(similar_policies_indices)
print(f"Number of policies potentially aligning with GDPR Article 5: {aligned_count}")


Number of policies potentially aligning with GDPR Article 5: 273


In [None]:
print(len(preprocessed_policies))

10501


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Same similarity computation as before (higher values mean more similarity),
# this time reported as a share of all preprocessed passages.
similarities = cosine_similarity(policy_embeddings, gdpr_embeddings.reshape(1, -1))
similar_policies_indices = np.where(similarities > 0.9)[0]

aligned_share = len(similar_policies_indices) / len(preprocessed_policies) * 100
print(f"Number of policies potentially aligning with GDPR Article 5: {aligned_share:.2f}%")


Number of policies potentially aligning with GDPR Article 5: 2.60%


In [None]:
# Look up the text of every passage that passed the similarity cut-off.
aligned_policies = [preprocessed_policies[i] for i in similar_policies_indices]

print(len(aligned_policies))

# Print each selected passage followed by a separator line.
separator = "-------------------------"
for policy in aligned_policies:
    print(policy)
    print(separator)

273
the company share your information with any third party without obtain the prior consent of the user in the following limit circumstance a when it be request or require by law or by any court or governmental agency or authority to disclose for the purpose of verification of identity or for the prevention detection investigation include cyber incident or for prosecution and punishment of offence these disclosure be make in good faith and belief that such disclosure be reasonably necessary for enforce these term or for comply with the applicable law and regulation b the company propose to share such information within its group company and officer and employee of such group company for the purpose of process personal information on its behalf we also ensure that these recipient of such information agree to process such information base on our instruction and in compliance with this privacy policy and any other appropriate confidentiality and security measure c the company may present

- **Embedding Models:** Other than *SBERT*, try other models such as *GPT-3* for context-aware embeddings, *FastText* for handling out-of-vocabulary words, or domain-specific models trained on legal or policy-related corpora.

- **Similarity Measures:** Beyond *cosine similarity*, explore other measures like *Euclidean distance* or *Manhattan distance* for comparing embeddings.

- **Clustering Techniques:** Apply unsupervised clustering (e.g., *K-means, DBSCAN*) to group policies based on their embeddings and analyze clusters for GDPR compliance themes.

- **Dimensionality Reduction:** Use techniques like *PCA* or *t-SNE* to reduce the dimensionality of the embeddings before applying similarity measures or clustering, which might reveal different patterns.

- **Threshold Tuning:** Experiment with different *thresholds* for considering a policy potentially compliant based on similarity scores.

- **Text Preprocessing Variations:** Test the impact of different *preprocessing steps*, such as including/excluding stopwords, using stemming versus lemmatization, or experimenting with n-grams.

- **Segmentation Strategies:** Instead of analyzing entire documents, try segmenting policies into smaller *sections or paragraphs* and assess their individual compliance.