In [1]:
import os
from bs4 import BeautifulSoup
import numpy as np
import re

### Extract content from HTML documents

In [2]:

def convert_to_text(src_dir, target_dir):
    """Extract the text of selected HTML elements from every file in ``src_dir``.

    For each regular file in ``src_dir`` the text of all ``title``, ``body``,
    ``p`` and ``strong`` elements is concatenated and written to a file with
    the same name in ``target_dir``.

    Args:
        src_dir: Directory containing the HTML documents (read as ISO-8859-1).
        target_dir: Directory that receives the extracted text files; it is
            created when it does not exist yet.
    """
    os.makedirs(target_dir, exist_ok=True)

    for file in os.listdir(src_dir):
        src_path = os.path.join(src_dir, file)
        # Sub-directories cannot be opened as documents, so skip them.
        if not os.path.isfile(src_path):
            continue

        with open(src_path, 'r', encoding="ISO-8859-1") as f:
            print(file)
            bs = BeautifulSoup(f.read(), 'html.parser')

        # NOTE: every matching tag is returned, nested ones included, so text
        # inside e.g. a <p> within <body> is written once per matching tag.
        texts = bs.find_all(['title', 'body', 'p', 'strong'])

        with open(os.path.join(target_dir, file), 'w') as f:
            for t in texts:
                f.write(t.text)

### Remove tags from the documents

In [3]:
def convert_clean_summaries(src_dir, target_dir):
    """Strip markup tags from every file in ``src_dir`` and save the result.

    Each regular file in ``src_dir`` is read as ISO-8859-1, everything that
    matches ``<...>`` is removed, and the remaining text is written to
    ``target_dir`` under the part of the original file name that precedes the
    first dot, with a ``.txt`` extension.

    Args:
        src_dir: Directory containing the tagged documents.
        target_dir: Directory that receives the tag-free ``.txt`` files; it is
            created when it does not exist yet.
    """
    # Compiled once for all files. The match is non-greedy and '.' does not
    # match a newline, so a tag that spans several lines is left untouched.
    tag_pattern = re.compile('<.*?>')

    os.makedirs(target_dir, exist_ok=True)

    for file in os.listdir(src_dir):
        src_path = os.path.join(src_dir, file)
        # Sub-directories cannot be opened as documents, so skip them.
        if not os.path.isfile(src_path):
            continue

        with open(src_path, 'r', encoding="ISO-8859-1") as f:
            cleantext = tag_pattern.sub('', f.read())

        # Only the text before the FIRST dot is kept as the base name.
        filename = file.split('.')[0] + '.txt'

        with open(os.path.join(target_dir, filename), 'w') as f:
            f.write(cleantext)



### Remove punctuation from data

In [5]:
def remove_punctuation(data):
    """Remove punctuation and underscores from ``data`` and squeeze spaces.

    Every character that is neither a word character nor whitespace is
    deleted, as are underscores (which ``\\w`` would otherwise keep). Runs of
    consecutive spaces are then collapsed to a single space; tabs and
    newlines are left as they are.

    Args:
        data: The text to clean.

    Returns:
        The cleaned text.
    """
    # Raw strings keep ``\w`` / ``\s`` from being read as (invalid) string
    # escape sequences.
    data = re.sub(r"[^\w\s]|_", "", data)
    data = re.sub(r" +", " ", data)

    return data

In [7]:
### Insert Code for creating training dict

### Identify Topics for each category of Privacy Policy

In [18]:
import spacy
from collections import defaultdict


# Load the spaCy English pipeline once; get_common_topics() uses `nlp` to
# tokenize text and to flag stop words.
nlp = spacy.load('en_core_web_sm')

In [11]:
import json
# Parse the pre-processed policy file (JSON content despite the .txt
# extension) into `labeled_data`.
with open('../data/notags_policies/preprocessed_train/parsed_policies.txt') as f:
    labeled_data = json.load(f)

In [69]:
def identify_topics(data, output_path='../results/policy_topics.txt'):
    """Find the most common words per category and append them to a report.

    ``data`` is a mapping whose values are themselves mappings from a
    sentence to its category. All inner mappings are merged into a single
    sentence -> category mapping (if the same sentence occurs more than once,
    the last category seen wins), the sentences are grouped by category in
    sorted order, and ``get_common_topics`` is run on the groups.

    Args:
        data: Mapping of mappings, ``{key: {sentence: category}}``.
        output_path: File the report is written to. It is opened in append
            mode, so repeated calls add to the existing content.

    Returns:
        The mapping returned by ``get_common_topics``: for every category a
        list of ``(word, count)`` pairs.
    """
    # Flatten all inner mappings into one sentence -> category mapping.
    labeled_sentences = {}
    for sentences in data.values():
        labeled_sentences.update(sentences)

    # Regroup so that each category maps to the list of its sentences.
    category_sentences = defaultdict(list)
    for sentence, category in sorted(labeled_sentences.items()):
        category_sentences[category].append(sentence)

    most_common_topics = get_common_topics(category_sentences)

    with open(output_path, 'a') as file:
        for category, counted_words in most_common_topics.items():
            words = [word for word, _ in counted_words]
            file.write('Most common topics for category ' + str(category) + '\n')
            file.write(str(words) + '\n\n')

    return most_common_topics


In [70]:
from collections import Counter
def get_common_topics(category_dict, top_n=30):
    """Count the most frequent non-stop-word tokens for each category.

    The sentences of a category are joined with spaces, passed through
    ``remove_punctuation`` and tokenized with the module-level ``nlp``
    pipeline. Tokens flagged as stop words are ignored. Counting uses the
    token text as-is, so differently cased forms (e.g. ``'Services'`` and
    ``'services'``) are counted separately.

    Args:
        category_dict: Mapping from a category to its list of sentences.
        top_n: Number of most frequent words to keep per category
            (``None`` keeps all of them). Defaults to 30.

    Returns:
        Dict mapping each category to a list of ``(word, count)`` pairs,
        ordered from most to least frequent.
    """
    all_topics = {}
    for category, sentences in category_dict.items():
        doc = nlp(remove_punctuation(' '.join(sentences)))
        word_counts = Counter(token.text for token in doc if not token.is_stop)
        all_topics[category] = word_counts.most_common(top_n)
    return all_topics
        

In [71]:
# Run the topic extraction on the labeled data; the returned mapping is shown
# as this cell's output.
identify_topics(labeled_data)

{'First Party Collection/Use': [('information', 1160),
  ('services', 735),
  ('use', 641),
  ('provide', 433),
  ('products', 373),
  ('Services', 365),
  ('site', 333),
  ('content', 306),
  ('users', 254),
  ('Site', 236),
  ('improve', 233),
  ('purposes', 232),
  ('service', 230),
  ('collect', 219),
  ('help', 210),
  ('online', 207),
  ('experience', 205),
  ('email', 200),
  ('features', 195),
  ('account', 191),
  ('order', 181),
  ('send', 181),
  ('advertising', 175),
  ('certain', 174),
  ('Sites', 160),
  ('sites', 154),
  ('user', 152),
  ('address', 151),
  ('cookies', 147),
  ('including', 147)],
 'User Access, Edit and Deletion': [('information', 327),
  ('account', 107),
  ('personal', 83),
  ('update', 44),
  ('email', 40),
  ('request', 36),
  ('personally', 35),
  ('provided', 34),
  ('contact', 34),
  ('identifiable', 33),
  ('Information', 32),
  ('time', 32),
  ('preferences', 30),
  ('access', 30),
  ('delete', 28),
  ('profile', 27),
  ('changes', 25),
  ('cha