In [1]:

import os
from bs4 import BeautifulSoup
import re

## Data cleaning and Preprocessing

In [2]:
# Convert HTML documents to text

def convert_to_text(src_dir, target_dir):
    for file in os.listdir('../data/original_policies'):
        with open('../data/original_policies' + '/' + file, 'r', encoding="ISO-8859-1") as f:
            print(file)
            data = f.read()
            # print(data)
            bs = BeautifulSoup(data,'html.parser')
            texts = bs.findAll(['title', 'body','p','strong'])

        with open('../data/clean_policies' + '/' + file, 'w') as f:
            for t in texts:
                f.write(t.text)


In [3]:
# Remove tags from the documents
def convert_clean_summaries(src_dir, target_dir):
    for file in os.listdir('../data/sanitized_policies'):
        with open('../data/sanitized_policies' + '/' + file, 'r', encoding="ISO-8859-1") as f:

            cleanr = re.compile('<.*?>')
            cleantext = re.sub(cleanr, '', f.read())
            filename = file.split('.', -1)[0] + '.txt'

        with open('../data/notags_policies' + '/' + filename, 'w') as f:
            f.write(cleantext)


In [4]:
# remove all the punctuations from the text
def remove_punctuation(data):
    data = re.sub("_", "", data)
    data = re.sub("[^\w\s]", "", data)
    data = re.sub(' +', ' ', data)

    return data




In [6]:
import spacy
import os
import numpy as np
import scipy.spatial.distance as distance
import gensim
from collections import defaultdict
import re
from src.preprocess import remove_punctuation
from sklearn.neighbors import KNeighborsClassifier
import math


word2vec = gensim.models.KeyedVectors.load_word2vec_format(
    "../data/GoogleNews-vectors-negative300.bin", binary=True)
nlp = spacy.load('en_core_web_sm')

In [None]:
para2vec = gensim.models.KeyedVectors.load_doc2vec_format

In [23]:
CLASS_NUM = {'Data Retention' : 0,
 'Data Security' : 1,
 'Do Not Track' : 2,
 'First Party Collection/Use' : 3,
 'International and Specific Audiences' : 4,
 'Policy Change' : 5,
 'Third Party Sharing/Collection' : 6,
 'User Access, Edit and Deletion' : 7,
 'User Choice/Control' : 8,
 'Other' : 9}

## Word Weights

In [7]:
def word_weights(filepath):

    with open(filepath) as f:
        content = f.read()



## Create Vectors

In [10]:
def get_sentence_vector(sentence):
    print("Geting maximum sentence similarity of each sentence...")
    word2vec_dict = word2vec.vocab.keys()

    wordvecs = []
    # sentence = nlp(sentence)
    for word in sentence:
        # Words in a summary that are not covered by Word2Vec are discarded.
        word = str(word).lower()
        if word in word2vec_dict:
            wordvecs.append(word2vec[word])

    wordvecs = np.array(wordvecs)
    # represent each sentence as average of its word embedding
    sentence_score = np.mean(wordvecs, axis=0)


    return sentence_score



In [38]:
# TODO: Modify explanation


#  Paragraph2Vec technique includes several different algorithm. 
# The only difference from word2vec is inclusion of documents along with words as input nodes. 
# P2V neural net has input nodes representing documents in the training data (see fig 2).
# The rationale behind including documents as input nodes is based upon considering documents as another context. 
# In this abstract sense of context there is no difference between a word and a document. 
# At the time of training we consider (context set, target) pairs as in word2vec, however,
# for P2V document is also considered a member of the context set.
# The objective function and the training update steps are exactly the same as word2vec.

def get_paragraph_vector(paragraph):
    word2vec_dict = word2vec.vocab.keys()
    sentences = nlp(paragraph)

    wordvecs = []
    # sentence = nlp(sentence)
    for word in sentences:
        # Words in a summary that are not covered by Word2Vec are discarded.
        word = str(word.text).lower()
        if word in word2vec_dict:
            wordvecs.append(word2vec[word])

    wordvecs = np.array(wordvecs)
    # represent each sentence as average of its word embedding
    paragraph_score = np.mean(wordvecs, axis=0)


    return paragraph_score



## Baseline Model

In [29]:
def calculate_sentence_sim(s1):
    #       - Statement 1 (Clear Purpose): For what purposes does the company use personal information?
    #     # - Statement 2 (Third Parties): Does the company share my information with third parties?
    #     # - Statement 3 (Limited Collection): Does the company combine my information with data from other sources?
    #     # - Statement 4 (Limited Use): Will the company sell, re-package or commercialize my data?
    #     # - Statement 5 (Retention): Will the company retain my data? What is their retention policy?

    s  = get_sentence_vector(s1)
    c1 = get_sentence_vector(nlp(remove_punctuation("For what purposes does the company use personal information?")))
    c2 = get_sentence_vector(nlp(remove_punctuation("Does the company share my information with third parties?")))
    c3 = get_sentence_vector(nlp(remove_punctuation("Does the company combine my information with data from other sources?")))
    c4 = get_sentence_vector(nlp(remove_punctuation("Will the company sell, re-package or commercialize my data?")))
    c5 = get_sentence_vector(nlp(remove_punctuation("Will the company retain my data? What is their retention policy?")))

    res = [0] * 5
    res[0] = 1 - distance.cosine(s, c1)
    res[1] = 1 - distance.cosine(s, c2)
    res[2] = 1 - distance.cosine(s, c3)
    res[3] = 1 - distance.cosine(s, c4)
    res[4] = 1 - distance.cosine(s, c5)

    return res

## Learning Model

In [30]:
## Get Train Data
import json

with open('../data/notags_policies/sample_train/parsed_policies.txt') as f:
    data = json.loads(f.read())



In [31]:
def get_vector_for_policy(data, filename):
    para_dict = {}
    for key in data[filename]:
        para_dict[key] = CLASS_NUM[data[filename][key]]
    
    return para_dict
        


In [73]:
training_data = get_vector_for_policy(data, "414_washingtonian.com.csv")

In [138]:
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import cross_val_score
import numpy as np
def train_single_file(train_data):
    x_data = []
    for i in train_data.keys():
        x_data.append(get_paragraph_vector(i))

    y_data = list(training_data.values())
    
    
    knn = KNeighborsClassifier()

    param_grid = {'n_neighbors': np.arange(1, 25)}

#     # Cross validation
#     knn_gscv = GridSearchCV(knn, param_grid, error_score='raise', cv =5)
    
    
    


    #create a new KNN model
    knn_cv = KNeighborsClassifier(n_neighbors=3)

    #train model with cv of 5 
    cv_scores = cross_val_score(knn_cv,  x_data, y_data, cv=5)

    #print each cv score (accuracy) and average them
    print(cv_scores)
    print('cv_scores mean:{}'.format(np.mean(cv_scores)))

    #fit model to data
    return knn_cv, x_data, y_data
#     print(knn_model.score(x_data, y_data))
    
#     return knn


In [139]:
 knn_gscv, x_data, y_data = train_single_file(training_data)

3
[0.35294118 0.375      0.28571429 0.23076923 0.2       ]
cv_scores mean:0.2888849385908209




In [137]:
knn_gscv.fit(x_data, y_data)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [129]:
knn_gscv.best_params_

{'n_neighbors': 19}

In [130]:
knn_gscv.best_score_

0.35714285714285715

## **********

In [13]:
def get_policy_vectors(filepath):

    with open(filepath,'r') as f:

        data = nlp(remove_punctuation(f.read()))
        res = {}
        for sentence in data.sents:
            sentence_sim_vector = calculate_sentence_sim(sentence)
            # if max(sentence_sim_vector) >= 0.65:
            #     print(sentence)
            res[sentence] = sentence_sim_vector

    return res



In [12]:
def get_policy_vectors_sents(filepath):

    with open(filepath,'r') as f:

        data = nlp(remove_punctuation(f.read()))
        res = {}
        for sentence in data.sents:

            sentence_sim_vector = get_sentence_vector(nlp((str(sentence.text))))

            if np.isnan(sentence_sim_vector).any():
                res[sentence] = np.array([0.0]*300)
                continue
            print(type(sentence_sim_vector))
            # if math.isnan(sentence_sim_vector):
            #     continue
            # if max(sentence_sim_vector) >= 0.65:
            #     print(sentence)
            res[sentence] = sentence_sim_vector

    return res

In [14]:
# def get_all_policy_vectors(src_path):
#     all_policies = {}
#     for file in os.listdir(src_path):
#         with open(src_path + '/'+ file, 'r') as f:
#             all_policies[file] = get_all_policy_vectors(src_path + '/'+ file)
#
#     return all_policies



similarity_array = get_policy_vectors('../data/notags_policies/33_nbcuniversal.txt')
vector_array = get_policy_vectors_sents('../data/notags_policies/33_nbcuniversal.txt')

class_array= {}
for k,row in similarity_array.items():
    if max(row) >= 0.65:
        class_array[k] = row.index(max(row)) + 1
    else:
        class_array[k] = 0





Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting max

Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting max

Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting max

Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting max

Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting maximum sentence similarity of each sentence...
Geting max

Geting maximum sentence similarity of each sentence...
<class 'numpy.ndarray'>
Geting maximum sentence similarity of each sentence...
<class 'numpy.ndarray'>
Geting maximum sentence similarity of each sentence...
<class 'numpy.ndarray'>
Geting maximum sentence similarity of each sentence...
<class 'numpy.ndarray'>
Geting maximum sentence similarity of each sentence...
<class 'numpy.ndarray'>
Geting maximum sentence similarity of each sentence...
<class 'numpy.ndarray'>
Geting maximum sentence similarity of each sentence...
<class 'numpy.ndarray'>
Geting maximum sentence similarity of each sentence...
<class 'numpy.ndarray'>
Geting maximum sentence similarity of each sentence...
<class 'numpy.ndarray'>
Geting maximum sentence similarity of each sentence...
<class 'numpy.ndarray'>
Geting maximum sentence similarity of each sentence...
<class 'numpy.ndarray'>
Geting maximum sentence similarity of each sentence...
<class 'numpy.ndarray'>
Geting maximum sentence similarity of each sentence.