In [1]:

import os
from bs4 import BeautifulSoup
import re

## Data cleaning and Preprocessing

In [2]:
# Convert HTML documents to text

def convert_to_text(src_dir, target_dir):
    for file in os.listdir('../data/original_policies'):
        with open('../data/original_policies' + '/' + file, 'r', encoding="ISO-8859-1") as f:
            print(file)
            data = f.read()
            # print(data)
            bs = BeautifulSoup(data,'html.parser')
            texts = bs.findAll(['title', 'body','p','strong'])

        with open('../data/clean_policies' + '/' + file, 'w') as f:
            for t in texts:
                f.write(t.text)


In [3]:
# Remove tags from the documents
def convert_clean_summaries(src_dir, target_dir):
    for file in os.listdir('../data/sanitized_policies'):
        with open('../data/sanitized_policies' + '/' + file, 'r', encoding="ISO-8859-1") as f:

            cleanr = re.compile('<.*?>')
            cleantext = re.sub(cleanr, '', f.read())
            filename = file.split('.', -1)[0] + '.txt'

        with open('../data/notags_policies' + '/' + filename, 'w') as f:
            f.write(cleantext)


In [4]:
# remove all the punctuations from the text
def remove_punctuation(data):
    data = re.sub("_", "", data)
    data = re.sub("[^\w\s]", "", data)
    data = re.sub(' +', ' ', data)

    return data




In [None]:
import spacy
import os
import numpy as np
import scipy.spatial.distance as distance
import gensim
from collections import defaultdict
import re
from src.preprocess import remove_punctuation
from sklearn.neighbors import KNeighborsClassifier
import math


word2vec = gensim.models.KeyedVectors.load_word2vec_format(
    "../data/GoogleNews-vectors-negative300.bin", binary=True)
nlp = spacy.load('en_core_web_sm')

In [None]:
para2vec = gensim.models.KeyedVectors.load_doc2vec_format

In [7]:
CLASS_NUM = {'Data Retention' : 0,
 'Data Security' : 1,
 'Do Not Track' : 2,
 'First Party Collection/Use' : 3,
 'International and Specific Audiences' : 4,
 'Policy Change' : 5,
 'Third Party Sharing/Collection' : 6,
 'User Access, Edit and Deletion' : 7,
 'User Choice/Control' : 8,
 'Other' : 9}

## Word Weights

In [None]:
def word_weights(filepath):

    with open(filepath) as f:
        content = f.read()



## Create Vectors

In [None]:
def get_sentence_vector(sentence):
    print("Geting maximum sentence similarity of each sentence...")
    word2vec_dict = word2vec.vocab.keys()

    wordvecs = []
    # sentence = nlp(sentence)
    for word in sentence:
        # Words in a summary that are not covered by Word2Vec are discarded.
        word = str(word).lower()
        if word in word2vec_dict:
            wordvecs.append(word2vec[word])

    wordvecs = np.array(wordvecs)
    # represent each sentence as average of its word embedding
    sentence_score = np.mean(wordvecs, axis=0)


    return sentence_score



In [8]:
# TODO: Modify explanation


#  Paragraph2Vec technique includes several different algorithm. 
# The only difference from word2vec is inclusion of documents along with words as input nodes. 
# P2V neural net has input nodes representing documents in the training data (see fig 2).
# The rationale behind including documents as input nodes is based upon considering documents as another context. 
# In this abstract sense of context there is no difference between a word and a document. 
# At the time of training we consider (context set, target) pairs as in word2vec, however,
# for P2V document is also considered a member of the context set.
# The objective function and the training update steps are exactly the same as word2vec.

def get_paragraph_vector(paragraph):
#     print("Vectorizing Paragraph")
    word2vec_dict = word2vec.vocab.keys()
    sentences = nlp(paragraph)

    wordvecs = []
    # sentence = nlp(sentence)
    for word in sentences:
        # Words in a summary that are not covered by Word2Vec are discarded.
        word = str(word.text).lower()
        if word in word2vec_dict:
            wordvecs.append(word2vec[word])

    wordvecs = np.array(wordvecs)
    # represent each sentence as average of its word embedding
    paragraph_score = np.mean(wordvecs, axis=0)


    return paragraph_score



## Baseline Model

In [None]:
def calculate_sentence_sim(s1):
    #       - Statement 1 (Clear Purpose): For what purposes does the company use personal information?
    #     # - Statement 2 (Third Parties): Does the company share my information with third parties?
    #     # - Statement 3 (Limited Collection): Does the company combine my information with data from other sources?
    #     # - Statement 4 (Limited Use): Will the company sell, re-package or commercialize my data?
    #     # - Statement 5 (Retention): Will the company retain my data? What is their retention policy?

    s  = get_sentence_vector(s1)
    c1 = get_sentence_vector(nlp(remove_punctuation("For what purposes does the company use personal information?")))
    c2 = get_sentence_vector(nlp(remove_punctuation("Does the company share my information with third parties?")))
    c3 = get_sentence_vector(nlp(remove_punctuation("Does the company combine my information with data from other sources?")))
    c4 = get_sentence_vector(nlp(remove_punctuation("Will the company sell, re-package or commercialize my data?")))
    c5 = get_sentence_vector(nlp(remove_punctuation("Will the company retain my data? What is their retention policy?")))

    res = [0] * 5
    res[0] = 1 - distance.cosine(s, c1)
    res[1] = 1 - distance.cosine(s, c2)
    res[2] = 1 - distance.cosine(s, c3)
    res[3] = 1 - distance.cosine(s, c4)
    res[4] = 1 - distance.cosine(s, c5)

    return res

## KNeighborsClassifier  Model

In [10]:
## Get Train Data
import json

with open('../data/notags_policies/sample_train/parsed_policies.txt') as f:
    all_policies = json.loads(f.read())



In [None]:
def get_vector_for_policy(data, training_files):
    para_dict = {}
    
    for filename in training_files:
        for key in data[filename]:
            para_dict[key] = CLASS_NUM[data[filename][key]]

    return para_dict



In [18]:
def get_k_fold_cross_validation(classifier, X, Y, K):
    res = []
    scores = []
    for k in range(K):
        train_x = [x for i, x in enumerate(X) if i % K != k]
        train_y = [y for i, y in enumerate(Y) if i % K != k]

        validate_x = [x for i, x in enumerate(X) if i % K == k]
        validate_y = [y for i, y in enumerate(Y) if i % K == k]
        res.append(((train_x, train_y), (validate_x, validate_y)))

    for train, validate in res:
        classifier.fit(train[0], train[1])
        score = classifier.score(validate[0], validate[1])
        scores.append(score)

    return np.array(scores).mean()

In [None]:
with open('../data/notags_policies/sample_train/train_list.txt') as f:
    content = f.read()
content

training_files = ["414_washingtonian.com.csv", "856_sciencemag.org.csv", "70_meredith.com.csv", "1636_sidearmsports.com.csv", "1224_austincc.edu.csv", "1510_jibjab.com.csv", "453_barnesandnoble.com.csv", "1099_enthusiastnetwork.com.csv", "98_neworleansonline.com.csv", "59_liquor.com.csv", "940_internetbrands.com.csv", "883_ted.com.csv", "82_sheknows.com.csv", "394_newsbusters.org.csv", "164_adweek.com.csv", "33_nbcuniversal.com.csv", "640_gamestop.com.csv", "652_randomhouse.com.csv", "1618_sltrib.com.csv", "1713_latinpost.com.csv", "891_everydayhealth.com.csv", "105_amazon.com.csv", "303_reddit.com.csv", "58_esquire.com.csv", "591_google.com.csv", "207_reference.com.csv", "32_voxmedia.com.csv", "1050_honda.com.csv", "144_style.com.csv", "807_lodgemfg.com.csv", "1034_aol.com.csv", "1089_freep.com.csv", "1164_acbj.com.csv", "517_kaleidahealth.org.csv", "1694_lids.com.csv", "1028_redorbit.com.csv", "1419_miaminewtimes.com.csv", "1468_rockstargames.com.csv", "1683_dailynews.com.csv", "746_kraftrecipes.com.csv", "348_walmart.com.csv", "928_stlouisfed.org.csv", "21_imdb.com.csv", "320_timeinc.com.csv", "20_theatlantic.com.csv", "202_foodallergy.org.csv", "26_nytimes.com.csv", "1666_wsmv.com.csv", "1070_wnep.com.csv", "686_military.com.csv", "1539_geocaching.com.csv", "641_cbsinteractive.com.csv", "200_washingtonpost.com.csv", "135_instagram.com.csv", "1360_thehill.com.csv", "1306_chasepaymentech.com.csv", "1470_steampowered.com.csv", "186_abcnews.com.csv", "1610_post-gazette.com.csv", "1708_foxsports.com.csv", "635_playstation.com.csv", "175_mlb.mlb.com.csv", "541_ifsa-butler.org.csv", "359_vikings.com.csv", "1259_fool.com.csv", "133_fortune.com.csv", "1300_bankofamerica.com.csv", "962_lynda.com.csv", "1106_allstate.com.csv", "1582_msn.com.csv"]

In [None]:
training_data = get_vector_for_policy(all_policies, training_files)
t_data = {}
for i in training_data:
    if not np.isnan(get_paragraph_vector(i)).any():
        t_data[i] = training_data[i]

In [None]:
len(t_data)


In [None]:
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import cross_val_score
import numpy as np


def train_single_file(train_data):
    x_data = []
    for i in train_data.keys():
        x_data.append(get_paragraph_vector(i))

    y_data = list(train_data.values())
    
    
    
    scores = []
    knns = []
    for i in range(1, 10):
        knn = KNeighborsClassifier(n_neighbors = i)
        mean_score = get_k_fold_cross_validation(knn, x_data, y_data, 5)
        scores.append(mean_score)
        knns.append(knn)

    index = np.argmax(scores)
    
    print("Grid search result:")
    print(scores[index])
    print(scores)
    return knns[index], x_data, y_data 
        

In [None]:
knn_gscv, x_data, y_data = train_single_file(t_data)

In [None]:
para_test_dict = {}

for filename in all_policies.keys():
    if filename not in training_files:
        for key in all_policies[filename]:
            para_test_dict[key] = CLASS_NUM[all_policies[filename][key]]






In [None]:
y_predict = []

for i in para_test_dict.keys():

    if np.isnan(get_paragraph_vector(i)).any():
        y_predict.append(9)
    else:
        y_predict.append(knn_gscv.predict([get_paragraph_vector(i)]))

In [None]:
y_knn_predict = []
for i in y_predict:
    if isinstance(i, int):
        y_knn_predict.append(i)    
    else:
        y_knn_predict.append(i[0])
        
y_knn_predict
len(y_knn_predict)

In [None]:
data = [x_data, y_data ]

## MLPClassifier Model

In [16]:
from sklearn.neural_network import MLPClassifier

def optimize_parameters(data, noOfFirstHiddenLayer, maxIter, learningRateInit):
    nf = noOfFirstHiddenLayer
    mi = maxIter
    lr = learningRateInit
    params = [[x, y, 10 ** z]
              for x in range(nf[0], nf[1] + nf[2], nf[2])
              for y in range(mi[0], mi[1] + mi[2], mi[2])
              for z in range(int(np.log10(lr[0])),
                             int(np.log10(lr[1])) + lr[2],
                             lr[2])]
    scores = []
    mlps = []
    for param in params:
        mlp = MLPClassifier((param[0], 10), max_iter=param[1],
                            learning_rate_init=param[2])
        mean_score = get_k_fold_cross_validation(mlp, data[0], data[1], 2)


        scores.append(mean_score)
        mlps.append(mlp)

    index = np.argmax(scores)

    print("Grid search result:")

    print(params[index])
    print(scores[index])
    return mlps[index]

In [None]:
mlp_model = optimize_parameters(data, [100,130, 10],[200, 400, 100], [0.001, 0.1, 1])

In [None]:
y_m_predict = []

for i in para_test_dict.keys():

    if np.isnan(get_paragraph_vector(i)).any():
        y_m_predict.append(9)
    else:
        y_m_predict.append(mlp_model.predict([get_paragraph_vector(i)]))

In [None]:
y_mlp_predict = []
for i in y_m_predict:
    if isinstance(i, int):
        y_mlp_predict.append(i)    
    else:
        y_mlp_predict.append(i[0])
        

len(y_mlp_predict)

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

clf = LDA()
mean_score = get_k_fold_cross_validation(clf, x_data, y_data, 5)

mean_score


In [None]:
y_ld_predict = []

for i in para_test_dict.keys():

    if np.isnan(get_paragraph_vector(i)).any():
        y_ld_predict.append(9)
    else:
        y_ld_predict.append(clf.predict([get_paragraph_vector(i)]))

In [None]:
y_lda_predict = []
for i in y_ld_predict:
    if isinstance(i, int):
        y_lda_predict.append(i)    
    else:
        y_lda_predict.append(i[0])
        

len(y_lda_predict)

## **********

In [None]:
def get_policy_vectors(filepath):

    with open(filepath,'r') as f:

        data = nlp(remove_punctuation(f.read()))
        res = {}
        for sentence in data.sents:
            sentence_sim_vector = calculate_sentence_sim(sentence)
            # if max(sentence_sim_vector) >= 0.65:
            #     print(sentence)
            res[sentence] = sentence_sim_vector

    return res



In [None]:
def get_policy_vectors_sents(filepath):

    with open(filepath,'r') as f:

        data = nlp(remove_punctuation(f.read()))
        res = {}
        for sentence in data.sents:

            sentence_sim_vector = get_sentence_vector(nlp((str(sentence.text))))

            if np.isnan(sentence_sim_vector).any():
                res[sentence] = np.array([0.0]*300)
                continue
            print(type(sentence_sim_vector))
            # if math.isnan(sentence_sim_vector):
            #     continue
            # if max(sentence_sim_vector) >= 0.65:
            #     print(sentence)
            res[sentence] = sentence_sim_vector

    return res

In [None]:
# def get_all_policy_vectors(src_path):
#     all_policies = {}
#     for file in os.listdir(src_path):
#         with open(src_path + '/'+ file, 'r') as f:
#             all_policies[file] = get_all_policy_vectors(src_path + '/'+ file)
#
#     return all_policies



similarity_array = get_policy_vectors('../data/notags_policies/33_nbcuniversal.txt')
vector_array = get_policy_vectors_sents('../data/notags_policies/33_nbcuniversal.txt')

class_array= {}
for k,row in similarity_array.items():
    if max(row) >= 0.65:
        class_array[k] = row.index(max(row)) + 1
    else:
        class_array[k] = 0





# Evaluation

In [None]:
# CLASS_NUM = {'Data Retention' : 0,
#  'Data Security' : 1,
#  'Do Not Track' : 2,
#  'First Party Collection/Use' : 3,
#  'International and Specific Audiences' : 4,
#  'Policy Change' : 5,
#  'Third Party Sharing/Collection' : 6,
#  'User Access, Edit and Deletion' : 7,
#  'User Choice/Control' : 8,
#  'Other' : 9}

### Results: Knearest Neighbors

In [53]:
from sklearn.metrics import precision_recall_fscore_support as score

predicted = y_knn_predict
y_test =list(para_test_dict.values())



precision_knn, recall_knn, fscore_knn, support_knn = score(y_test, predicted)

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

NameError: name 'y_knn_predict' is not defined

### Results: MLP Classifier 2 Hidden layers

In [None]:
from sklearn.metrics import precision_recall_fscore_support as score

predicted = y_mlp_predict
y_test =list(para_test_dict.values())



precision_mlp, recall_mlp, fscore_mlp, support_mlp = score(y_test, predicted)

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

### Results: MLP Classifier 2 Hidden layers

In [None]:
from sklearn.metrics import precision_recall_fscore_support as score

predicted = y_lda_predict
y_test =list(para_test_dict.values())



precision_lda, recall_lda, fscore_lda, support_lda = score(y_test, predicted)

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.title("Precision by different models")
plt.scatter(range(0,10), precision_knn, color='blue', label= 'Knn')
plt.scatter(range(0,10), precision_mlp, color='orange', label = 'MLP' )
plt.scatter(range(0,10), precision_lda, color='green', label="LDA")

plt.xticks([0,1, 2,3,4,5,6,7,8,9],('Data Retention','Data Security', 'Do Not Track', 'First Party Collection/Use', 'International and Specific Audiences',
            'Policy Change','Third Party Sharing/Collection','User Access, Edit and Deletion','User Choice/Control','Other'), rotation=90)

plt.legend()

plt.show()

In [None]:
plt.title("Fscore by different models")
plt.scatter(range(0,10), fscore_knn, color='blue', label= 'Knn')
plt.scatter(range(0,10), fscore_mlp, color='orange', label = 'MLP' )
plt.scatter(range(0,10), fscore_lda, color='green', label="LDA")

plt.xticks([0,1, 2,3,4,5,6,7,8,9],('Data Retention','Data Security', 'Do Not Track', 'First Party Collection/Use', 'International and Specific Audiences',
            'Policy Change','Third Party Sharing/Collection','User Access, Edit and Deletion','User Choice/Control','Other'), rotation=90)

plt.legend()

plt.show()

## Analysis of Summary Data

In [12]:
with open('../data/notags_policies/sample_train/dict0.49543508709194095.txt') as f:
    content = json.loads(f.read())
content.keys()

dict_keys(['First Party Collection/Use', 'Third Party Sharing/Collection', 'User Choice/Control', 'Data Security', 'User Access, Edit and Deletion', 'International and Specific Audiences', 'Data Retention', 'Policy Change', 'Do Not Track', 'Other'])

In [64]:
x_analysis_data = {}
y_analysis_data = {}


for key in content:
    x_a_data = []
    y_a_data = []
    for i in content[key]:
            
        if np.isnan(get_paragraph_vector(i)).any():
            print("Error")
        else:
            x_a_data.append(get_paragraph_vector(i))
            y_a_data.append(content[key][i])
    x_analysis_data[key] = x_a_data
    y_analysis_data[key] = y_a_data
        
# for i in content['User Access, Edit and Deletion'].keys():
    
#     if np.isnan(get_paragraph_vector(i)).any():
#         print("Error")
#     else:
#         x_security_data.append(get_paragraph_vector(i))
#         y_security_data.append(content['User Access, Edit and Deletion'][i])

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Error
Error
Error
Error
Error


In [65]:
all_analysis_mlp = {}
for key in content:
    all_analysis_mlp[key] = optimize_parameters([x_analysis_data[key],y_analysis_data[key]], [100,130, 10],[200, 400, 100], [0.001, 0.1, 1])
    

all_analysis_mlp
    



Grid search result:
[130, 300, 0.001]
0.7040441176470589




Grid search result:
[100, 200, 0.001]
0.9099264705882353




Grid search result:
[100, 300, 0.001]
0.8180147058823529




Grid search result:
[110, 300, 0.1]
0.7573529411764706




Grid search result:
[100, 300, 0.001]
0.9375




Grid search result:
[110, 400, 0.1]
0.8




Grid search result:
[130, 400, 0.1]
0.5192307692307692




Grid search result:
[100, 300, 0.01]
0.775




Grid search result:
[100, 200, 0.001]
0.8333333333333334




Grid search result:
[130, 400, 0.1]
0.7573529411764706


{'First Party Collection/Use': MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
               beta_2=0.999, early_stopping=False, epsilon=1e-08,
               hidden_layer_sizes=(130, 10), learning_rate='constant',
               learning_rate_init=0.001, max_iter=300, momentum=0.9,
               n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
               random_state=None, shuffle=True, solver='adam', tol=0.0001,
               validation_fraction=0.1, verbose=False, warm_start=False),
 'Third Party Sharing/Collection': MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
               beta_2=0.999, early_stopping=False, epsilon=1e-08,
               hidden_layer_sizes=(100, 10), learning_rate='constant',
               learning_rate_init=0.001, max_iter=200, momentum=0.9,
               n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
               random_state=None, shuffle=True, solver='adam', tol=0.

In [66]:
with open('../data/notags_policies/sample_train/dict0.4494910647887381.txt') as f:
    test_content = json.loads(f.read())
predicted_y_analysis ={} 
for key in test_content:
    y_temp_predict = []
    for i in test_content[key].keys():

        if np.isnan(get_paragraph_vector(i)).any():
            y_temp_predict.append(1)
        else:
            y_temp_predict.append(mlp_first_party.predict([get_paragraph_vector(i)]))
    
    predicted_y_analysis[key] = y_temp_predict


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [68]:
predicted_y_a = {}
for key in test_content:
    temp =[]
    for i in predicted_y_analysis[key]:
        if isinstance(i, int):
            temp.append(i)    
        else:
            temp.append(i[0])
    predicted_y_a[key] = temp

    
predicted_y_a


{'First Party Collection/Use': ['1', '1', '1', '1', '0'],
 'Third Party Sharing/Collection': ['0', '1', '1', '0', '0'],
 'User Choice/Control': ['1', '1', '0', '1', '1'],
 'Data Security': ['1', '1', '1', '1', '0'],
 'User Access, Edit and Deletion': ['1', '1', '1', '1', '1'],
 'International and Specific Audiences': ['0', '0', '0', '0', '1'],
 'Data Retention': ['0', '0', '0', '0', '0'],
 'Policy Change': ['0', '0', '1', '0', '0'],
 'Do Not Track': ['0', 1, '0', '0'],
 'Other': ['0', '0', '1', '0', '0']}

In [72]:
for key in content:
    print(score(list(test_content[key].values()), predicted_y_a[key])[0])

# test_content['First Party Collection/Use'].values()

[0.   0.75]
[0.  0.5]
[0.   0.25]
[1. 0.]
[0.  0.4]
[0.75 1.  ]
[0.4 0. ]
[0.75 0.  ]
[0. 0. 0.]
[0.25 0.  ]
