## Privacy Policies Classification
### Author: Valentina Chacon Buitrago

### Imports 

In [1]:
import os
from os import listdir
from os.path import abspath, isfile, join
import numpy as np
import re
from sklearn.datasets import load_files
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Valentina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/Valentina/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Constants

In [None]:
current_file2 = os.path.abspath('privacy_policies')
print(current_file2)
for d in listdir(os.getcwd()):
    print(d)

### Dataset configuration

In [None]:
def build_dataset():
    types = ['legit', 'rogue']
    for i in types:
        
        current_dir = abspath(join('privacy_policies', i))
        
        for f in listdir(current_dir):
            file = join(current_dir,f)
            if isfile(file) and f.endswith('.txt'):
                open_file = open(file, 'r', encoding='windows-1252')
                text_data = open_file.read()
                

build_dataset()
    

### Experiment 1: Bag of words

##### Import Dataset

In [2]:
page_data = load_files(os.path.abspath('privacy_policies'))
x,y = page_data.data, page_data.target
print (x)



##### Text Preprocessing

In [3]:
stemmer = WordNetLemmatizer()

documents = []
for i in range(len(x)):
#for i in range(1):
    # Remove all break line characters
   
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(x[i]))
    
    # Remove all single characters 
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    
    # Substitute multiple spaces with a single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    
    # Remove prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    
    # Convert document to lowercase
    document = document.lower()
    
    #document = [stemmer.lemmatize(word) for word in document]
    #document = ' '.join(document)
    #print('7')
    #print(document)
    
    documents.append(document)
    
print(documents[1])

arbor privacy policy nyour right to privacy is very important to us we recognize that when you choose to provide us with information about yourself you trust us to act in responsible manner we believe this information should only be used to help us provide you with better service we do not sell or rent the information you provide to us online to third parties ncollection of personal information nwhen you visit our web site and access information you remain anonymous however when you engage in certain activities on this site such as requesting product information downloading software or entering contests arbor may ask you to provide certain information about yourself such as your name mailing address mail address and other personal identifying information nwhen you submit personal information to arbor you understand and agree that arbor and its partners and trusted vendors may transfer store and process your customer profile in any of the countries in which arbor conducts business inclu

In [4]:
docs = []
for i in range(10):
    docs.append(documents[i])
print(docs)



#### Convert Text to Numbers and find TFIDF

In [5]:
tfidfconverter = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
x = tfidfconverter.fit_transform(documents).toarray()

#### Training and Testing Sets

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

#### Training Classification Model

In [7]:
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [8]:
y_pred = classifier.predict(x_test)

#### Evaluate the model

In [9]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[17  0]
 [ 7 10]]
              precision    recall  f1-score   support

           0       0.71      1.00      0.83        17
           1       1.00      0.59      0.74        17

    accuracy                           0.79        34
   macro avg       0.85      0.79      0.79        34
weighted avg       0.85      0.79      0.79        34

0.7941176470588235


In [10]:
classifier = RandomForestClassifier(n_estimators=100, random_state=0)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[17  0]
 [ 7 10]]
              precision    recall  f1-score   support

           0       0.71      1.00      0.83        17
           1       1.00      0.59      0.74        17

    accuracy                           0.79        34
   macro avg       0.85      0.79      0.79        34
weighted avg       0.85      0.79      0.79        34

0.7941176470588235


In [11]:
classifier = RandomForestClassifier(n_estimators=10, random_state=0)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[17  0]
 [ 9  8]]
              precision    recall  f1-score   support

           0       0.65      1.00      0.79        17
           1       1.00      0.47      0.64        17

    accuracy                           0.74        34
   macro avg       0.83      0.74      0.72        34
weighted avg       0.83      0.74      0.72        34

0.7352941176470589


In [12]:
classifier = RandomForestClassifier(n_estimators=300, random_state=0)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[17  0]
 [ 7 10]]
              precision    recall  f1-score   support

           0       0.71      1.00      0.83        17
           1       1.00      0.59      0.74        17

    accuracy                           0.79        34
   macro avg       0.85      0.79      0.79        34
weighted avg       0.85      0.79      0.79        34

0.7941176470588235


In [13]:
classifier = RandomForestClassifier(n_estimators=500, random_state=0)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[17  0]
 [ 7 10]]
              precision    recall  f1-score   support

           0       0.71      1.00      0.83        17
           1       1.00      0.59      0.74        17

    accuracy                           0.79        34
   macro avg       0.85      0.79      0.79        34
weighted avg       0.85      0.79      0.79        34

0.7941176470588235


In [14]:
classifier = RandomForestClassifier(n_estimators=900, random_state=0)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[17  0]
 [ 7 10]]
              precision    recall  f1-score   support

           0       0.71      1.00      0.83        17
           1       1.00      0.59      0.74        17

    accuracy                           0.79        34
   macro avg       0.85      0.79      0.79        34
weighted avg       0.85      0.79      0.79        34

0.7941176470588235


In [15]:
classifier = RandomForestClassifier(n_estimators=1200, random_state=0)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[17  0]
 [ 7 10]]
              precision    recall  f1-score   support

           0       0.71      1.00      0.83        17
           1       1.00      0.59      0.74        17

    accuracy                           0.79        34
   macro avg       0.85      0.79      0.79        34
weighted avg       0.85      0.79      0.79        34

0.7941176470588235


In [16]:
classifier = RandomForestClassifier(n_estimators=2000, random_state=0)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[17  0]
 [ 7 10]]
              precision    recall  f1-score   support

           0       0.71      1.00      0.83        17
           1       1.00      0.59      0.74        17

    accuracy                           0.79        34
   macro avg       0.85      0.79      0.79        34
weighted avg       0.85      0.79      0.79        34

0.7941176470588235


In [17]:
classifier = RandomForestClassifier(n_estimators=5000, random_state=0)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[17  0]
 [ 7 10]]
              precision    recall  f1-score   support

           0       0.71      1.00      0.83        17
           1       1.00      0.59      0.74        17

    accuracy                           0.79        34
   macro avg       0.85      0.79      0.79        34
weighted avg       0.85      0.79      0.79        34

0.7941176470588235


#### Experiment 2

In [18]:
tfidfconverter = TfidfVectorizer(max_features=1000, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
x = tfidfconverter.fit_transform(documents).toarray()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[17  0]
 [ 8  9]]
              precision    recall  f1-score   support

           0       0.68      1.00      0.81        17
           1       1.00      0.53      0.69        17

    accuracy                           0.76        34
   macro avg       0.84      0.76      0.75        34
weighted avg       0.84      0.76      0.75        34

0.7647058823529411


In [19]:
tfidfconverter = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
x = tfidfconverter.fit_transform(documents).toarray()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[17  0]
 [ 7 10]]
              precision    recall  f1-score   support

           0       0.71      1.00      0.83        17
           1       1.00      0.59      0.74        17

    accuracy                           0.79        34
   macro avg       0.85      0.79      0.79        34
weighted avg       0.85      0.79      0.79        34

0.7941176470588235


In [20]:
tfidfconverter = TfidfVectorizer(max_features=1000, min_df=1, max_df=0.7, stop_words=stopwords.words('english'))
x = tfidfconverter.fit_transform(documents).toarray()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[17  0]
 [ 6 11]]
              precision    recall  f1-score   support

           0       0.74      1.00      0.85        17
           1       1.00      0.65      0.79        17

    accuracy                           0.82        34
   macro avg       0.87      0.82      0.82        34
weighted avg       0.87      0.82      0.82        34

0.8235294117647058


In [21]:
tfidfconverter = TfidfVectorizer(max_features=1000, min_df=2, max_df=0.7, stop_words=stopwords.words('english'))
x = tfidfconverter.fit_transform(documents).toarray()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[17  0]
 [ 7 10]]
              precision    recall  f1-score   support

           0       0.71      1.00      0.83        17
           1       1.00      0.59      0.74        17

    accuracy                           0.79        34
   macro avg       0.85      0.79      0.79        34
weighted avg       0.85      0.79      0.79        34

0.7941176470588235


In [23]:
tfidfconverter = TfidfVectorizer(max_features=1000, min_df=2, max_df=0.1, stop_words=stopwords.words('english'))
x = tfidfconverter.fit_transform(documents).toarray()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[17  0]
 [ 5 12]]
              precision    recall  f1-score   support

           0       0.77      1.00      0.87        17
           1       1.00      0.71      0.83        17

    accuracy                           0.85        34
   macro avg       0.89      0.85      0.85        34
weighted avg       0.89      0.85      0.85        34

0.8529411764705882


In [24]:
tfidfconverter = TfidfVectorizer(max_features=1000, min_df=2, max_df=0.2, stop_words=stopwords.words('english'))
x = tfidfconverter.fit_transform(documents).toarray()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[17  0]
 [ 4 13]]
              precision    recall  f1-score   support

           0       0.81      1.00      0.89        17
           1       1.00      0.76      0.87        17

    accuracy                           0.88        34
   macro avg       0.90      0.88      0.88        34
weighted avg       0.90      0.88      0.88        34

0.8823529411764706


In [27]:
tfidfconverter = TfidfVectorizer(max_features=1000, min_df=2, stop_words=stopwords.words('english'))
x = tfidfconverter.fit_transform(documents).toarray()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[17  0]
 [ 8  9]]
              precision    recall  f1-score   support

           0       0.68      1.00      0.81        17
           1       1.00      0.53      0.69        17

    accuracy                           0.76        34
   macro avg       0.84      0.76      0.75        34
weighted avg       0.84      0.76      0.75        34

0.7647058823529411


In [28]:
tfidfconverter = TfidfVectorizer(max_features=1000, min_df=0.01, max_df=0.2, stop_words=stopwords.words('english'))
x = tfidfconverter.fit_transform(documents).toarray()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[17  0]
 [ 4 13]]
              precision    recall  f1-score   support

           0       0.81      1.00      0.89        17
           1       1.00      0.76      0.87        17

    accuracy                           0.88        34
   macro avg       0.90      0.88      0.88        34
weighted avg       0.90      0.88      0.88        34

0.8823529411764706


In [29]:
tfidfconverter = TfidfVectorizer(max_features=1000, min_df=0.01, max_df=0.5, stop_words=stopwords.words('english'))
x = tfidfconverter.fit_transform(documents).toarray()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[17  0]
 [ 7 10]]
              precision    recall  f1-score   support

           0       0.71      1.00      0.83        17
           1       1.00      0.59      0.74        17

    accuracy                           0.79        34
   macro avg       0.85      0.79      0.79        34
weighted avg       0.85      0.79      0.79        34

0.7941176470588235


In [30]:
tfidfconverter = TfidfVectorizer(max_features=1000, min_df=0.01, max_df=0.2, stop_words=stopwords.words('english'))
x = tfidfconverter.fit_transform(documents).toarray()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[29  0]
 [ 9 13]]
              precision    recall  f1-score   support

           0       0.76      1.00      0.87        29
           1       1.00      0.59      0.74        22

    accuracy                           0.82        51
   macro avg       0.88      0.80      0.80        51
weighted avg       0.87      0.82      0.81        51

0.8235294117647058
