## Privacy Policies Classification
### Author: Valentina Chacon Buitrago

##### Imports

In [5]:
import os
from os import listdir
from os.path import abspath, isfile, join
import numpy as np
import re
from sklearn.datasets import load_files
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
import pickle
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Valentina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/Valentina/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


##### Import Dataset

In [8]:
# Load every file found under ./privacy_policies (path resolved against the
# current working directory).
dataset_path = os.path.abspath('privacy_policies')
page_data = load_files(dataset_path)

# x: raw file contents (decoded later, during preprocessing)
# y: target labels, one per file
x = page_data.data
y = page_data.target

##### Text Preprocessing

In [9]:
# NOTE: despite its name, `stemmer` is a WordNet lemmatizer. The name is kept
# so that any other cell referring to it keeps working.
stemmer = WordNetLemmatizer()


def clean_document(raw_bytes, lemmatizer):
    """Normalise one raw policy file into a lowercase, lemmatized string.

    Parameters
    ----------
    raw_bytes : bytes
        File contents as returned by ``load_files``.
    lemmatizer : object
        Anything exposing ``lemmatize(word) -> str`` (here a WordNetLemmatizer).

    Returns
    -------
    str
        Space-separated lemmatized tokens; an empty string if nothing is left.
    """
    # NOTE(review): 'mac_roman' is the encoding chosen by the author --
    # presumably the files were saved on a Mac; confirm against the corpus.
    text = str(raw_bytes, 'mac_roman')

    # Replace every non-word character with a space. Line breaks (\r, \n) are
    # non-word characters too, so they are covered by this step.
    text = re.sub(r'\W', ' ', text)

    # Remove stand-alone single ASCII letters. The look-around assertions do not
    # consume the neighbouring whitespace, so consecutive single letters
    # ("a b c") and letters at the very start or end of the text are all caught.
    # This replaces two earlier steps: one that skipped every other letter in
    # such runs, and one whose pattern r'\^[a-zA-Z]\s+' escaped the caret and
    # therefore never matched.
    text = re.sub(r'(?<!\S)[a-zA-Z](?!\S)', ' ', text)

    # Remove all numbers
    text = re.sub(r'\d+', '', text)

    # Substitute multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)

    # Convert document to lowercase
    text = text.lower()

    # Lemmatization, token by token
    return ' '.join(lemmatizer.lemmatize(word) for word in text.split())


# Read from page_data.data rather than the global `x`: the experiment cells
# rebind `x` to a TF-IDF matrix, which made this cell fail when re-run after
# any of them.
documents = [clean_document(raw_bytes, stemmer) for raw_bytes in page_data.data]

##### Classifiers

In [18]:
# Registry of the estimators under comparison: display name -> scikit-learn
# estimator class (the class itself, not an instance).
classifiers = dict([
    ('Random Forest', RandomForestClassifier),
    ('SGD', SGDClassifier),
    ('K Neighbors', KNeighborsClassifier),
    ('Gaussian Process', GaussianProcessClassifier),
])

### Experiment 1. Modify single parameters with the default configuration of each classifier
Classifiers being tested are:
- Random Forest
- SGD
- K Neighbors
- Gaussian Process

#### 1.1 Modify just the parameter max_features

In [23]:
# Values of TfidfVectorizer's max_features to compare.
features = [10,100,200,300,500,800,1000,1200,2000,5000]

# The stop-word list is identical for every run, so build it once.
english_stop_words = stopwords.words('english')

# Vectorise once per max_features value. The TF-IDF matrix does not depend on
# the classifier, so all classifiers share it instead of recomputing it.
# The matrices get their own name so the raw documents held in `x` are not
# overwritten by this cell.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so test documents influence the vocabulary and IDF weights.
tfidf_by_value = {}
for num in features:
    # Convert text to numbers and find TFIDF
    tfidfconverter = TfidfVectorizer(max_features=num, stop_words=english_stop_words)
    tfidf_by_value[num] = tfidfconverter.fit_transform(documents).toarray()

for classifier_name, classifier_cls in classifiers.items():

    print ('----------------------------------')
    print ('Classifier: ' + classifier_name)
    # One default-configured estimator per classifier; it is re-fitted for
    # every parameter value below.
    # NOTE(review): no random_state is passed, so estimators that use
    # randomness can give different scores from run to run.
    classifier = classifier_cls()

    #TODO: Determine the number of repetitions per configuration
    for i in range(1):

        for num in features:
            # Select training and testing sets
            # TODO: Should the division be random or the same number to make it reproducible?
            x_train, x_test, y_train, y_test = train_test_split(
                tfidf_by_value[num], y, test_size=0.3, random_state=0)

            # Train the classification model
            classifier.fit(x_train, y_train)
            y_pred = classifier.predict(x_test)

            # Evaluate the model
            print('max_features ' + str(num))
            print(confusion_matrix(y_test,y_pred))
            # zero_division=0 reports the same 0.00 for a class that is never
            # predicted, without emitting an UndefinedMetricWarning each time.
            print(classification_report(y_test,y_pred, zero_division=0))
            print(accuracy_score(y_test, y_pred))

----------------------------------
Classifier: Random Forest
max_features 10
[[24  5]
 [10 12]]
              precision    recall  f1-score   support

           0       0.71      0.83      0.76        29
           1       0.71      0.55      0.62        22

    accuracy                           0.71        51
   macro avg       0.71      0.69      0.69        51
weighted avg       0.71      0.71      0.70        51

0.7058823529411765
max_features 100
[[29  0]
 [11 11]]
              precision    recall  f1-score   support

           0       0.72      1.00      0.84        29
           1       1.00      0.50      0.67        22

    accuracy                           0.78        51
   macro avg       0.86      0.75      0.75        51
weighted avg       0.84      0.78      0.77        51

0.7843137254901961
max_features 200
[[29  0]
 [11 11]]
              precision    recall  f1-score   support

           0       0.72      1.00      0.84        29
           1       1.00      0.

max_features 100
[[26  3]
 [ 6 16]]
              precision    recall  f1-score   support

           0       0.81      0.90      0.85        29
           1       0.84      0.73      0.78        22

    accuracy                           0.82        51
   macro avg       0.83      0.81      0.82        51
weighted avg       0.83      0.82      0.82        51

0.8235294117647058
max_features 200
[[24  5]
 [ 4 18]]
              precision    recall  f1-score   support

           0       0.86      0.83      0.84        29
           1       0.78      0.82      0.80        22

    accuracy                           0.82        51
   macro avg       0.82      0.82      0.82        51
weighted avg       0.82      0.82      0.82        51

0.8235294117647058
max_features 300
[[26  3]
 [ 4 18]]
              precision    recall  f1-score   support

           0       0.87      0.90      0.88        29
           1       0.86      0.82      0.84        22

    accuracy                        

#### 1.2 Modify just the parameter min_df
min_df is used for removing terms that appear too infrequently.

##### 1.2.1  Number of documents
- min_df = 5 means "ignore terms that appear in fewer than 5 documents".
- The default min_df is 1, which means "ignore terms that appear in fewer than 1 document". Thus, the default setting does not ignore any terms.

In [31]:
# Values of TfidfVectorizer's min_df to compare, given as absolute document counts.
df_min = [1,10,20,30,40,50,60,70,80,90,100,110,120,130,140]

# The stop-word list is identical for every run, so build it once.
english_stop_words = stopwords.words('english')

# Vectorise once per min_df value. The TF-IDF matrix does not depend on the
# classifier, so all classifiers share it instead of recomputing it.
# The matrices get their own name so the raw documents held in `x` are not
# overwritten by this cell.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so test documents influence the vocabulary and IDF weights.
tfidf_by_value = {}
for num in df_min:
    # Convert text to numbers and find TFIDF
    tfidfconverter = TfidfVectorizer(min_df=num, stop_words=english_stop_words)
    tfidf_by_value[num] = tfidfconverter.fit_transform(documents).toarray()

for classifier_name, classifier_cls in classifiers.items():

    print ('----------------------------------')
    print ('Classifier: ' + classifier_name)
    # One default-configured estimator per classifier; it is re-fitted for
    # every parameter value below.
    # NOTE(review): no random_state is passed, so estimators that use
    # randomness can give different scores from run to run.
    classifier = classifier_cls()

    #TODO: Determine the number of repetitions per configuration
    for i in range(1):

        for num in df_min:
            # Select training and testing sets
            # TODO: Should the division be random or the same number to make it reproducible?
            x_train, x_test, y_train, y_test = train_test_split(
                tfidf_by_value[num], y, test_size=0.3, random_state=0)

            # Train the classification model
            classifier.fit(x_train, y_train)
            y_pred = classifier.predict(x_test)

            # Evaluate the model
            print('min_df ' + str(num))
            print(confusion_matrix(y_test,y_pred))
            # zero_division=0 reports the same 0.00 for a class that is never
            # predicted, without emitting an UndefinedMetricWarning each time.
            print(classification_report(y_test,y_pred, zero_division=0))
            print(accuracy_score(y_test, y_pred))

----------------------------------
Classifier: Random Forest
min_df 1
[[29  0]
 [14  8]]
              precision    recall  f1-score   support

           0       0.67      1.00      0.81        29
           1       1.00      0.36      0.53        22

    accuracy                           0.73        51
   macro avg       0.84      0.68      0.67        51
weighted avg       0.81      0.73      0.69        51

0.7254901960784313
min_df 10
[[28  1]
 [ 8 14]]
              precision    recall  f1-score   support

           0       0.78      0.97      0.86        29
           1       0.93      0.64      0.76        22

    accuracy                           0.82        51
   macro avg       0.86      0.80      0.81        51
weighted avg       0.84      0.82      0.82        51

0.8235294117647058
min_df 20
[[28  1]
 [ 9 13]]
              precision    recall  f1-score   support

           0       0.76      0.97      0.85        29
           1       0.93      0.59      0.72        2

min_df 70
[[25  4]
 [ 7 15]]
              precision    recall  f1-score   support

           0       0.78      0.86      0.82        29
           1       0.79      0.68      0.73        22

    accuracy                           0.78        51
   macro avg       0.79      0.77      0.78        51
weighted avg       0.78      0.78      0.78        51

0.7843137254901961
min_df 80
[[26  3]
 [10 12]]
              precision    recall  f1-score   support

           0       0.72      0.90      0.80        29
           1       0.80      0.55      0.65        22

    accuracy                           0.75        51
   macro avg       0.76      0.72      0.72        51
weighted avg       0.76      0.75      0.73        51

0.7450980392156863
min_df 90
[[25  4]
 [ 8 14]]
              precision    recall  f1-score   support

           0       0.76      0.86      0.81        29
           1       0.78      0.64      0.70        22

    accuracy                           0.76        51
   

min_df 140
[[23  6]
 [ 7 15]]
              precision    recall  f1-score   support

           0       0.77      0.79      0.78        29
           1       0.71      0.68      0.70        22

    accuracy                           0.75        51
   macro avg       0.74      0.74      0.74        51
weighted avg       0.74      0.75      0.74        51

0.7450980392156863
----------------------------------
Classifier: Gaussian Process
min_df 1
[[29  0]
 [16  6]]
              precision    recall  f1-score   support

           0       0.64      1.00      0.78        29
           1       1.00      0.27      0.43        22

    accuracy                           0.69        51
   macro avg       0.82      0.64      0.61        51
weighted avg       0.80      0.69      0.63        51

0.6862745098039216
min_df 10
[[29  0]
 [13  9]]
              precision    recall  f1-score   support

           0       0.69      1.00      0.82        29
           1       1.00      0.41      0.58     

##### 1.2.2 Percentage of documents
- min_df = 0.01 means "ignore terms that appear in less than 1% of the documents".

In [28]:
# Values of TfidfVectorizer's min_df to compare, given as a fraction of the documents.
df_min = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

# The stop-word list is identical for every run, so build it once.
english_stop_words = stopwords.words('english')

# Vectorise once per min_df value. The TF-IDF matrix does not depend on the
# classifier, so all classifiers share it instead of recomputing it.
# The matrices get their own name so the raw documents held in `x` are not
# overwritten by this cell.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so test documents influence the vocabulary and IDF weights.
tfidf_by_value = {}
for num in df_min:
    # Convert text to numbers and find TFIDF
    tfidfconverter = TfidfVectorizer(min_df=num, stop_words=english_stop_words)
    tfidf_by_value[num] = tfidfconverter.fit_transform(documents).toarray()

for classifier_name, classifier_cls in classifiers.items():

    print ('----------------------------------')
    print ('Classifier: ' + classifier_name)
    # One default-configured estimator per classifier; it is re-fitted for
    # every parameter value below.
    # NOTE(review): no random_state is passed, so estimators that use
    # randomness can give different scores from run to run.
    classifier = classifier_cls()

    #TODO: Determine the number of repetitions per configuration
    for i in range(1):

        for num in df_min:
            # Select training and testing sets
            # TODO: Should the division be random or the same number to make it reproducible?
            x_train, x_test, y_train, y_test = train_test_split(
                tfidf_by_value[num], y, test_size=0.3, random_state=0)

            # Train the classification model
            classifier.fit(x_train, y_train)
            y_pred = classifier.predict(x_test)

            # Evaluate the model
            print('min_df ' + str(num))
            print(confusion_matrix(y_test,y_pred))
            # zero_division=0 reports the same 0.00 for a class that is never
            # predicted, without emitting an UndefinedMetricWarning each time.
            print(classification_report(y_test,y_pred, zero_division=0))
            print(accuracy_score(y_test, y_pred))

----------------------------------
Classifier: Random Forest
min_df 0.1
[[29  0]
 [13  9]]
              precision    recall  f1-score   support

           0       0.69      1.00      0.82        29
           1       1.00      0.41      0.58        22

    accuracy                           0.75        51
   macro avg       0.85      0.70      0.70        51
weighted avg       0.82      0.75      0.71        51

0.7450980392156863
min_df 0.2
[[28  1]
 [13  9]]
              precision    recall  f1-score   support

           0       0.68      0.97      0.80        29
           1       0.90      0.41      0.56        22

    accuracy                           0.73        51
   macro avg       0.79      0.69      0.68        51
weighted avg       0.78      0.73      0.70        51

0.7254901960784313
min_df 0.3
[[29  0]
 [10 12]]
              precision    recall  f1-score   support

           0       0.74      1.00      0.85        29
           1       1.00      0.55      0.71     

min_df 0.5
[[25  4]
 [ 8 14]]
              precision    recall  f1-score   support

           0       0.76      0.86      0.81        29
           1       0.78      0.64      0.70        22

    accuracy                           0.76        51
   macro avg       0.77      0.75      0.75        51
weighted avg       0.77      0.76      0.76        51

0.7647058823529411
min_df 0.6
[[26  3]
 [ 8 14]]
              precision    recall  f1-score   support

           0       0.76      0.90      0.83        29
           1       0.82      0.64      0.72        22

    accuracy                           0.78        51
   macro avg       0.79      0.77      0.77        51
weighted avg       0.79      0.78      0.78        51

0.7843137254901961
min_df 0.7
[[26  3]
 [ 8 14]]
              precision    recall  f1-score   support

           0       0.76      0.90      0.83        29
           1       0.82      0.64      0.72        22

    accuracy                           0.78        51


#### 1.3 Modify just the parameter max_df
max_df is used for removing terms that appear too frequently, also known as "corpus-specific stop words".

##### 1.3.1  Number of documents
- max_df = 25 means "ignore terms that appear in more than 25 documents".

In [30]:
# Values of TfidfVectorizer's max_df to compare, given as absolute document counts.
df_max = [1,10,20,30,40,50,60,70,80,90,100,110,120,130,140]

# The stop-word list is identical for every run, so build it once.
english_stop_words = stopwords.words('english')

# Vectorise once per max_df value. The TF-IDF matrix does not depend on the
# classifier, so all classifiers share it instead of recomputing it.
# The matrices get their own name so the raw documents held in `x` are not
# overwritten by this cell.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so test documents influence the vocabulary and IDF weights.
tfidf_by_value = {}
for num in df_max:
    # Convert text to numbers and find TFIDF
    tfidfconverter = TfidfVectorizer(max_df=num, stop_words=english_stop_words)
    tfidf_by_value[num] = tfidfconverter.fit_transform(documents).toarray()

for classifier_name, classifier_cls in classifiers.items():

    print ('----------------------------------')
    print ('Classifier: ' + classifier_name)
    # One default-configured estimator per classifier; it is re-fitted for
    # every parameter value below.
    # NOTE(review): no random_state is passed, so estimators that use
    # randomness can give different scores from run to run.
    classifier = classifier_cls()

    #TODO: Determine the number of repetitions per configuration
    for i in range(1):

        for num in df_max:
            # Select training and testing sets
            # TODO: Should the division be random or the same number to make it reproducible?
            x_train, x_test, y_train, y_test = train_test_split(
                tfidf_by_value[num], y, test_size=0.3, random_state=0)

            # Train the classification model
            classifier.fit(x_train, y_train)
            y_pred = classifier.predict(x_test)

            # Evaluate the model
            print('max_df ' + str(num))
            print(confusion_matrix(y_test,y_pred))
            # With very small max_df some classifiers never predict one of the
            # classes. zero_division=0 reports the same 0.00 for that class
            # without emitting an UndefinedMetricWarning on every report.
            print(classification_report(y_test,y_pred, zero_division=0))
            print(accuracy_score(y_test, y_pred))

----------------------------------
Classifier: Random Forest
max_df 1
[[ 0 29]
 [ 0 22]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        29
           1       0.43      1.00      0.60        22

    accuracy                           0.43        51
   macro avg       0.22      0.50      0.30        51
weighted avg       0.19      0.43      0.26        51

0.43137254901960786


  _warn_prf(average, modifier, msg_start, len(result))


max_df 10
[[ 8 21]
 [ 0 22]]
              precision    recall  f1-score   support

           0       1.00      0.28      0.43        29
           1       0.51      1.00      0.68        22

    accuracy                           0.59        51
   macro avg       0.76      0.64      0.55        51
weighted avg       0.79      0.59      0.54        51

0.5882352941176471
max_df 20
[[29  0]
 [15  7]]
              precision    recall  f1-score   support

           0       0.66      1.00      0.79        29
           1       1.00      0.32      0.48        22

    accuracy                           0.71        51
   macro avg       0.83      0.66      0.64        51
weighted avg       0.81      0.71      0.66        51

0.7058823529411765
max_df 30
[[29  0]
 [15  7]]
              precision    recall  f1-score   support

           0       0.66      1.00      0.79        29
           1       1.00      0.32      0.48        22

    accuracy                           0.71        51
   

  _warn_prf(average, modifier, msg_start, len(result))


max_df 10
[[10 19]
 [ 0 22]]
              precision    recall  f1-score   support

           0       1.00      0.34      0.51        29
           1       0.54      1.00      0.70        22

    accuracy                           0.63        51
   macro avg       0.77      0.67      0.61        51
weighted avg       0.80      0.63      0.59        51

0.6274509803921569
max_df 20
[[29  0]
 [13  9]]
              precision    recall  f1-score   support

           0       0.69      1.00      0.82        29
           1       1.00      0.41      0.58        22

    accuracy                           0.75        51
   macro avg       0.85      0.70      0.70        51
weighted avg       0.82      0.75      0.71        51

0.7450980392156863
max_df 30
[[29  0]
 [10 12]]
              precision    recall  f1-score   support

           0       0.74      1.00      0.85        29
           1       1.00      0.55      0.71        22

    accuracy                           0.80        51
   

  _warn_prf(average, modifier, msg_start, len(result))


max_df 10
[[ 0 29]
 [ 0 22]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        29
           1       0.43      1.00      0.60        22

    accuracy                           0.43        51
   macro avg       0.22      0.50      0.30        51
weighted avg       0.19      0.43      0.26        51

0.43137254901960786


  _warn_prf(average, modifier, msg_start, len(result))


max_df 20
[[28  1]
 [ 8 14]]
              precision    recall  f1-score   support

           0       0.78      0.97      0.86        29
           1       0.93      0.64      0.76        22

    accuracy                           0.82        51
   macro avg       0.86      0.80      0.81        51
weighted avg       0.84      0.82      0.82        51

0.8235294117647058
max_df 30
[[27  2]
 [ 8 14]]
              precision    recall  f1-score   support

           0       0.77      0.93      0.84        29
           1       0.88      0.64      0.74        22

    accuracy                           0.80        51
   macro avg       0.82      0.78      0.79        51
weighted avg       0.82      0.80      0.80        51

0.803921568627451
max_df 40
[[27  2]
 [ 7 15]]
              precision    recall  f1-score   support

           0       0.79      0.93      0.86        29
           1       0.88      0.68      0.77        22

    accuracy                           0.82        51
   m

  _warn_prf(average, modifier, msg_start, len(result))


max_df 10
[[29  0]
 [18  4]]
              precision    recall  f1-score   support

           0       0.62      1.00      0.76        29
           1       1.00      0.18      0.31        22

    accuracy                           0.65        51
   macro avg       0.81      0.59      0.54        51
weighted avg       0.78      0.65      0.57        51

0.6470588235294118
max_df 20
[[29  0]
 [16  6]]
              precision    recall  f1-score   support

           0       0.64      1.00      0.78        29
           1       1.00      0.27      0.43        22

    accuracy                           0.69        51
   macro avg       0.82      0.64      0.61        51
weighted avg       0.80      0.69      0.63        51

0.6862745098039216
max_df 30
[[29  0]
 [16  6]]
              precision    recall  f1-score   support

           0       0.64      1.00      0.78        29
           1       1.00      0.27      0.43        22

    accuracy                           0.69        51
   

##### 1.3.2 Percentage of documents
- max_df = 0.50 means "ignore terms that appear in more than 50% of the documents".
- The default max_df is 1.0, which means "ignore terms that appear in more than 100% of the documents". Thus, the default setting does not ignore any terms.

In [32]:
# Values of TfidfVectorizer's max_df to compare, given as a fraction of the documents.
df_max = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

# The stop-word list is identical for every run, so build it once.
english_stop_words = stopwords.words('english')

# Vectorise once per max_df value. The TF-IDF matrix does not depend on the
# classifier, so all classifiers share it instead of recomputing it.
# The matrices get their own name so the raw documents held in `x` are not
# overwritten by this cell.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so test documents influence the vocabulary and IDF weights.
tfidf_by_value = {}
for num in df_max:
    # Convert text to numbers and find TFIDF
    tfidfconverter = TfidfVectorizer(max_df=num, stop_words=english_stop_words)
    tfidf_by_value[num] = tfidfconverter.fit_transform(documents).toarray()

for classifier_name, classifier_cls in classifiers.items():

    print ('----------------------------------')
    print ('Classifier: ' + classifier_name)
    # One default-configured estimator per classifier; it is re-fitted for
    # every parameter value below.
    # NOTE(review): no random_state is passed, so estimators that use
    # randomness can give different scores from run to run.
    classifier = classifier_cls()

    #TODO: Determine the number of repetitions per configuration
    for i in range(1):

        for num in df_max:
            # Select training and testing sets
            # TODO: Should the division be random or the same number to make it reproducible?
            x_train, x_test, y_train, y_test = train_test_split(
                tfidf_by_value[num], y, test_size=0.3, random_state=0)

            # Train the classification model
            classifier.fit(x_train, y_train)
            y_pred = classifier.predict(x_test)

            # Evaluate the model
            print('max_df ' + str(num))
            print(confusion_matrix(y_test,y_pred))
            # zero_division=0 reports the same 0.00 for a class that is never
            # predicted, without emitting an UndefinedMetricWarning each time.
            print(classification_report(y_test,y_pred, zero_division=0))
            print(accuracy_score(y_test, y_pred))

----------------------------------
Classifier: Random Forest
max_df 0.1
[[29  0]
 [15  7]]
              precision    recall  f1-score   support

           0       0.66      1.00      0.79        29
           1       1.00      0.32      0.48        22

    accuracy                           0.71        51
   macro avg       0.83      0.66      0.64        51
weighted avg       0.81      0.71      0.66        51

0.7058823529411765
max_df 0.2
[[29  0]
 [15  7]]
              precision    recall  f1-score   support

           0       0.66      1.00      0.79        29
           1       1.00      0.32      0.48        22

    accuracy                           0.71        51
   macro avg       0.83      0.66      0.64        51
weighted avg       0.81      0.71      0.66        51

0.7058823529411765
max_df 0.3
[[29  0]
 [15  7]]
              precision    recall  f1-score   support

           0       0.66      1.00      0.79        29
           1       1.00      0.32      0.48     

max_df 0.5
[[27  2]
 [ 4 18]]
              precision    recall  f1-score   support

           0       0.87      0.93      0.90        29
           1       0.90      0.82      0.86        22

    accuracy                           0.88        51
   macro avg       0.89      0.87      0.88        51
weighted avg       0.88      0.88      0.88        51

0.8823529411764706
max_df 0.6
[[28  1]
 [ 4 18]]
              precision    recall  f1-score   support

           0       0.88      0.97      0.92        29
           1       0.95      0.82      0.88        22

    accuracy                           0.90        51
   macro avg       0.91      0.89      0.90        51
weighted avg       0.91      0.90      0.90        51

0.9019607843137255
max_df 0.7
[[28  1]
 [ 5 17]]
              precision    recall  f1-score   support

           0       0.85      0.97      0.90        29
           1       0.94      0.77      0.85        22

    accuracy                           0.88        51


### Experiment 2. Combine multiple parameters with the default configuration of each classifier