## Privacy Policies Classification
### Author: Valentina Chacon Buitrago

### Imports 

In [1]:
import os
from os import listdir
from os.path import abspath, isfile, join
import numpy as np
import re
from sklearn.datasets import load_files
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import pickle
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Valentina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/Valentina/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Dataset configuration

In [None]:
def build_dataset():
    types = ['legit', 'rogue']
    for i in types:
        
        current_dir = abspath(join('privacy_policies', i))
        
        for f in listdir(current_dir):
            file = join(current_dir,f)
            if isfile(file) and f.endswith('.txt'):
                open_file = open(file, 'r', encoding='windows-1252')
                text_data = open_file.read()
                
build_dataset()

##### Import Dataset

In [2]:
page_data = load_files(os.path.abspath('privacy_policies'))
x,y = page_data.data, page_data.target

##### Text Preprocessing

In [3]:
stemmer = WordNetLemmatizer()
documents = []

for i in range(len(x)):
    
    # Remove all break line characters
    document = str(x[i], 'mac_roman')
    document = document.replace('\r', ' ').replace('\n', ' ')

    # Remove all the special characters
    document = re.sub(r'\W', ' ', document)
    
    # Remove all single characters 
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    
    # Remove all numbers
    document = re.sub(r'\d+', '', document)
   
    # Substitute multiple spaces with a single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    
    # Convert document to lowercase
    document = document.lower()
    
    # Lemmatization
    document = document.split()

    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    
    documents.append(document)

### Experiment 1. Bag of words - Random Forest Classifier

#### 1.1 Modify just the parameter n_estimators

In [4]:
# Convert text to numbers and find TFIDF
tfidfconverter = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
x = tfidfconverter.fit_transform(documents).toarray()

# Select training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

estimators = [10,100,200,300,500,800,1000,1200,2000,5000]
for i in range(len(estimators)):
# Train the classification model
    classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    # Evaluate the model 
    print('n_estimators ' + str(estimators[i]))
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print(accuracy_score(y_test, y_pred))

n_estimators 10
[[29  0]
 [12 10]]
              precision    recall  f1-score   support

           0       0.71      1.00      0.83        29
           1       1.00      0.45      0.62        22

    accuracy                           0.76        51
   macro avg       0.85      0.73      0.73        51
weighted avg       0.83      0.76      0.74        51

0.7647058823529411
n_estimators 100
[[29  0]
 [12 10]]
              precision    recall  f1-score   support

           0       0.71      1.00      0.83        29
           1       1.00      0.45      0.62        22

    accuracy                           0.76        51
   macro avg       0.85      0.73      0.73        51
weighted avg       0.83      0.76      0.74        51

0.7647058823529411
n_estimators 200
[[29  0]
 [12 10]]
              precision    recall  f1-score   support

           0       0.71      1.00      0.83        29
           1       1.00      0.45      0.62        22

    accuracy                         

#### 1.2 Modify just the parameter max_features

In [5]:
features = [10,100,200,300,500,800,1000,1200,2000,5000]

for i in range(len(features)):
    # Convert text to numbers and find TFIDF
    tfidfconverter = TfidfVectorizer(max_features=features[i], min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
    x = tfidfconverter.fit_transform(documents).toarray()

    # Select training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

    # Train the classification model
    classifier = RandomForestClassifier(n_estimators=100, random_state=0)
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    # Evaluate the model 
    print('max_features ' + str(features[i]))
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print(accuracy_score(y_test, y_pred))

max_features 10
[[24  5]
 [10 12]]
              precision    recall  f1-score   support

           0       0.71      0.83      0.76        29
           1       0.71      0.55      0.62        22

    accuracy                           0.71        51
   macro avg       0.71      0.69      0.69        51
weighted avg       0.71      0.71      0.70        51

0.7058823529411765
max_features 100
[[29  0]
 [10 12]]
              precision    recall  f1-score   support

           0       0.74      1.00      0.85        29
           1       1.00      0.55      0.71        22

    accuracy                           0.80        51
   macro avg       0.87      0.77      0.78        51
weighted avg       0.85      0.80      0.79        51

0.803921568627451
max_features 200
[[28  1]
 [12 10]]
              precision    recall  f1-score   support

           0       0.70      0.97      0.81        29
           1       0.91      0.45      0.61        22

    accuracy                          

#### 1.3 Modify just the parameter min_df
min_df is used for removing terms that appear too infrequently.
- min_df = 5 means "ignore terms that appear in less than 5 documents".
- The default min_df is 1, which means "ignore terms that appear in less than 1 document". Thus, the default setting does not ignore any terms.

In [6]:
df_min = [1,10,20,30,40,50,80,90]

for i in range(len(df_min)):
    # Convert text to numbers and find TFIDF
    tfidfconverter = TfidfVectorizer(max_features=300, min_df=df_min[i], max_df=0.7, stop_words=stopwords.words('english'))
    x = tfidfconverter.fit_transform(documents).toarray()

    # Select training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

    # Train the classification model
    classifier = RandomForestClassifier(n_estimators=100, random_state=0)
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    # Evaluate the model 
    print('min_df ' + str(df_min[i]))
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print(accuracy_score(y_test, y_pred))

min_df 1
[[29  0]
 [10 12]]
              precision    recall  f1-score   support

           0       0.74      1.00      0.85        29
           1       1.00      0.55      0.71        22

    accuracy                           0.80        51
   macro avg       0.87      0.77      0.78        51
weighted avg       0.85      0.80      0.79        51

0.803921568627451
min_df 10
[[28  1]
 [13  9]]
              precision    recall  f1-score   support

           0       0.68      0.97      0.80        29
           1       0.90      0.41      0.56        22

    accuracy                           0.73        51
   macro avg       0.79      0.69      0.68        51
weighted avg       0.78      0.73      0.70        51

0.7254901960784313
min_df 20
[[29  0]
 [11 11]]
              precision    recall  f1-score   support

           0       0.72      1.00      0.84        29
           1       1.00      0.50      0.67        22

    accuracy                           0.78        51
   ma

In [7]:
df_min = [0.1,0.2,0.3,0.4,0.5,0.6]

for i in range(len(df_min)):
    # Convert text to numbers and find TFIDF
    tfidfconverter = TfidfVectorizer(max_features=300, min_df=df_min[i], max_df=0.7, stop_words=stopwords.words('english'))
    x = tfidfconverter.fit_transform(documents).toarray()

    # Select training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

    # Train the classification model
    classifier = RandomForestClassifier(n_estimators=100, random_state=0)
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    # Evaluate the model 
    print('min_df ' + str(df_min[i]))
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print(accuracy_score(y_test, y_pred))

min_df 0.1
[[29  0]
 [11 11]]
              precision    recall  f1-score   support

           0       0.72      1.00      0.84        29
           1       1.00      0.50      0.67        22

    accuracy                           0.78        51
   macro avg       0.86      0.75      0.75        51
weighted avg       0.84      0.78      0.77        51

0.7843137254901961
min_df 0.2
[[29  0]
 [11 11]]
              precision    recall  f1-score   support

           0       0.72      1.00      0.84        29
           1       1.00      0.50      0.67        22

    accuracy                           0.78        51
   macro avg       0.86      0.75      0.75        51
weighted avg       0.84      0.78      0.77        51

0.7843137254901961
min_df 0.3
[[29  0]
 [11 11]]
              precision    recall  f1-score   support

           0       0.72      1.00      0.84        29
           1       1.00      0.50      0.67        22

    accuracy                           0.78        51


#### 1.4 Modify just the parameter max_df
max_df is used for removing terms that appear too frequently, also known as "corpus-specific stop words.
- max_df = 25 means "ignore terms that appear in more than 25 documents".

In [8]:
df_max = [1,10,20,30,40,50,80,90]

for i in range(len(df_max)):
    # Convert text to numbers and find TFIDF
    tfidfconverter = TfidfVectorizer(max_features=300, min_df=1, max_df=df_max[i], stop_words=stopwords.words('english'))
    x = tfidfconverter.fit_transform(documents).toarray()

    # Select training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

    # Train the classification model
    classifier = RandomForestClassifier(n_estimators=100, random_state=0)
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    # Evaluate the model 
    print('max_df ' + str(df_max[i]))
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print(accuracy_score(y_test, y_pred))

max_df 1
[[ 0 29]
 [ 0 22]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        29
           1       0.43      1.00      0.60        22

    accuracy                           0.43        51
   macro avg       0.22      0.50      0.30        51
weighted avg       0.19      0.43      0.26        51

0.43137254901960786


  _warn_prf(average, modifier, msg_start, len(result))


max_df 10
[[12 17]
 [ 1 21]]
              precision    recall  f1-score   support

           0       0.92      0.41      0.57        29
           1       0.55      0.95      0.70        22

    accuracy                           0.65        51
   macro avg       0.74      0.68      0.64        51
weighted avg       0.76      0.65      0.63        51

0.6470588235294118
max_df 20
[[29  0]
 [10 12]]
              precision    recall  f1-score   support

           0       0.74      1.00      0.85        29
           1       1.00      0.55      0.71        22

    accuracy                           0.80        51
   macro avg       0.87      0.77      0.78        51
weighted avg       0.85      0.80      0.79        51

0.803921568627451
max_df 30
[[29  0]
 [10 12]]
              precision    recall  f1-score   support

           0       0.74      1.00      0.85        29
           1       1.00      0.55      0.71        22

    accuracy                           0.80        51
   m

- max_df = 0.50 means "ignore terms that appear in more than 50% of the documents".
- The default max_df is 1.0, which means "ignore terms that appear in more than 100% of the documents". Thus, the default setting does not ignore any terms.

In [9]:
df_max = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

for i in range(len(df_max)):
    # Convert text to numbers and find TFIDF
    tfidfconverter = TfidfVectorizer(max_features=300, min_df=0.1, max_df=df_max[i], stop_words=stopwords.words('english'))
    x = tfidfconverter.fit_transform(documents).toarray()

    # Select training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

    # Train the classification model
    classifier = RandomForestClassifier(n_estimators=100, random_state=0)
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    # Evaluate the model 
    print('max_df ' + str(df_max[i]))
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print(accuracy_score(y_test, y_pred))

max_df 0.1
[[20  9]
 [ 8 14]]
              precision    recall  f1-score   support

           0       0.71      0.69      0.70        29
           1       0.61      0.64      0.62        22

    accuracy                           0.67        51
   macro avg       0.66      0.66      0.66        51
weighted avg       0.67      0.67      0.67        51

0.6666666666666666
max_df 0.2
[[26  3]
 [12 10]]
              precision    recall  f1-score   support

           0       0.68      0.90      0.78        29
           1       0.77      0.45      0.57        22

    accuracy                           0.71        51
   macro avg       0.73      0.68      0.67        51
weighted avg       0.72      0.71      0.69        51

0.7058823529411765
max_df 0.3
[[27  2]
 [ 9 13]]
              precision    recall  f1-score   support

           0       0.75      0.93      0.83        29
           1       0.87      0.59      0.70        22

    accuracy                           0.78        51


### Experiment 2. Bag of words - SGD Classifier

#### 2.1 Modify just the parameter loss

In [10]:
# Convert text to numbers and find TFIDF
tfidfconverter = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
x = tfidfconverter.fit_transform(documents).toarray()

# Select training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

loss = ['hinge','log','modified_huber','squared_hinge','perceptron','squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']
for i in range(len(loss)):
# Train the classification model
    classifier = SGDClassifier(loss=loss[i])
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    # Evaluate the model 
    print('loss ' + str(loss[i]))
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print(accuracy_score(y_test, y_pred))

loss hinge
[[28  1]
 [ 5 17]]
              precision    recall  f1-score   support

           0       0.85      0.97      0.90        29
           1       0.94      0.77      0.85        22

    accuracy                           0.88        51
   macro avg       0.90      0.87      0.88        51
weighted avg       0.89      0.88      0.88        51

0.8823529411764706
loss log
[[28  1]
 [ 7 15]]
              precision    recall  f1-score   support

           0       0.80      0.97      0.88        29
           1       0.94      0.68      0.79        22

    accuracy                           0.84        51
   macro avg       0.87      0.82      0.83        51
weighted avg       0.86      0.84      0.84        51

0.8431372549019608
loss modified_huber
[[29  0]
 [12 10]]
              precision    recall  f1-score   support

           0       0.71      1.00      0.83        29
           1       1.00      0.45      0.62        22

    accuracy                           0.76    

#### 2.2 Modify just the parameter max_features

In [11]:
features = [10,100,200,300,500,800,1000,1200,2000,5000]

for i in range(len(features)):
    # Convert text to numbers and find TFIDF
    tfidfconverter = TfidfVectorizer(max_features=features[i], min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
    x = tfidfconverter.fit_transform(documents).toarray()

    # Select training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

    # Train the classification model, by default classifier has parameter loss='hinge'
    classifier = SGDClassifier()
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    # Evaluate the model 
    print('max_features ' + str(features[i]))
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print(accuracy_score(y_test, y_pred))

max_features 10
[[12 17]
 [ 2 20]]
              precision    recall  f1-score   support

           0       0.86      0.41      0.56        29
           1       0.54      0.91      0.68        22

    accuracy                           0.63        51
   macro avg       0.70      0.66      0.62        51
weighted avg       0.72      0.63      0.61        51

0.6274509803921569
max_features 100
[[25  4]
 [ 9 13]]
              precision    recall  f1-score   support

           0       0.74      0.86      0.79        29
           1       0.76      0.59      0.67        22

    accuracy                           0.75        51
   macro avg       0.75      0.73      0.73        51
weighted avg       0.75      0.75      0.74        51

0.7450980392156863
max_features 200
[[21  8]
 [ 4 18]]
              precision    recall  f1-score   support

           0       0.84      0.72      0.78        29
           1       0.69      0.82      0.75        22

    accuracy                         

#### 2.3 Modify just the parameter min_df
min_df is used for removing terms that appear too infrequently.
- min_df = 5 means "ignore terms that appear in less than 5 documents".
- The default min_df is 1, which means "ignore terms that appear in less than 1 document". Thus, the default setting does not ignore any terms.

In [12]:
df_min = [1,10,20,30,40,50,80,90]

for i in range(len(df_min)):
    # Convert text to numbers and find TFIDF
    tfidfconverter = TfidfVectorizer(max_features=300, min_df=df_min[i], max_df=0.7, stop_words=stopwords.words('english'))
    x = tfidfconverter.fit_transform(documents).toarray()

    # Select training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

    # Train the classification model
    classifier = SGDClassifier()
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    # Evaluate the model 
    print('min_df ' + str(df_min[i]))
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print(accuracy_score(y_test, y_pred))

min_df 1
[[25  4]
 [ 5 17]]
              precision    recall  f1-score   support

           0       0.83      0.86      0.85        29
           1       0.81      0.77      0.79        22

    accuracy                           0.82        51
   macro avg       0.82      0.82      0.82        51
weighted avg       0.82      0.82      0.82        51

0.8235294117647058
min_df 10
[[25  4]
 [ 5 17]]
              precision    recall  f1-score   support

           0       0.83      0.86      0.85        29
           1       0.81      0.77      0.79        22

    accuracy                           0.82        51
   macro avg       0.82      0.82      0.82        51
weighted avg       0.82      0.82      0.82        51

0.8235294117647058
min_df 20
[[27  2]
 [ 6 16]]
              precision    recall  f1-score   support

           0       0.82      0.93      0.87        29
           1       0.89      0.73      0.80        22

    accuracy                           0.84        51
   m

In [13]:
df_min = [0.1,0.2,0.3,0.4,0.5,0.6]

for i in range(len(df_min)):
    # Convert text to numbers and find TFIDF
    tfidfconverter = TfidfVectorizer(max_features=300, min_df=df_min[i], max_df=0.7, stop_words=stopwords.words('english'))
    x = tfidfconverter.fit_transform(documents).toarray()

    # Select training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

    # Train the classification model
    classifier = SGDClassifier()
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    # Evaluate the model 
    print('min_df ' + str(df_min[i]))
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print(accuracy_score(y_test, y_pred))

min_df 0.1
[[16 13]
 [ 1 21]]
              precision    recall  f1-score   support

           0       0.94      0.55      0.70        29
           1       0.62      0.95      0.75        22

    accuracy                           0.73        51
   macro avg       0.78      0.75      0.72        51
weighted avg       0.80      0.73      0.72        51

0.7254901960784313
min_df 0.2
[[23  6]
 [ 3 19]]
              precision    recall  f1-score   support

           0       0.88      0.79      0.84        29
           1       0.76      0.86      0.81        22

    accuracy                           0.82        51
   macro avg       0.82      0.83      0.82        51
weighted avg       0.83      0.82      0.82        51

0.8235294117647058
min_df 0.3
[[23  6]
 [ 5 17]]
              precision    recall  f1-score   support

           0       0.82      0.79      0.81        29
           1       0.74      0.77      0.76        22

    accuracy                           0.78        51


#### 2.4 Modify just the parameter max_df
max_df is used for removing terms that appear too frequently, also known as "corpus-specific stop words.
- max_df = 25 means "ignore terms that appear in more than 25 documents".

In [14]:
df_max = [1,10,20,30,40,50,80,90]

for i in range(len(df_max)):
    # Convert text to numbers and find TFIDF
    tfidfconverter = TfidfVectorizer(max_features=300, min_df=1, max_df=df_max[i], stop_words=stopwords.words('english'))
    x = tfidfconverter.fit_transform(documents).toarray()

    # Select training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

    # Train the classification model
    classifier = SGDClassifier()
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    # Evaluate the model 
    print('max_df ' + str(df_max[i]))
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print(accuracy_score(y_test, y_pred))

max_df 1
[[ 0 29]
 [ 0 22]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        29
           1       0.43      1.00      0.60        22

    accuracy                           0.43        51
   macro avg       0.22      0.50      0.30        51
weighted avg       0.19      0.43      0.26        51

0.43137254901960786


  _warn_prf(average, modifier, msg_start, len(result))


max_df 10
[[13 16]
 [ 1 21]]
              precision    recall  f1-score   support

           0       0.93      0.45      0.60        29
           1       0.57      0.95      0.71        22

    accuracy                           0.67        51
   macro avg       0.75      0.70      0.66        51
weighted avg       0.77      0.67      0.65        51

0.6666666666666666
max_df 20
[[11 18]
 [ 0 22]]
              precision    recall  f1-score   support

           0       1.00      0.38      0.55        29
           1       0.55      1.00      0.71        22

    accuracy                           0.65        51
   macro avg       0.78      0.69      0.63        51
weighted avg       0.81      0.65      0.62        51

0.6470588235294118
max_df 30
[[25  4]
 [ 8 14]]
              precision    recall  f1-score   support

           0       0.76      0.86      0.81        29
           1       0.78      0.64      0.70        22

    accuracy                           0.76        51
   

In [16]:
df_max = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

for i in range(len(df_max)):
    # Convert text to numbers and find TFIDF
    tfidfconverter = TfidfVectorizer(max_features=300, min_df=0.1, max_df=df_max[i], stop_words=stopwords.words('english'))
    x = tfidfconverter.fit_transform(documents).toarray()

    # Select training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

    # Train the classification model
    classifier = SGDClassifier()
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    # Evaluate the model 
    print('max_df ' + str(df_max[i]))
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print(accuracy_score(y_test, y_pred))

max_df 0.1
[[18 11]
 [ 4 18]]
              precision    recall  f1-score   support

           0       0.82      0.62      0.71        29
           1       0.62      0.82      0.71        22

    accuracy                           0.71        51
   macro avg       0.72      0.72      0.71        51
weighted avg       0.73      0.71      0.71        51

0.7058823529411765
max_df 0.2
[[22  7]
 [ 6 16]]
              precision    recall  f1-score   support

           0       0.79      0.76      0.77        29
           1       0.70      0.73      0.71        22

    accuracy                           0.75        51
   macro avg       0.74      0.74      0.74        51
weighted avg       0.75      0.75      0.75        51

0.7450980392156863
max_df 0.3
[[24  5]
 [ 5 17]]
              precision    recall  f1-score   support

           0       0.83      0.83      0.83        29
           1       0.77      0.77      0.77        22

    accuracy                           0.80        51


### Experiment 3. Bag of words - K Neighbors Classifier

#### 3.1 Modify just the parameter n_neighbors

In [20]:
# Convert text to numbers and find TFIDF
tfidfconverter = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
x = tfidfconverter.fit_transform(documents).toarray()

# Select training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

neighbors = [1,3,5,8,10,20,50,80,100]
for i in range(len(neighbors)):
# Train the classification model
    classifier = KNeighborsClassifier(n_neighbors=neighbors[i])
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    # Evaluate the model 
    print('neighbors ' + str(neighbors[i]))
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print(accuracy_score(y_test, y_pred))

neighbors 1
[[21  8]
 [ 5 17]]
              precision    recall  f1-score   support

           0       0.81      0.72      0.76        29
           1       0.68      0.77      0.72        22

    accuracy                           0.75        51
   macro avg       0.74      0.75      0.74        51
weighted avg       0.75      0.75      0.75        51

0.7450980392156863
neighbors 3
[[28  1]
 [14  8]]
              precision    recall  f1-score   support

           0       0.67      0.97      0.79        29
           1       0.89      0.36      0.52        22

    accuracy                           0.71        51
   macro avg       0.78      0.66      0.65        51
weighted avg       0.76      0.71      0.67        51

0.7058823529411765
neighbors 5
[[28  1]
 [11 11]]
              precision    recall  f1-score   support

           0       0.72      0.97      0.82        29
           1       0.92      0.50      0.65        22

    accuracy                           0.76        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### 3.2 Modify just the parameter max_features

In [21]:
features = [10,100,200,300,500,800,1000,1200,2000,5000]

for i in range(len(features)):
    # Convert text to numbers and find TFIDF
    tfidfconverter = TfidfVectorizer(max_features=features[i], min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
    x = tfidfconverter.fit_transform(documents).toarray()

    # Select training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

    # Train the classification model, by default classifier has parameter loss='hinge'
    classifier = KNeighborsClassifier(n_neighbors=8)
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    # Evaluate the model 
    print('max_features ' + str(features[i]))
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print(accuracy_score(y_test, y_pred))

max_features 10
[[25  4]
 [ 8 14]]
              precision    recall  f1-score   support

           0       0.76      0.86      0.81        29
           1       0.78      0.64      0.70        22

    accuracy                           0.76        51
   macro avg       0.77      0.75      0.75        51
weighted avg       0.77      0.76      0.76        51

0.7647058823529411
max_features 100
[[28  1]
 [10 12]]
              precision    recall  f1-score   support

           0       0.74      0.97      0.84        29
           1       0.92      0.55      0.69        22

    accuracy                           0.78        51
   macro avg       0.83      0.76      0.76        51
weighted avg       0.82      0.78      0.77        51

0.7843137254901961
max_features 200
[[29  0]
 [10 12]]
              precision    recall  f1-score   support

           0       0.74      1.00      0.85        29
           1       1.00      0.55      0.71        22

    accuracy                         

#### 3.3 Modify just the parameter min_df
min_df is used for removing terms that appear too infrequently.
- min_df = 5 means "ignore terms that appear in less than 5 documents".
- The default min_df is 1, which means "ignore terms that appear in less than 1 document". Thus, the default setting does not ignore any terms.

In [22]:
df_min = [1,10,20,30,40,50,80,90]

for i in range(len(df_min)):
    # Convert text to numbers and find TFIDF
    tfidfconverter = TfidfVectorizer(max_features=300, min_df=df_min[i], max_df=0.7, stop_words=stopwords.words('english'))
    x = tfidfconverter.fit_transform(documents).toarray()

    # Select training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

    # Train the classification model
    classifier = KNeighborsClassifier(n_neighbors=8)
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    # Evaluate the model 
    print('min_df ' + str(df_min[i]))
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print(accuracy_score(y_test, y_pred))

min_df 1
[[28  1]
 [ 8 14]]
              precision    recall  f1-score   support

           0       0.78      0.97      0.86        29
           1       0.93      0.64      0.76        22

    accuracy                           0.82        51
   macro avg       0.86      0.80      0.81        51
weighted avg       0.84      0.82      0.82        51

0.8235294117647058
min_df 10
[[29  0]
 [10 12]]
              precision    recall  f1-score   support

           0       0.74      1.00      0.85        29
           1       1.00      0.55      0.71        22

    accuracy                           0.80        51
   macro avg       0.87      0.77      0.78        51
weighted avg       0.85      0.80      0.79        51

0.803921568627451
min_df 20
[[29  0]
 [11 11]]
              precision    recall  f1-score   support

           0       0.72      1.00      0.84        29
           1       1.00      0.50      0.67        22

    accuracy                           0.78        51
   ma

In [23]:
df_min = [0.1,0.2,0.3,0.4,0.5,0.6]

for i in range(len(df_min)):
    # Convert text to numbers and find TFIDF
    tfidfconverter = TfidfVectorizer(max_features=300, min_df=df_min[i], max_df=0.7, stop_words=stopwords.words('english'))
    x = tfidfconverter.fit_transform(documents).toarray()

    # Select training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

    # Train the classification model
    classifier = KNeighborsClassifier(n_neighbors=8)
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    # Evaluate the model 
    print('min_df ' + str(df_min[i]))
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print(accuracy_score(y_test, y_pred))

min_df 0.1
[[29  0]
 [11 11]]
              precision    recall  f1-score   support

           0       0.72      1.00      0.84        29
           1       1.00      0.50      0.67        22

    accuracy                           0.78        51
   macro avg       0.86      0.75      0.75        51
weighted avg       0.84      0.78      0.77        51

0.7843137254901961
min_df 0.2
[[29  0]
 [11 11]]
              precision    recall  f1-score   support

           0       0.72      1.00      0.84        29
           1       1.00      0.50      0.67        22

    accuracy                           0.78        51
   macro avg       0.86      0.75      0.75        51
weighted avg       0.84      0.78      0.77        51

0.7843137254901961
min_df 0.3
[[29  0]
 [11 11]]
              precision    recall  f1-score   support

           0       0.72      1.00      0.84        29
           1       1.00      0.50      0.67        22

    accuracy                           0.78        51


#### 3.4 Modify just the parameter max_df
max_df is used for removing terms that appear too frequently, also known as "corpus-specific stop words.
- max_df = 25 means "ignore terms that appear in more than 25 documents".

In [24]:
df_max = [1,10,20,30,40,50,80,90]

for i in range(len(df_max)):
    # Convert text to numbers and find TFIDF
    tfidfconverter = TfidfVectorizer(max_features=300, min_df=1, max_df=df_max[i], stop_words=stopwords.words('english'))
    x = tfidfconverter.fit_transform(documents).toarray()

    # Select training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

    # Train the classification model
    classifier = KNeighborsClassifier(n_neighbors=8)
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    # Evaluate the model 
    print('max_df ' + str(df_max[i]))
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print(accuracy_score(y_test, y_pred))

max_df 1
[[ 0 29]
 [ 0 22]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        29
           1       0.43      1.00      0.60        22

    accuracy                           0.43        51
   macro avg       0.22      0.50      0.30        51
weighted avg       0.19      0.43      0.26        51

0.43137254901960786


  _warn_prf(average, modifier, msg_start, len(result))


max_df 10
[[ 0 29]
 [ 0 22]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        29
           1       0.43      1.00      0.60        22

    accuracy                           0.43        51
   macro avg       0.22      0.50      0.30        51
weighted avg       0.19      0.43      0.26        51

0.43137254901960786


  _warn_prf(average, modifier, msg_start, len(result))


max_df 20
[[29  0]
 [13  9]]
              precision    recall  f1-score   support

           0       0.69      1.00      0.82        29
           1       1.00      0.41      0.58        22

    accuracy                           0.75        51
   macro avg       0.85      0.70      0.70        51
weighted avg       0.82      0.75      0.71        51

0.7450980392156863
max_df 30
[[27  2]
 [12 10]]
              precision    recall  f1-score   support

           0       0.69      0.93      0.79        29
           1       0.83      0.45      0.59        22

    accuracy                           0.73        51
   macro avg       0.76      0.69      0.69        51
weighted avg       0.75      0.73      0.71        51

0.7254901960784313
max_df 40
[[29  0]
 [10 12]]
              precision    recall  f1-score   support

           0       0.74      1.00      0.85        29
           1       1.00      0.55      0.71        22

    accuracy                           0.80        51
   

In [25]:
df_max = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

for i in range(len(df_max)):
    # Convert text to numbers and find TFIDF
    tfidfconverter = TfidfVectorizer(max_features=300, min_df=0.1, max_df=df_max[i], stop_words=stopwords.words('english'))
    x = tfidfconverter.fit_transform(documents).toarray()

    # Select training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

    # Train the classification model
    classifier = SGDClassifier()
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    # Evaluate the model 
    print('max_df ' + str(df_max[i]))
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print(accuracy_score(y_test, y_pred))

max_df 0.1
[[19 10]
 [ 4 18]]
              precision    recall  f1-score   support

           0       0.83      0.66      0.73        29
           1       0.64      0.82      0.72        22

    accuracy                           0.73        51
   macro avg       0.73      0.74      0.73        51
weighted avg       0.75      0.73      0.73        51

0.7254901960784313
max_df 0.2
[[20  9]
 [ 4 18]]
              precision    recall  f1-score   support

           0       0.83      0.69      0.75        29
           1       0.67      0.82      0.73        22

    accuracy                           0.75        51
   macro avg       0.75      0.75      0.74        51
weighted avg       0.76      0.75      0.75        51

0.7450980392156863
max_df 0.3
[[24  5]
 [ 5 17]]
              precision    recall  f1-score   support

           0       0.83      0.83      0.83        29
           1       0.77      0.77      0.77        22

    accuracy                           0.80        51
