Import libraries and download necessary packages + load dataset for training model

In [2]:
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Download necessary NLTK datasets

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

# Load the data
df = pd.read_excel('privacy2.xlsx')

[nltk_data] Downloading package punkt to /Users/macos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/macos/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/macos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/macos/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Preprocessing dataset

In [3]:
# Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_policy(policy):
    policy = policy.lower()
    policy = re.sub('[%s]' % re.escape(string.punctuation), '', policy)
    policy = re.sub('\w*\d\w*', '', policy)
    tokens = word_tokenize(policy)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return tokens

df['tokens'] = df['text'].apply(preprocess_policy)

Feature Extraction

In [4]:
# Load Google News Word2Vec Model

w2v_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# Create Vector Representations for Policies
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    if len(tokens_list)<1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

def get_word2vec_embeddings(vectors, data, generate_missing=False):
    embeddings = data['tokens'].apply(lambda x: get_average_word2vec(x, vectors, generate_missing=generate_missing))
    return list(embeddings)

embeddings = get_word2vec_embeddings(w2v_model, df)

print(embeddings, df['type'])

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Training model to classify - split train,test dataset

In [5]:
# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(embeddings, df['type'], test_size=0.2, random_state=40)


Random Forest

In [6]:
# Train Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, max_depth=30, min_samples_leaf=2, min_samples_split= 10, random_state=40)
# {'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 50}
clf.fit(X_train, y_train)

# Evaluate the Model
predictions = clf.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))



Accuracy:  0.9481481481481482

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.90      0.94       115
           1       0.93      0.98      0.96       155

    accuracy                           0.95       270
   macro avg       0.95      0.94      0.95       270
weighted avg       0.95      0.95      0.95       270



SVM


In [7]:
# SVM Classifier
svm_clf = SVC(kernel='poly',gamma=1,  probability=True, C=10)
svm_clf.fit(X_train, y_train)
#'C': 10, 'gamma': 1, 'kernel': 'rbf'


svm_predictions = svm_clf.predict(X_test)
print("SVM Accuracy: ", accuracy_score(y_test, svm_predictions))
print("\nSVM Classification Report:\n", classification_report(y_test, svm_predictions))

SVM Accuracy:  0.9333333333333333

SVM Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.92      0.92       115
           1       0.94      0.94      0.94       155

    accuracy                           0.93       270
   macro avg       0.93      0.93      0.93       270
weighted avg       0.93      0.93      0.93       270



Logistic Regression


In [8]:
# Logistic Regression Classifier
logistic_clf = LogisticRegression(solver='saga', max_iter=10000, C=  10, penalty='l2') 
logistic_clf.fit(X_train, y_train)

logistic_predictions = logistic_clf.predict(X_test)
print("Logistic Regression Accuracy: ", accuracy_score(y_test, logistic_predictions))
print("\nLogistic Regression Classification Report:\n", classification_report(y_test, logistic_predictions))


Logistic Regression Accuracy:  0.9185185185185185

Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.86      0.90       115
           1       0.90      0.96      0.93       155

    accuracy                           0.92       270
   macro avg       0.92      0.91      0.92       270
weighted avg       0.92      0.92      0.92       270



CNN


In [9]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv1D, MaxPooling1D, Embedding, GlobalMaxPooling1D

# Reshape data for CNN
X_train_cnn = np.array(X_train).reshape(len(X_train), 300, 1)
X_test_cnn = np.array(X_test).reshape(len(X_test), 300, 1)

# Build the CNN model
cnn_model = Sequential()

cnn_model.add(Conv1D(128, 5, activation='relu', input_shape=(300, 1)))
cnn_model.add(MaxPooling1D(5))
cnn_model.add(Conv1D(128, 5, activation='relu'))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(128, activation='relu'))
cnn_model.add(Dropout(0.5))
cnn_model.add(Dense(1, activation='sigmoid')) # Assuming binary classification

cnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
cnn_model.fit(X_train_cnn, y_train, validation_data=(X_test_cnn, y_test), epochs=5, batch_size=32)

# Evaluate the CNN model
cnn_predictions = (cnn_model.predict(X_test_cnn) > 0.5).astype("int32").flatten()
print("CNN Accuracy: ", accuracy_score(y_test, cnn_predictions))
print("\nCNN Classification Report:\n", classification_report(y_test, cnn_predictions))

Epoch 1/5


2023-10-21 15:14:33.021739: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CNN Accuracy:  0.8666666666666667

CNN Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.76      0.83       115
           1       0.84      0.95      0.89       155

    accuracy                           0.87       270
   macro avg       0.88      0.85      0.86       270
weighted avg       0.87      0.87      0.86       270



Cross validation: 1st time

In [10]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from keras.models import clone_model

def train_and_evaluate_cnn(X_train, y_train, X_val, y_val):
    model = clone_model(cnn_model)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=5, batch_size=32, verbose=0)
    _, accuracy = model.evaluate(X_val, y_val, verbose=0)
    return accuracy

# Simulate k-fold cross-validation using 5 splits
cnn_accuracies = []
for _ in range(5):
    X_train_cnn, X_val_cnn, y_train_cnn, y_val_cnn = train_test_split(np.array(embeddings).reshape(len(embeddings), 300, 1), df['type'], test_size=0.2)
    acc = train_and_evaluate_cnn(X_train_cnn, y_train_cnn, X_val_cnn, y_val_cnn)
    cnn_accuracies.append(acc)

rf_scores = cross_val_score(RandomForestClassifier(n_estimators=100, max_depth=30, min_samples_leaf=2, min_samples_split= 10, random_state=40), embeddings, df['type'], cv=5, scoring="accuracy")
print(f"Random Forest Cross-Validation Accuracy: {rf_scores.mean():.2f} (+/- {rf_scores.std() * 2:.2f})\n")

print(f"CNN Cross-Validation Accuracy: {np.mean(cnn_accuracies):.2f} (+/- {np.std(cnn_accuracies) * 2:.2f})")


svm_scores = cross_val_score(SVC(kernel='linear', probability=True), embeddings, df['type'], cv=5, scoring="accuracy")
print(f"SVM Cross-Validation Accuracy: {svm_scores.mean():.2f} (+/- {svm_scores.std() * 2:.2f})\n")

# For Logistic Regression
logistic_scores = cross_val_score(LogisticRegression(max_iter=10000), embeddings, df['type'], cv=5, scoring="accuracy")
print(f"Logistic Regression Cross-Validation Accuracy: {logistic_scores.mean():.2f} (+/- {logistic_scores.std() * 2:.2f})\n")

Random Forest Cross-Validation Accuracy: 0.88 (+/- 0.24)

CNN Cross-Validation Accuracy: 0.83 (+/- 0.06)
SVM Cross-Validation Accuracy: 0.85 (+/- 0.27)

Logistic Regression Cross-Validation Accuracy: 0.85 (+/- 0.26)



Fine-tuning - Random Forest

In [11]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
rf = RandomForestClassifier()
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
                           cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

best_rf = grid_search.best_estimator_
rf_predictions = best_rf.predict(X_test)
print("Random Forest Accuracy after Grid Search: ", accuracy_score(y_test, rf_predictions))


Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best Parameters: {'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Random Forest Accuracy after Grid Search:  0.9555555555555556


Fine-tuning  - SVM

In [12]:

# Define a grid of hyperparameters
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.01, 0.1, 1, 10],
    'kernel': ['rbf', 'linear', 'poly', 'sigmoid']  # You can also add 'linear', 'poly', 'sigmoid' to test other kernels
}

# Use grid search with cross-validation
grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best estimator
best_svm = grid_search.best_estimator_

# Check the best hyperparameters
print(grid_search.best_params_)

# Predict and check accuracy
svm_train_predictions = best_svm.predict(X_train)
svm_train_accuracy = accuracy_score(y_train, svm_train_predictions)
print(f"Train Accuracy after Fine-tuning: {svm_train_accuracy}")

Fitting 5 folds for each of 64 candidates, totalling 320 fits
{'C': 10, 'gamma': 1, 'kernel': 'rbf'}
Train Accuracy after Fine-tuning: 0.9888682745825603


Fine-tuning - Logistic


Main Event - Classification

In [13]:
document_content = """
The use and collection of information by third-party advertising service providers are governed by the relevant third party's Privacy Notice and are not covered by our Privacy Notice. If you would like more information about the information collection practices of a particular service provider, or if you would like more information on how to opt out of a service provider's information collection practices, go to www.aboutads.info. You can opt out of the use of cookies by DoubleClick, one of our service providers, by visiting http://www.google.com/intl/en/policies/privacy. Additionally, many of our advertising service providers are members of the Network Advertising Initiative ('NAI'). You can obtain more information about these advertising service providers' information collection practices, and opt out of such practices (and at the same time opt out of the collection practices of other, or all, NAI members) by following the opt out instructions on the NAI's website at http://www.networkadvertising.org/managing/opt_out.asp.If you would like more information on how to opt out of information collection practices, go to www.aboutads.info.
Information You Post to Blogs, Discussion Forums and Other Community Posting or Social Networking Areas.Please keep in mind that whenever you voluntarily make your personal information or other private information available for viewing by third parties online - for example on blogs, discussion forums, or other community posting or social networking areas of our Site - that information can be seen, collected and used by others besides us. We cannot be responsible for any unauthorized third-party use of such information.
Sharing Within the TEN Corporate Family. We may share your personal information with any parent company, subsidiary, or affiliate in the TEN corporate family. All entities within the TEN corporate family may have access to your personal information as well as anonymous, aggregate demographic data.
Sharing with Affiliates, Partners and Third Party Service Providers. We may provide your information to our affiliates or to third parties, including our third party service providers and contractors, for purposes related to Site administration and other services. For example, if you use a credit or debit card to complete a transaction on our Site, we may share your personal information and credit card number with a credit card processing and/or a fulfillment company in order to complete your transaction, or such service provider(s) may collect that information from you directly, on our behalf.
If you do not wish certain information to be shared for these purposes, send a letter to the Online Privacy Coordinator whose contact information is listed at the end of this Privacy Notice.
We also make some content, products and services available through our Site through cooperative relationships with third-party providers, where the brands of our provider partner appear on the Site in connection with such content, products and/or services. We may share with our provider partner any information you provide, or that is collected, in the course of visiting any pages that are made available in cooperation with our provider partner. In some cases, the provider partner may collect information from you directly, in which cases the Privacy Notice of our provider partner may apply to the provider partner's use of your information. The Privacy Notice of our provider partners may differ from ours. If you have any questions regarding the Privacy Notice of one of our provider partners, you should contact the provider partner directly for more information.
Sharing Information with Third Parties for Advertising and Other Promotional Purposes. We may use third-party service providers to target and serve some of the advertisements you see on the Site. We may share technical or aggregate information, such as type of pages viewed and categories of interest, from our Site with these service providers and advertisers for their use in displaying ads on our Site. These third party providers and advertisers may use their own browser or flash cookies, web beacons and similar technologies to collect technical information (such as device unique identifier, IP address, MAC address, browser type, pages visited, and location information) from users of the Site that is generated automatically as a user views or interacts with an ad. These service providers may use that information, sometimes in conjunction with similar information gathered through other websites and other sources, to deliver advertisements on this Site, and on other websites.
"""
problematic = []


def classify_policy(policy_text):
    # Preprocess the policy
    tokens = preprocess_policy(policy_text)
    embedding = get_average_word2vec(tokens, w2v_model)
    
    # Reshape for CNN
    # cnn_input = np.array(embedding).reshape(1, 300, 1)
    # rf_prediction = clf.predict([embedding])[0]
    # Predict using SVM
    svm_prediction = svm_clf.predict([embedding])[0]

    # Predict using Logistic Regression
    logistic_prediction = logistic_clf.predict([embedding])[0]

    # Predict using CNN
    # cnn_prediction = (cnn_model.predict(cnn_input) > 0.5).astype("int32").flatten()[0]
    
    return logistic_prediction
    # return {
    #     # 'SVM': svm_prediction,
    #     # 'Logistic Regression': logistic_prediction,
    #     "PROBLEMATIC: " + policy_text if cnn_prediction == 0 else policy_text,
    #     'CNN': cnn_prediction
    # }


problematic = []
def split_into_paragraphs(document_content):
    # Split the document by double line breaks, which typically separate paragraphs
    policies = [p for p in document_content.split('\n') if p]
    return policies

paragraphs = split_into_paragraphs(document_content)
for i, paragraph in enumerate(paragraphs, 1):
    print(paragraph)
    print("-----------")
    
for paragraph in paragraphs:
    predictions = classify_policy(paragraph)
    print(predictions)
    if predictions == 0:
        problematic.append(paragraph)

The use and collection of information by third-party advertising service providers are governed by the relevant third party's Privacy Notice and are not covered by our Privacy Notice. If you would like more information about the information collection practices of a particular service provider, or if you would like more information on how to opt out of a service provider's information collection practices, go to www.aboutads.info. You can opt out of the use of cookies by DoubleClick, one of our service providers, by visiting http://www.google.com/intl/en/policies/privacy. Additionally, many of our advertising service providers are members of the Network Advertising Initiative ('NAI'). You can obtain more information about these advertising service providers' information collection practices, and opt out of such practices (and at the same time opt out of the collection practices of other, or all, NAI members) by following the opt out instructions on the NAI's website at http://www.netwo

Main Event - Highlight

In [14]:
from IPython.core.display import display, HTML

def highlight_problematic_sentences(term, problematic_phrases):
    if len(problematic_phrases) >= 1:
        for phrase in problematic_phrases:
            highlighted_phrase = f'<span style="background-color: #ff0000">{phrase}</span>'
            term = term.replace(phrase, highlighted_phrase)
        display(HTML(term))

highlight_problematic_sentences(document_content, problematic)

  from IPython.core.display import display, HTML


Main Event - Summarization

In [15]:
from transformers import pipeline
bart = pipeline("summarization", model="facebook/bart-large-cnn")
articles = problematic
summ = []

for article in articles:
  result = bart(article)
  summ.append(result)

summ

  from .autonotebook import tqdm as notebook_tqdm
Your max_length is set to 142, but you input_length is only 56. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=28)


[[{'summary_text': 'All entities within the TEN corporate family may have access to your personal information as well as anonymous, aggregate demographic data. We may share your personal info with any parent company, subsidiary, or affiliate in the Tten corporate family. We are not responsible for the content of any emails you send or receive.'}],
 [{'summary_text': 'We may use third-party service providers to target and serve some of the advertisements you see on the Site. We may share technical or aggregate information, such as type of pages viewed and categories of interest, from our Site with these service providers and advertisers for their use in displaying ads on our Site.'}]]

In [16]:
import sys
print(sys.version)

3.10.9 (main, Mar  1 2023, 12:20:14) [Clang 14.0.6 ]
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   1.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.8s
[CV] END max_depth=None, 

[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   1.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   1.6s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.7s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=10; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=10; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimato

[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   1.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   1.6s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   1.5s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=   1.5s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   0.7s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   1.5s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=2, n_est