In [1]:
import pandas as pd
import torch
import matplotlib.pyplot as plt
from math import log, log10
from torch.utils.data import Dataset, DataLoader
import data_processing as dp
import pickle
from privacy_policies_dataset import PrivacyPoliciesDataset
from os.path import isfile, join
from os import listdir
import numpy as np
from cnn import CNN, train_cnn

In [2]:
dictionary = dp.get_tokens("raw_data", read = True)

Loading from file dictionary.pkl


In [3]:
word2vector, word2idx_glove = dp.get_glove_dicts("glove.6B", 100, read = True)

Loading from files word2vector.pkl and word2idx.pkl


In [4]:
len(word2idx_glove)

400001

In [5]:
def get_absent_words(dictionary, word2vector):
    """Collect the words of ``dictionary`` that cannot be looked up in ``word2vector``.

    A word counts as absent exactly when ``word2vector[word]`` raises
    ``KeyError``.

    Args:
        dictionary: iterable of words to check.
        word2vector: object indexed by word.

    Returns:
        list: the absent words, in the iteration order of ``dictionary``.
    """
    def _lookup_fails(word):
        # Perform the real subscript lookup (rather than an ``in`` test) so
        # that the outcome depends only on whether KeyError is raised.
        try:
            word2vector[word]
        except KeyError:
            return True
        return False

    return [word for word in dictionary if _lookup_fails(word)]

In [6]:
weights_matrix, word2idx = dp.get_weight_matrix(dictionary, word2vector, 100, read = True)

Loading from file weights_matrix.pkl


In [7]:
# Load the labels object from disk (it is indexed by label name later in the
# notebook). The context manager guarantees the file handle is closed even if
# unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code; only load labels.pkl
# from a trusted source.
with open("labels.pkl", "rb") as labels_file:

    labels = pickle.load(labels_file)

In [8]:
# NOTE(review): unlike the surrounding cells, this one is called with read = False;
# the printed output ("Processing dataset ...") suggests the aggregated data is
# rebuilt on every run -- confirm in data_processing whether that is intended.
dp.aggregate_data(read = False)

Processing dataset ...


In [9]:
sentence_matrices_train, labels_matrices_train = dp.process_dataset("train", labels, word2idx, read = True)

Loading from processed_data/


In [10]:
sentence_matrices_test, labels_matrices_test = dp.process_dataset("test", labels, word2idx, read = True)

Loading from processed_data/


In [11]:
# Build the training split (read = True; the printed output shows it loading from processed_data/).
train_dataset = PrivacyPoliciesDataset("train" ,"raw_data", word2idx, labels, read = True)

# Bring all segments to a common length (the output reports zero-filling to target size 425).
train_dataset.resize_segments()

# NOTE(review): presumably adds the singleton dimension seen in segments_list.size() == [2851, 1, 425] -- confirm.
train_dataset.expand_dimensions()

# Group the samples into one tensor (output: "Grouping samples into one Tensor").
train_dataset.group_samples()

Loading from processed_data/
Resizing segments (filling with zeros). Target size: 425
Grouping samples into one Tensor


In [12]:
train_dataset.segments_list.size()

torch.Size([2851, 1, 425])

In [13]:
train_dataset.labels_list.size()

torch.Size([2851, 9])

In [14]:
# Build the test split (read = True; the printed output shows it loading from processed_data/).
test_dataset = PrivacyPoliciesDataset("test" ,"raw_data", word2idx, labels, read = True)

# Bring all segments to a common length (the output reports zero-filling to target size 387).
# NOTE(review): this target size differs from the training split (425) -- confirm the model accepts both lengths.
test_dataset.resize_segments()

# NOTE(review): presumably adds the singleton dimension seen in segments_list.size() == [875, 1, 387] -- confirm.
test_dataset.expand_dimensions()

# Group the samples into one tensor (output: "Grouping samples into one Tensor").
test_dataset.group_samples()


Loading from processed_data/
Resizing segments (filling with zeros). Target size: 387
Grouping samples into one Tensor


In [15]:
test_dataset.segments_list.size()

torch.Size([875, 1, 387])

In [16]:
test_dataset.labels_list.size()

torch.Size([875, 9])

In [17]:
# Batches of 100 training samples.
# NOTE(review): shuffle is not passed, so DataLoader keeps its default (no shuffling)
# and batches arrive in dataset order every epoch -- confirm this is intended for training.
train_dataloader = DataLoader(train_dataset, batch_size = 100)

In [24]:
# NOTE(review): the positional arguments are opaque from this notebook. 100 equals the value
# passed to dp.get_glove_dicts / dp.get_weight_matrix and 9 equals the second dimension of
# labels_list; the meaning of 6800, 12, 22 and [3,5,7] must be confirmed against cnn.CNN.
model_all = CNN(6800, 100, 12, 22, 9, [3,5,7])

In [25]:
model_all.load_embeddings(weights_matrix)

In [26]:
model_all.embedding.weight

Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0425,  0.3873, -0.6664,  ..., -1.4347, -0.2803,  0.5186],
        [ 0.0515, -0.1108, -0.1030,  ...,  1.1112,  0.0970,  0.5142],
        ...,
        [-0.1834,  0.4438, -0.0218,  ...,  0.0879, -0.1629,  1.1347],
        [-0.3543,  0.2249, -0.2998,  ..., -0.3188,  0.5824, -0.6919],
        [ 0.3140, -0.4960, -0.3450,  ...,  0.3828,  0.3620,  0.4923]])

In [27]:
epochs, losses = train_cnn(model_all, train_dataloader, epochs_num = 2, lr = 0.05)

last epoch finished: 1 -- progress: 100% -- time: 0.383099438015 mins
Training completed. Total training time: 0.3 mins


In [None]:
# Plot the values returned by train_cnn; axis labels are set so the figure
# can be read on its own when the notebook is skimmed.
plt.plot(epochs, losses)

plt.xlabel("epoch")

plt.ylabel("loss")

plt.title("loss vs epoch")

plt.show()

In [None]:
torch.save(model_all.state_dict(),"model_all.pt")

In [None]:
model_all.save_cnn_params()

In [None]:
# Ground-truth label tensors for both splits.
y_train = train_dataset.labels_list

y_test = test_dataset.labels_list

# Inference only: switch the network to evaluation mode and disable autograd,
# so the forward passes over the full datasets do not record a computation
# graph and the predictions do not carry gradient history.
model_all.eval()

with torch.no_grad():

    y_hat_train = model_all(train_dataset.segments_list)

    y_hat_test = model_all(test_dataset.segments_list)

In [None]:
y_train.shape

In [None]:
y_test.shape

In [None]:
# Decision thresholds passed to f1_score: 9 values, matching the 9 columns of the label tensors.
# Assumes the predictions are (N, 9) so that torch.ge applies one threshold per column -- TODO confirm.
# NOTE(review): how these values were chosen is not shown in this notebook.
thresholds = torch.tensor([0.08, 0.005, 0.33, 0.45, 0.5, 0.24, 0.02, 0.64, 0.01])

In [None]:
def f1_score(y_true, y_pred, threshold, dim = 0, eps = 1e-9):
    """Return the mean F1, mean precision and mean recall of thresholded predictions.

    ``y_pred`` is binarised with ``y_pred >= threshold``. True positives,
    predicted positives and actual positives are summed along ``dim``; the
    resulting precision, recall and F1 values are then averaged.

    Args:
        y_true: tensor of ground-truth indicators (cast to float).
        y_pred: tensor of scores compared against ``threshold``.
        threshold: cut-off handed to the ``>=`` comparison (a scalar or a
            tensor that broadcasts against ``y_pred``).
        dim: dimension that the counts are summed over.
        eps: small constant added to every denominator to avoid division by zero.

    Returns:
        tuple: ``(f1, precision, recall)`` as Python floats.
    """
    targets = y_true.float()
    decisions = y_pred.float().ge(threshold).float()

    hits = torch.sum(decisions * targets, dim = dim)
    predicted_count = torch.sum(decisions, dim = dim)
    actual_count = torch.sum(targets, dim = dim)

    precision = hits / (predicted_count + eps)
    recall = hits / (actual_count + eps)

    harmonic = (precision * recall) / (precision + recall + eps) * 2

    return harmonic.mean().item(), precision.mean().item(), recall.mean().item()

In [None]:
f1_score(y_test, y_hat_test, thresholds)

In [None]:
def f1_score_per_label(y_true, y_pred, threshold, dim=0, eps=1e-9):
    """Return F1, precision and recall tensors without averaging them.

    ``y_pred`` is binarised with ``y_pred >= threshold`` and the counts are
    summed along ``dim``, so each returned tensor holds one value per
    remaining index.

    Args:
        y_true: tensor of ground-truth indicators (cast to float).
        y_pred: tensor of scores compared against ``threshold``.
        threshold: cut-off handed to ``torch.ge``.
        dim: dimension that the counts are summed over.
        eps: small constant added to every denominator to avoid division by zero.

    Returns:
        tuple: ``(f1, precision, recall)`` tensors.
    """
    def _count(values):
        # Number of positives along the reduced dimension.
        return values.sum(dim=dim)

    truth = y_true.float()
    binarised = torch.ge(y_pred.float(), threshold).to(torch.float32)

    true_positive = _count(binarised * truth)

    precision = true_positive / (_count(binarised) + eps)
    recall = true_positive / (_count(truth) + eps)

    f1 = 2 * ((precision * recall) / (precision + recall + eps))

    return f1, precision, recall

In [None]:
def compute_accuracy(Y, Y_hat, test = 'ALO', **kwargs):
    """Compute a per-sample accuracy for multi-label predictions.

    Two criteria are supported, selected by ``test``:

    * ``'ALO'`` (at least one): a sample counts as correct when at least one
      label is both set in its row of ``Y`` and scored strictly above
      ``threshold`` in its row of ``Y_hat``.
    * ``'MPL'`` (most probable label): a sample counts as correct when the
      highest-scoring label of ``Y_hat`` is among the labels holding the
      maximum value in ``Y``. If several predicted labels tie for the highest
      score, the first one is used.

    Args:
        Y: ground-truth rows, one per sample; must expose ``.shape`` and yield
            rows whose elements expose ``.item()``.
        Y_hat: predicted scores, aligned row-by-row with ``Y``.
        test: ``'ALO'`` or ``'MPL'``.
        **kwargs: ``threshold`` (default 0.5), the cut-off used by ``'ALO'``.

    Returns:
        float: fraction of samples counted as correct. Returns 0.0 when there
        are no samples, and 0.0 (after printing a message) when ``test`` is
        not a recognised name.
    """

    def at_least_one(y, y_hat, threshold = 0.5):
        # A label is predicted when its score is strictly above the threshold.
        predicted = y_hat > threshold

        return any(y_i.item() and p_i.item() for y_i, p_i in zip(y, predicted))

    def most_probable_label(y, y_hat):
        # Compute each maximum once instead of once per element.
        y_max = max(y)
        true_top = [i for i, a in enumerate(y) if a == y_max]

        y_hat_max = max(y_hat)
        tied_top = [j for j, b in enumerate(y_hat) if b == y_hat_max]

        # Unpacking ``[j_] = tied_top`` would raise ValueError whenever two
        # scores tie for the maximum; take the first index instead.
        return tied_top[0] in true_top

    num_samples = float(Y.shape[0])

    if test == 'ALO':

        # Read the optional keyword explicitly rather than catching every
        # exception, so genuine errors in the comparison are not hidden.
        threshold = kwargs.get('threshold', 0.5)

        positives = sum(at_least_one(y, y_hat, threshold) for y, y_hat in zip(Y, Y_hat))

    elif test == 'MPL':

        positives = sum(most_probable_label(y, y_hat) for y, y_hat in zip(Y, Y_hat))

    else:

        print("not a valid test name ...")

        positives = 0

    # Guard against division by zero when no samples are given.
    if num_samples == 0:

        return 0.0

    return positives / num_samples

In [None]:
# Sweep one global threshold over [0, 1) in steps of 0.01 and plot the mean F1
# returned by f1_score on the test split. Labels and a title are set so the
# figure can be read on its own.
threshold_list = np.arange(0.0, 1, 0.01)

f1_scores = [f1_score(y_test, y_hat_test, t)[0] for t in threshold_list]

plt.plot(threshold_list, f1_scores)

plt.xlabel("threshold")

plt.ylabel("mean F1")

plt.title("F1 vs threshold (test set)")

plt.show()

In [None]:
# Report test-split metrics at a single global threshold of 0.25.
# The ground truth (y_test) must be the first argument: scoring y_hat_test
# against itself does not measure how well the predictions match the labels.
f1, precision, recall = f1_score(y_test, y_hat_test, 0.25)

print("f1        |" + str(f1))

print("precision |" + str(precision))

print("recall    |" + str(recall))

In [None]:
labels

In [None]:
# Sweep the threshold over [0, 1) in steps of 0.01 and plot the F1 of a single
# label on the test split; labels[label] selects that label's entry in the
# per-label F1 tensor. Labels and a title are set so the figure can be read
# on its own.
threshold_list = np.arange(0.0, 1, 0.01)

label = 'User Choice/Control'

f1_scores_per_label = [f1_score_per_label(y_test, y_hat_test, t)[0][labels[label]].item() for t in threshold_list]

plt.plot(threshold_list, f1_scores_per_label)

plt.xlabel("threshold")

plt.ylabel("F1")

plt.title("F1 vs threshold: " + label)

plt.show()

In [None]:
label

In [None]:
f1_score_per_label(y_test, y_hat_test, 0.01)[0][labels[label]].item()

In [None]:
# Save the whole trained network. The name `model` is never defined in this
# notebook (the network is `model_all`), so the original call only worked with
# leftover kernel state and fails on a fresh Restart & Run All.
torch.save(model_all, "second_model.model")

### Things to take into consideration

1. It seems that, with the GloVe pretrained embeddings, 1000 words are missing from the vocabulary and are initialized as random vectors.
2. Here we can see a very strange behaviour: we expect all 0s except for the last entry, where we expect $\ln(0.9)$, and the observed value is not even close to it. It seems that the BCE (binary cross-entropy) is not being computed exactly as we think.