In [None]:
import pandas as pd
import torch
import torch.optim  
import torch.nn as nn
import torch.nn.functional as F
from math import log, log10
from torch.utils.data import Dataset
import data_processing as dp
import pickle
from privacy_policies_dataset import PrivacyPoliciesDataset
from os.path import isfile, join
from os import listdir
import numpy as np
import nltk

In [None]:
# Width of the word vectors. The same value has to be used when reading the
# GloVe vectors and when building the weight matrix, so it is defined once.
# (Presumably selects the 50-dimensional glove.6B vectors - confirm in data_processing.)
EMBEDDING_DIM = 50

# Tokens collected from the "raw_data" folder.
# NOTE(review): the trailing True flags are passed through unchanged; their
# meaning is defined in data_processing and is not visible in this notebook.
dictionary = dp.get_tokens("raw_data", True)

# GloVe lookup tables: word -> vector and word -> index.
word2vector, word2idx_glove = dp.get_glove_dicts("glove.6B", EMBEDDING_DIM, True)

# Embedding matrix for the corpus vocabulary plus the matching word -> row index map.
weights_matrix, word2idx = dp.get_weight_matrix(dictionary, word2vector, EMBEDDING_DIM, True)

In [None]:
class CNN(nn.Module):
    """Convolutional multi-label classifier over pretrained word embeddings.

    The network embeds token indices, applies one 2-D convolution per kernel
    height in ``Ks`` (each spanning the full embedding width), max-pools every
    feature map over the sequence axis, concatenates the pooled features and
    maps them to ``C`` independent sigmoid scores.

    Attribute names (``embedding``, ``convolutions``, ``relu``, ``linear``,
    ``sigmoid``) are part of the contract: instances of this class that were
    pickled as a whole module look their sub-modules up by these names.

    Args:
        weights_matrix: 2-D array-like of shape (num_embeddings, embeddings_dim)
            holding the pretrained embedding vectors.
        Co: number of output channels (filters) of every convolution.
        C: number of output scores (classes).
        Ks: iterable of kernel heights, one convolution is created per entry.
    """

    def __init__(self, weights_matrix, Co, C, Ks):
        super().__init__()

        # Unpacking enforces a 2-D matrix; only the embedding width is needed.
        _, embeddings_dim = weights_matrix.shape

        self.Co = Co
        self.C = C
        self.Ks = Ks

        # from_pretrained keeps the embedding weights frozen by default.
        pretrained = torch.tensor(weights_matrix).float()
        self.embedding = nn.Embedding.from_pretrained(pretrained)

        # One single-input-channel convolution per kernel height; the kernel
        # is as wide as an embedding vector, so the width axis collapses to 1.
        branches = []
        for k in Ks:
            branches.append(nn.Conv2d(1, self.Co, (k, embeddings_dim)))
        self.convolutions = nn.ModuleList(branches)

        self.relu = nn.ReLU()

        self.linear = nn.Linear(self.Co * len(self.Ks), self.C)

        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return per-class sigmoid scores for a batch of index sequences.

        Args:
            x: integer tensor of token indices. The convolutions have a single
                input channel, so ``x`` is expected to be shaped
                (batch, 1, seq_len).

        Returns:
            Float tensor of shape (batch, C) with values in [0, 1].
        """
        embedded = self.embedding(x)

        pooled = []
        for conv in self.convolutions:
            # (batch, Co, positions, 1) -> (batch, Co, positions)
            activated = self.relu(conv(embedded)).squeeze(3)
            # Max over all positions: (batch, Co, 1) -> (batch, Co)
            window = activated.size(2)
            pooled.append(F.max_pool1d(activated, window).squeeze(2))

        features = torch.cat(pooled, 1)

        return self.sigmoid(self.linear(features))

In [None]:
# Untrained network: 6 filters for each kernel height (3, 5 and 7 tokens)
# and 10 output scores. Running the load cell that follows rebinds `model`.
model = CNN(weights_matrix, Co=6, C=10, Ks=[3, 5, 7])

In [None]:
# Rebind `model` to the network stored on disk.
# NOTE: torch.load unpickles arbitrary Python objects - only load files from a
# trusted source. The file is presumably a whole pickled CNN module, so the CNN
# class must already be defined; on PyTorch >= 2.6 loading such a file also
# requires passing weights_only=False.
# map_location="cpu" lets a checkpoint saved on a GPU load on a CPU-only machine
# and keeps the weights on the same device as the CPU tensors fed to the model.
model = torch.load("first_model.model", map_location="cpu")

# Inference mode (matters for dropout / batch-norm layers, if the stored model has any).
model = model.eval()

In [None]:
# Index the model's parameter tensors by position so individual weights can be
# inspected, e.g. dict_p[0]. Uses the model's own parameter iterator; no
# free-standing `params` callable exists in this notebook.
dict_p = dict(enumerate(model.parameters()))

In [None]:
# Load the label dictionary (label name -> output index) used to turn model
# scores back into label names.
# NOTE: pickle.load can execute arbitrary code - only load a trusted labels.pkl.
# The context manager closes the file even if unpickling fails.
with open("labels.pkl", "rb") as labels_file:
    labels = pickle.load(labels_file)

In [None]:
def resize_input(segment, target_length):
    """Zero-pad a sequence of token indices on both sides to a fixed length.

    The padding is split as evenly as possible; when the amount is odd the
    extra zero goes after the segment.

    Args:
        segment: 1-D sequence of integer token indices (anything accepted by
            ``len`` and ``np.asarray``).
        target_length: int, length of the returned tensor.

    Returns:
        torch.Tensor of dtype int64 and shape (target_length,).

    Raises:
        ValueError: if ``segment`` is longer than ``target_length``.
    """
    padding = target_length - len(segment)

    if padding < 0:
        raise ValueError(
            "segment of length %d does not fit in target_length %d"
            % (len(segment), target_length)
        )

    # Floor division: np.zeros needs an integer count, and `/` yields a float
    # in Python 3.
    zeros_to_prepend = padding // 2
    zeros_to_append = padding - zeros_to_prepend

    resized_array = np.concatenate([
        np.zeros(zeros_to_prepend, dtype=np.int64),
        np.asarray(segment, dtype=np.int64).ravel(),
        np.zeros(zeros_to_append, dtype=np.int64),
    ])

    return torch.tensor(resized_array, dtype=torch.int64)

In [None]:
def vector_to_labels(vector, labels, threshold = 0.5):
    """
    
    Returns the names of the labels whose score is strictly above a threshold.
    
    Args:
        vector: tensor or array of scores with a leading dimension of size 1
            (it is removed with ``squeeze(0)``); position i holds the score of
            the label whose index is i.
        labels: dictionary, dictionary with the labels as the keys and indexes as the values.
        threshold: float, a label is returned only when its score is greater
            than this value.
    Returns:
        candidates: list, label names ordered by index; labels sharing an index
            keep their order in ``labels``.
        
    """
    
    vector = vector.squeeze(0)
    
    tests = vector > threshold
    
    # Invert the mapping once instead of scanning every label for every
    # position. Lists keep all names when several labels share one index.
    names_by_index = {}
    
    for key, value in labels.items():
        names_by_index.setdefault(value, []).append(key)
    
    candidates = [key for i, test in enumerate(tests) if test for key in names_by_index.get(i, [])]
    
    return candidates

In [None]:
def guess_labels(model, segment, word2idx, labels, threshold = 0.5, target_length = 425):
    """Print a numbered list of the labels the model predicts for a text segment.

    Args:
        model: callable network returning one score per label for an input of
            shape (1, 1, target_length).
        segment: string, the text to classify.
        word2idx: dictionary passed to ``dp.sentence_serialization`` to turn
            the text into token indices.
        labels: dictionary, label names as keys and output indexes as values.
        threshold: float, labels scoring above this value are printed.
        target_length: int, length the token sequence is zero-padded to.
            The default of 425 is presumably the length used when the model
            was trained - confirm before changing it.

    Returns:
        None. The predicted labels are printed, one per line.
    """

    segment = dp.sentence_serialization(segment, word2idx)

    # Add the batch and channel dimensions expected by the network:
    # (target_length,) -> (1, 1, target_length).
    segment = resize_input(segment, target_length).unsqueeze(0).unsqueeze(0)

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():

        prediction = model(segment)

    candidates = vector_to_labels(prediction, labels, threshold)

    for i, candidate in enumerate(candidates, 1):

        print(str(i) + ". " + candidate)

In [None]:
# Example segments to classify.
segment1 = "the data will be erase after usage"
segment2 = "you can't recover your data"
segment3 = "we will share all your data"

# (heading, text, score threshold) for each demonstration run.
demo_cases = [
    ("First guess: ", segment1, 0.47),
    ("Second guess: ", segment2, 0.47),
    ("Third guess: ", segment3, 0.4),
]

for position, (heading, text, cutoff) in enumerate(demo_cases):

    # Separate consecutive runs exactly as a standalone print("\n") would.
    if position > 0:
        print("\n")

    print(heading)

    guess_labels(model, text, word2idx, labels, cutoff)