In [None]:
import pandas as pd
import torch
import torch.optim 
import torch.nn as nn
import torch.nn.functional as F
from math import log, log10
from torch.utils.data import Dataset, DataLoader
import data_processing as dp
import pickle
from privacy_policies_dataset import PrivacyPoliciesDataset
from os.path import isfile, join
from os import listdir
import numpy as np

In [None]:
dictionary = dp.get_tokens("raw_data",False)

In [None]:
word2vector, word2idx_glove = dp.get_glove_dicts("glove.6B", 50, True)

In [None]:
weights_matrix, word2idx = dp.get_weight_matrix(dictionary, word2vector, 50, True)

In [None]:
# Load the pickled labels. The context manager releases the file handle even
# if unpickling raises, which the manual open()/close() pair did not.
# NOTE: pickle.load can execute arbitrary code -- only load "labels.pkl" from
# a trusted source.
with open("labels.pkl", "rb") as labels_file:
    labels = pickle.load(labels_file)

In [None]:
# Run the data_processing pipeline over the labels using the vocabulary index.
# NOTE(review): the two returned values are not referenced again in the cells
# visible here; the call may matter mainly for what it writes out -- confirm.
sentence_matrices, labels_matrices = dp.process_dataset(labels, word2idx, True)

In [None]:
# Takes no arguments and its return value is discarded, so it is run purely
# for its side effects.
# NOTE(review): what exactly gets aggregated is defined in data_processing -- confirm.
dp.aggregate_data()

In [None]:
# Build the dataset object from "raw_data", the vocabulary index and the labels.
# The cells below call methods on `dataset` without capturing a return value,
# so they must be run once each, in this order, starting from a fresh kernel.
dataset = PrivacyPoliciesDataset("raw_data", word2idx, labels)

In [None]:
# NOTE(review): presumably brings all segments to a common length -- confirm
# in privacy_policies_dataset.
dataset.resize_segments()

In [None]:
# Display the size of the first segment (segments_list is still indexed per segment here).
dataset.segments_list[0].size()

In [None]:
# NOTE(review): presumably adds the singleton channel axis that the Conv2d
# layers of CNN expect -- confirm.
dataset.expand_dimensions()

In [None]:
# After this call segments_list is treated as one tensor-like object:
# .size() is called on it directly below and it is fed to an embedding layer later.
dataset.group_samples()

In [None]:
# Display the size of the grouped segments.
dataset.segments_list.size()

In [None]:
# Display the size of the grouped labels.
dataset.labels_list.size()

In [None]:
myemb = nn.Embedding.from_pretrained(torch.tensor(weights_matrix))

myemb.weight.requires_grad

In [None]:
matrix = myemb(dataset.segments_list)

print("Before reshaping: " + str(matrix.shape))

# matrix = matrix.view(1,1,67,50)

# print("After reshaping: " + str(matrix.shape))

In [None]:
class CNN(nn.Module):
    """Multi-kernel text CNN that outputs one sigmoid score per class.

    Token indices are looked up in a frozen, pretrained embedding table,
    passed through one ``Conv2d`` per kernel height in ``Ks`` (every kernel
    spans the full embedding width), max-pooled over the sequence axis,
    concatenated, and mapped to ``C`` sigmoid outputs.

    Args:
        weights_matrix: 2-D array-like or tensor of shape
            ``(num_embeddings, embedding_dim)`` holding the pretrained vectors.
        Co: number of output channels of every convolution.
        C: number of output units (classes).
        Ks: sequence of kernel heights, i.e. how many consecutive tokens each
            convolution covers.
    """

    def __init__(self, weights_matrix, Co, C, Ks):
        super(CNN, self).__init__()

        num_embeddings, embeddings_dim = weights_matrix.shape

        self.Co = Co
        self.C = C
        self.Ks = Ks

        # torch.tensor() on an existing tensor emits a copy-construct warning,
        # so tensors are cloned explicitly; anything else is converted as before.
        if isinstance(weights_matrix, torch.Tensor):
            pretrained = weights_matrix.detach().clone().float()
        else:
            pretrained = torch.tensor(weights_matrix).float()

        # Sub-modules are registered in the same order as before so that
        # positional access into model.parameters() keeps its meaning.
        self.embedding = nn.Embedding.from_pretrained(pretrained)

        self.convolutions = nn.ModuleList(
            [nn.Conv2d(1, self.Co, (k, embeddings_dim)) for k in Ks]
        )

        self.relu = nn.ReLU()

        self.linear = nn.Linear(self.Co * len(self.Ks), self.C)

        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Score a batch of token-index sequences.

        Args:
            x: integer tensor of token indices shaped ``(batch, 1, seq_len)``
                or ``(batch, seq_len)``; the singleton channel axis is added
                when it is missing. ``seq_len`` must be at least ``max(Ks)``.

        Returns:
            Float tensor of shape ``(batch, C)`` with values in ``[0, 1]``.
        """
        if x.dim() == 2:
            # Conv2d needs an explicit channel axis: (batch, seq_len) -> (batch, 1, seq_len)
            x = x.unsqueeze(1)

        x = self.embedding(x)  # (batch, 1, seq_len, embedding_dim)

        # Each kernel spans the full embedding width, so the last axis has size 1 and is dropped.
        x = [self.relu(conv(x)).squeeze(3) for conv in self.convolutions]

        # Global max over the remaining sequence positions -> (batch, Co) per kernel height.
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]

        x = torch.cat(x, 1)

        x = self.linear(x)

        return self.sigmoid(x)

In [None]:
# 6 feature maps per kernel height, 10 output units, kernel heights of 3, 5 and 7 tokens.
model = CNN(weights_matrix, Co=6, C=10, Ks=[3, 5, 7])

In [None]:
# Full-batch training: every step feeds the whole dataset through the model.
# NOTE: `input` shadows the Python builtin; the name is kept so any other cell
# that refers to it keeps working.
input = dataset.segments_list

# BCELoss requires floating-point targets.
target = dataset.labels_list.float()

optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# The model already ends in a sigmoid, so plain BCELoss is the matching criterion.
criterion = nn.BCELoss()

for i in range(10):

    # Clear the gradients left over from the previous step.
    optimizer.zero_grad()

    output = model(input)

    loss = criterion(output, target)

    # With 10 iterations this reports the loss of the first step only.
    if i % 10 == 0:

        print("loss: " + str(loss.item()))

    loss.backward()

    optimizer.step()

In [None]:
# Inspect the gradient left on the third parameter tensor by the last training step.
# Given the registration order in CNN.__init__ (embedding, convolutions, linear),
# index 2 should be the bias of the first convolution -- TODO confirm with
# model.named_parameters().
list(model.parameters())[2].grad

In [None]:
# Serializes the whole module object (not just its state_dict) to the file "first_model".
# NOTE(review): loading it back requires the CNN class to be available under the same name.
torch.save(model, "first_model")

### Things to take into consideration

1. With the pretrained GloVe embeddings, it seems that 1000 words from our vocabulary are missing; these are initialized as random vectors.
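2. Here we can see a very strange behaviour. We expected all entries to be 0 except for the last one, for which we expected $\ln(0.9)$, but the result is not even close to that. It seems the BCE is not computed exactly as we assumed. A likely explanation is that `nn.BCELoss` uses the negated log-likelihood, $-[y \ln x + (1 - y)\ln(1 - x)]$, and by default (`reduction='mean'`) returns the average over all elements rather than the individual terms.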