# Example of a Convolutional Neural Network for Text Classification

In [None]:
#Imports needed from pytorch
import torch
from torch.utils.data import Dataset

#Some built-in imports
import matplotlib.pyplot as plt
import numpy as np
import pickle
from os.path import join

#Imports from the repository
import data_processing as dp
from privacy_policies_dataset import PrivacyPoliciesDataset_all as PPD
from cnn import CNN

## 1. Generating word embeddings matrix

We read from raw_data all the files and get all the different words we can find within all the files. If we already have a file named dictionary.pkl and read set to True, it will read the dictionary from this file. 

In [None]:
# Collect every distinct word found in the files under raw_data.
# Per the notebook text, read=True reuses an existing dictionary.pkl when
# one is available.
dictionary = dp.get_tokens(
    "raw_data",
    "embeddings_data",
    read=True,
)

The next step is to load the pretrained embeddings. We will get two python dictionaries. Both have the words as their keys: one has the vectors as the values whilst the other one has the position in the dictionary as the values.

In [None]:
# Load the pretrained GloVe embeddings (300 is passed as the embedding
# dimension) and report how many words they cover.
word2vector_glove = dp.get_glove_dicts(
    "glove.6B",
    "embeddings_data",
    300,
    read=True,
)
print(
    "Number of words in the pretrained embeddings: {}".format(
        len(word2vector_glove)
    )
)

Now we compute the matrix containing all the word embeddings that we will need for the embedding layer of the CNN and we obtain a word2idx of just all the words inside dictionary and not all the words present in the word embeddings. Usually the pretrained embeddings that we will use have more words than what we need, that is the reason why we need to obtain a new word2idx of just all the words that we found in the files inside train and test folders.

In [None]:
# Build the embedding matrix for the CNN's embedding layer together with a
# word -> index mapping restricted to the words in `dictionary`.
weights_matrix, word2idx = dp.get_weight_matrix(
    dictionary,
    word2vector_glove,
    300,
    "embeddings_data",
    read=True,
)

## 2. Creation of Datasets

The first step before obtaining the perfectly cleaned data that will be used in the CNN is to aggregate the labels. The raw_data folder provides a series of files in csv format with repeated sentences. The reason behind this is that some sentences have several labels assigned to them. We therefore aggregate the segments and obtain a list of labels per sentence. The following function gets all the data from the raw_data folder and outputs the result in agg_data.

In [None]:
# Aggregate the per-sentence labels from raw_data into agg_data, as
# described in the notebook text.
dp.aggregate_data(
    read=True,
)

Now that we have the aggregated data in agg_data we will process all the sentences and transform them into a list of integers. The integers will refer to the position inside the word2idx dictionary. The labels will also be transformed into an n-dimensional vector with 1s if a sentence has that label and 0s if it doesn't. All the data will be placed in the corresponding folder inside processed_data. We load the labels with which we want to perform the classification. We will also show them so that it is clearer to the user.

In [None]:
# Load the labels used for classification. The context manager closes the
# file handle even if unpickling raises, which the manual open()/close()
# pair did not guarantee.
# NOTE: pickle can execute arbitrary code; only load labels.pkl from a
# trusted source.
with open("labels.pkl", "rb") as labels_file:
    labels = pickle.load(labels_file)

# Show every available label to the user.
for label in labels:
    print(label)

In [None]:
# Convert the aggregated sentences and their labels into the matrices that
# the dataset class consumes.
sentence_matrices_all, labels_matrices_all = dp.process_dataset(
    labels,
    word2idx,
    read=True,
)

We now create a PPD, which stands for PrivacyPoliciesDataset, containing the training and testing dataset. We will need to split the data in two to get the test data and the data that will be used for training and validation.

In [None]:
# Wrap all processed sentences and labels in a single dataset object.
dataset = PPD(
    sentence_matrices_all,
    labels_matrices_all,
    labels,
)

# First random split: the test portion and the train+validation portion.
test_dataset, train_validation_dataset = dataset.split_dataset_randomly()

# Second random split of the train+validation portion.
# NOTE(review): validation_dataset and train_dataset are not referenced by
# the later cells visible in this notebook; confirm whether they are needed.
validation_dataset, train_dataset = train_validation_dataset.split_dataset_randomly()

# Persist the two splits that the rest of the notebook works with.
test_dataset.pickle_dataset("datasets/test_dataset.pkl")
train_validation_dataset.pickle_dataset("datasets/train_validation_dataset.pkl")

In case we already had all the data split and prepared we can load it like this: 

In [None]:
# Reload the previously pickled splits instead of regenerating them.
test_dataset = PPD.unpickle_dataset(
    "datasets/test_dataset.pkl"
)
train_validation_dataset = PPD.unpickle_dataset(
    "datasets/train_validation_dataset.pkl"
)

## 3. Creation of CNN and training

Now we set the 7 main parameters of the CNN we are using:
1. Number of words in the dictionary
2. Embeddings dimension
3. Number of filters per kernel size
4. Number of hidden units
5. Number of labels to classify
6. List of all the kernels sizes we want to use
7. Name of the model

In [None]:
# Positional arguments follow the order of the parameter list in the
# notebook text.
model = CNN(
    6800,   # number of words in the dictionary
    300,    # embeddings dimension
    200,    # number of filters per kernel size
    100,    # number of hidden units
    9,      # number of labels to classify
    [3],    # kernel sizes to use
    name='bis',
)

# Initialise the embedding layer with the pretrained vectors.
# NOTE(review): presumably 6800 must match the number of rows in
# weights_matrix; confirm against the CNN class.
model.load_pretrained_embeddings(weights_matrix)

We have also added the pretrained embeddings to the embedding layer of the CNN through load_pretrained_embeddings. The function called train_cnn will need three more parameters:
1. number of epochs 
2. learning rate
3. momentum constant

In [None]:
# Train on the train+validation split. The call returns the epoch indices
# and the per-epoch training and validation losses, unpacked below.
results = model.train_cnn(
    train_validation_dataset,
    epochs_num=50,
    lr=0.01,
    momentum=0.9,
)
epochs, train_losses, validation_losses = results

Now we plot the evolution of the Loss with respect to the epoch.

In [None]:
# Draw both loss curves on the same axes so they can be compared per epoch.
plt.plot(epochs, train_losses, label="train")
plt.plot(epochs, validation_losses, label="validation")

# Decorate and display the figure.
plt.title("loss vs epoch")
plt.legend()
plt.show()

We save all the parameters used in the CNN (weights of all the layers and the configurations of the CNN)

In [None]:
# The weights go to trained_models/<cnn_name>_state.pt.
dict_path = join(
    "trained_models",
    model.cnn_name + "_state.pt",
)
torch.save(model.state_dict(), dict_path)

# Store the CNN configuration through the model's own helper.
model.save_cnn_params()

## 4. Evaluation of the CNN results

We extract the true labels from the training and testing data sets and predict the labels for both data sets. The predictions are usually referred to as y_hat. 

In [None]:
# Ground-truth label tensors of the train+validation and test splits.
y_train = train_validation_dataset.labels_tensor
y_test = test_dataset.labels_tensor

# Model inputs: the first element of what collate_data returns per split.
x_train = PPD.collate_data(train_validation_dataset)[0]
x_test = PPD.collate_data(test_dataset)[0]

# These forward passes are used for evaluation only, so gradient tracking
# is disabled. That avoids recording an autograd graph over each full
# dataset.
# NOTE(review): if CNN contains dropout or batch-norm layers, model.eval()
# may also be wanted here; confirm against the CNN class.
with torch.no_grad():
    y_hat_train = model(x_train)
    y_hat_test = model(x_test)

We now show how the F1 score changes for all possible thresholds

In [None]:
# Report the model's results on the train+validation and test splits.
model.print_results(
    train_validation_dataset,
    test_dataset,
    labels,
)

With the following block of code we will find the thresholds with which we obtain the best overall F1 score. 

In [None]:
# Search for the best thresholds using the training predictions.
best_f1_score, best_t_label = CNN.get_best_thresholds(
    y_train,
    y_hat_train,
    labels,
)

Now we show the results for the best combination of thresholds

In [None]:
# Evaluate on the test split with the thresholds found on the training data
# and report F1, precision and recall.
f1, precision, recall = CNN.f1_score(
    y_test,
    y_hat_test,
    torch.tensor(best_t_label),
)

# print() applies str() to each argument; sep="" keeps the output identical
# to plain string concatenation.
print("f1        |", f1, sep="")
print("precision |", precision, sep="")
print("recall    |", recall, sep="")

We show a list of all the possible labels to remind the user which ones are available.

In [None]:
# List every label with its index. dict.iteritems() exists only on
# Python 2, whereas items() works on both Python 2 and Python 3.
for label, i in labels.items():
    print("index {}. {}.".format(i, label))

We can also show how the F1 score changes for all possible thresholds in just one label

In [None]:
# Candidate thresholds from 0.0 up to (but excluding) 1 in steps of 0.01.
threshold_list = np.arange(0.0, 1, 0.01)

# Label whose behaviour is inspected; labels[label] is its index in the
# per-label score tensors.
label = 'User Choice/Control'

# F1 score of the selected label on the test split at each threshold.
f1_scores_per_label = [
    CNN.f1_score_per_label(y_test, y_hat_test, t)[0][labels[label]].item()
    for t in threshold_list
]

plt.plot(threshold_list, f1_scores_per_label)
plt.title("".join([label, " f1 score", " vs threshold"]))
plt.show()

# Per-label metrics at a fixed threshold of 0.5, reduced to plain Python
# numbers for the selected label.
f1_label, precision_label, recall_label = CNN.f1_score_per_label(
    y_test,
    y_hat_test,
    0.5,
)
f1_label = f1_label[labels[label]].item()
precision_label = precision_label[labels[label]].item()
recall_label = recall_label[labels[label]].item()

# sep="" makes print() emit the same text as string concatenation would.
print("Label: ", label, "\n", sep="")
print("f1_label        |", f1_label, sep="")
print("precision_label |", precision_label, sep="")
print("recall_label    |", recall_label, sep="")

## 5. Comparison between models

In [None]:
# Reload the saved CNN constructor arguments (presumably the kernel-size-2
# model, judging by the file name). The context manager closes the file,
# which the original open() call never did.
# NOTE: pickle can execute arbitrary code; only load trusted files.
with open('trained_models/cnn_300_200_100_9_[2]__params.pkl', 'rb') as params2Ks_file:
    params2Ks = pickle.load(params2Ks_file)

# Rebuild the network from those arguments and restore its trained weights.
model_2Ks = CNN(**params2Ks)
model_2Ks.load_state_dict(torch.load('trained_models/cnn_300_200_100_9_[2]__state.pt'))

# Report results on the same splits used for the other models.
model_2Ks.print_results(train_validation_dataset, test_dataset, labels)

In [None]:
# Reload the saved CNN constructor arguments (presumably the kernel-size-3
# model, judging by the file name). The context manager closes the file,
# which the original open() call never did.
# NOTE: pickle can execute arbitrary code; only load trusted files.
with open('trained_models/cnn_300_200_100_9_[3]__params.pkl', 'rb') as params3Ks_file:
    params3Ks = pickle.load(params3Ks_file)

# Rebuild the network from those arguments and restore its trained weights.
model_3Ks = CNN(**params3Ks)
model_3Ks.load_state_dict(torch.load('trained_models/cnn_300_200_100_9_[3]__state.pt'))

# Report results on the same splits used for the other models.
model_3Ks.print_results(train_validation_dataset, test_dataset, labels)

In [None]:
# Reload the saved CNN constructor arguments (presumably the kernel-size-5
# model, judging by the file name). The context manager closes the file,
# which the original open() call never did.
# NOTE: pickle can execute arbitrary code; only load trusted files.
with open('trained_models/cnn_300_200_100_9_[5]__params.pkl', 'rb') as params5Ks_file:
    params5Ks = pickle.load(params5Ks_file)

# Rebuild the network from those arguments and restore its trained weights.
model_5Ks = CNN(**params5Ks)
model_5Ks.load_state_dict(torch.load('trained_models/cnn_300_200_100_9_[5]__state.pt'))

# Report results on the same splits used for the other models.
model_5Ks.print_results(train_validation_dataset, test_dataset, labels)

In [None]:
# Reload the saved CNN constructor arguments (presumably the kernel-size-7
# model, judging by the file name). The context manager closes the file,
# which the original open() call never did.
# NOTE: pickle can execute arbitrary code; only load trusted files.
with open('trained_models/cnn_300_200_100_9_[7]__params.pkl', 'rb') as params7Ks_file:
    params7Ks = pickle.load(params7Ks_file)

# Rebuild the network from those arguments and restore its trained weights.
model_7Ks = CNN(**params7Ks)
model_7Ks.load_state_dict(torch.load('trained_models/cnn_300_200_100_9_[7]__state.pt'))

# Report results on the same splits used for the other models.
model_7Ks.print_results(train_validation_dataset, test_dataset, labels)