# Example of a Convolutional Neural Network for Text Classification

In [None]:
#Imports needed from pytorch
import torch
from torch.utils.data import Dataset, Subset, DataLoader

#Some built-in imports
import matplotlib.pyplot as plt
import numpy as np
import pickle

#Imports from the repository
import data_processing as dp
from privacy_policies_dataset import PrivacyPoliciesDataset_all as PPD
from cnn import CNN, train_cnn, f1_score, f1_score_per_label

## 1. Preprocessing

We read all the files in raw_data, from both the train and test folders, and collect every distinct word found in them. If a file named dictionary.pkl already exists and read is set to True, the dictionary is read from that file instead.

In [None]:
# Collect the distinct words found in the files under raw_data.
# read=False: do not load a previously saved dictionary.pkl.
dictionary = dp.get_tokens("raw_data", read=False)

The next step is to load the pretrained embeddings. We will get two Python dictionaries, both keyed by word: one maps each word to its embedding vector, whilst the other maps each word to its position in the dictionary.

In [None]:
# Load the pretrained GloVe embeddings: word -> vector and word -> position.
word2vector_glove, word2idx_glove = dp.get_glove_dicts(
    "glove.6B",
    300,
    read=False,
)

# Report how many words the pretrained embeddings contain.
print("number of words in the pretrained embeddings: {}".format(len(word2idx_glove)))

Now we compute the matrix containing all the word embeddings that we will need for the embedding layer of the CNN and we obtain a word2idx of just all the words inside dictionary and not all the words present in the word embeddings. Usually the pretrained embeddings that we will use have more words than what we need, that is the reason why we need to obtain a new word2idx of just all the words that we found in the files inside train and test folders.

In [None]:
# Build the embedding matrix for the CNN's embedding layer, together with a
# word2idx restricted to the words present in `dictionary`.
weights_matrix, word2idx = dp.get_weight_matrix(
    dictionary,
    word2vector_glove,
    300,
    read=False,
)

We load the labels with which we want to perform the classification. We also print them so that they are clear to the user.

In [None]:
# Load the labels used for classification. The context manager guarantees the
# file handle is closed even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load labels.pkl from a
# trusted source.
with open("labels.pkl", "rb") as labels_file:
    labels = pickle.load(labels_file)

# Print every label so the user can see what is being classified.
for label in labels:
    print(label)

The last step before obtaining the perfectly cleaned data that will be used in the CNN is to aggregate the labels. The dataset provides files in CSV format with repeated sentences, because some sentences have several labels assigned to them. We therefore aggregate the segments and obtain a list of labels per sentence. The following function gets all the data from the raw_data folder and outputs the result in agg_data.

In [None]:
# Aggregate repeated sentences so that each one carries its list of labels;
# per the notebook text, the result is written to agg_data.
dp.aggregate_data(read=False, onefile=True)

Now that we have the aggregated data in agg_data, we will process all the sentences and transform them into lists of integers. Each integer refers to a word's position inside the word2idx dictionary. The labels will also be transformed into an n-dimensional vector with 1s where a sentence has that label and 0s where it doesn't. All the data will be placed in the corresponding folder inside processed_data.

In [None]:
# Convert the aggregated train and test data into index sequences and label
# vectors (read=False: process the data instead of loading a saved result).
sentence_matrices_train, labels_matrices_train = dp.process_dataset(
    "train", labels, word2idx, read=False
)
sentence_matrices_test, labels_matrices_test = dp.process_dataset(
    "test", labels, word2idx, read=False
)

In [None]:
# Same processing for the "all" dataset; read=True here, unlike the
# train/test calls.
sentence_matrices_all, labels_matrices_all = dp.process_dataset(
    "all", labels, word2idx, read=True
)

## 2. Creation of Datasets and DataLoader

We now create two objects called PrivacyPoliciesDataset containing the training and testing datasets. We will need to resize the sentences/segments, as each of them has a different number of words. We will fill them with 0s until all the sentences/segments have the same length (index 0 refers to a 0 vector). Now that all the sentences have the same length, we can group them in a matrix that will be the input for the embedding layer.

In [None]:
# Wrap the processed sentences and labels in a single dataset object.
dataset = PPD(sentence_matrices_all, labels_matrices_all, labels)

# First random split: hold out the test set. Second random split: carve the
# validation set out of the remainder, leaving the training set.
test_dataset, train_validation_dataset = dataset.split_dataset_randomly()
validation_dataset, train_dataset = train_validation_dataset.split_dataset_randomly()

# Persist the two top-level splits.
test_dataset.pickle_dataset("test_dataset.pkl")
train_validation_dataset.pickle_dataset("train_validation_dataset.pkl")

In [None]:
# Processing train_dataset
# NOTE(review): `PrivacyPoliciesDataset` is not among the imports visible in
# this notebook (only `PrivacyPoliciesDataset_all as PPD` is imported), so this
# cell raises NameError unless the class is defined elsewhere -- confirm.
# NOTE(review): this cell rebinds `train_dataset` and `test_dataset`, replacing
# the objects produced by the random split in the previous cell.

print("Preparing train_dataset --------")

train_dataset = PrivacyPoliciesDataset("train" ,"raw_data" , word2idx, labels, read = True)

# Presumably pads every segment to a common length, as the notebook text
# describes -- TODO confirm against the dataset class.
train_dataset.resize_segments()

train_dataset.expand_dimensions()

# Presumably groups the equally sized segments into one matrix -- TODO confirm.
train_dataset.group_samples()

# Processing test_dataset (same three steps as for the training data)

print("\n" + "Preparing test_dataset  --------")

test_dataset = PrivacyPoliciesDataset("test" ,"raw_data" , word2idx, labels, read = True)

test_dataset.resize_segments()

test_dataset.expand_dimensions()

test_dataset.group_samples()

We now create a DataLoader that allows us to use mini-batch gradient descent. We provide train_dataset as the input, along with the size of the batches that we want to use.

In [None]:
# Serve the training data in shuffled mini-batches of 100 samples.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=100,
    shuffle=True,
)

## 3. Creation of CNN and training

Now we set the 6 main parameters of the CNN we are using:
1. Number of words in the dictionary
2. Embeddings dimension
3. Number of filters per kernel size
4. Number of hidden units
5. Number of labels to classify
6. List of all the kernels sizes we want to use

We will also add the pretrained embeddings to the embedding layer of the CNN through load_pretrained_embeddings. The function called train_cnn needs three more parameters:
1. number of epochs 
2. learning rate
3. momentum constant

In [None]:
# Positional arguments presumably follow the order listed in the notebook text
# above -- confirm against cnn.CNN.
model_all = CNN(
    6800,  # number of words in the dictionary
    300,   # embeddings dimension
    200,   # number of filters per kernel size
    160,   # number of hidden units
    9,     # number of labels to classify
    [3],   # kernel sizes
)

# Initialise the embedding layer with the pretrained vectors.
model_all.load_pretrained_embeddings(weights_matrix)

# Train and keep the per-epoch loss history for plotting.
epochs, losses = train_cnn(
    model_all,
    train_dataloader,
    epochs_num=100,
    lr=0.025,
    momentum=0.9,
)

Now we plot the evolution of the Loss with respect to the epoch.

In [None]:
# Training loss as a function of the epoch.
plt.plot(epochs, losses)
plt.title("loss vs epoch")
plt.show()

We save all the parameters used in the CNN (weights of all the layers and the configurations of the CNN)

In [None]:
# Persist the learned weights under a file name derived from the model's name.
torch.save(
    model_all.state_dict(),
    model_all.cnn_name + "_state.pt",
)

# Persist the CNN configuration as well.
model_all.save_cnn_params()

## 4. Evaluation of the CNN results

We extract the true labels from the training and testing datasets and predict the labels for both datasets. The predictions are usually referred to as y_hat.

In [None]:
# True labels of both splits.
y_train, y_test = train_dataset.labels_list, test_dataset.labels_list

# Model predictions (y_hat) for both splits.
y_hat_train = model_all(train_dataset.segments_list)
y_hat_test = model_all(test_dataset.segments_list)

In [None]:
# Display the labels object (a bare expression is echoed as the cell output).
labels

We now show how the F1 score changes for all possible thresholds

In [None]:
def get_best_thresholds(y_test, y_hat_test, labels):
    """Find, for every label, the threshold that maximises its F1 score.

    Thresholds 0.00, 0.01, ..., 0.99 are swept; for each label the first
    threshold that reaches the highest F1 value is kept.

    Args:
        y_test: true labels, passed unchanged to ``f1_score_per_label``.
        y_hat_test: predictions, passed unchanged to ``f1_score_per_label``.
        labels: mapping from label name to the label's index in the
            per-label scores.

    Returns:
        A tuple ``(best_f1_label, best_t_label)`` of two lists indexed by
        label index: the best F1 score and the threshold that produced it.
        An entry stays 0 when no threshold gives that label an F1 above 0.
    """
    threshold_list = np.arange(0.0, 1, 0.01)

    # Size the result lists from the mapping instead of hard-coding 9 labels.
    size = max(labels.values(), default=-1) + 1
    best_f1_label = [0] * size
    best_t_label = [0] * size

    for t in threshold_list:
        # One evaluation per threshold yields the F1 of every label at once,
        # instead of re-evaluating for each (label, threshold) pair.
        f1_per_label = f1_score_per_label(y_test, y_hat_test, t)[0]

        for index in labels.values():
            current_f1 = f1_per_label[index].item()

            # Strict comparison keeps the first threshold that hits the maximum.
            if current_f1 > best_f1_label[index]:
                best_f1_label[index] = current_f1
                best_t_label[index] = t

    return best_f1_label, best_t_label

In [None]:
def model_results(model, train_dataset, test_dataset, labels):
    """Plot and print the evaluation metrics of ``model`` on both datasets.

    Draws F1, precision and recall against the decision threshold (train and
    test curves), prints the test scores at a 0.5 threshold and the best
    per-label F1 with its threshold, and saves the figure as
    ``<model.cnn_name>.png``.

    Args:
        model: callable model exposing a ``cnn_name`` attribute.
        train_dataset: object exposing ``labels_list`` and ``segments_list``.
        test_dataset: object exposing ``labels_list`` and ``segments_list``.
        labels: mapping from label name to label index.

    Returns:
        None.
    """
    y_train = train_dataset.labels_list
    y_test = test_dataset.labels_list

    y_hat_train = model(train_dataset.segments_list)
    y_hat_test = model(test_dataset.segments_list)

    # This will be the x axis
    threshold_list = np.arange(0.0, 1, 0.01)

    # Evaluate each threshold once per dataset; every entry is the
    # (f1, precision, recall) triple returned by f1_score.
    scores_test = [f1_score(y_test, y_hat_test, t) for t in threshold_list]
    scores_train = [f1_score(y_train, y_hat_train, t) for t in threshold_list]

    fig = plt.figure(figsize=(15, 4))

    # One panel per metric: (subplot position, title, index in the triple).
    panels = (
        (131, 'F1 Score vs Threshold', 0),
        (132, 'Precision vs Threshold', 1),
        (133, 'Recall vs Threshold', 2),
    )

    for position, title, metric in panels:
        ax = fig.add_subplot(position)
        ax.plot(threshold_list, [s[metric] for s in scores_test], label='test')
        ax.plot(threshold_list, [s[metric] for s in scores_train], label='train')
        ax.set_title(title)
        ax.legend()

    plt.show()

    separator = "-" * 35 * 3

    # We show the F1, precision and recall for a threshold of 0.5
    f1, precision, recall = f1_score(y_test, y_hat_test, 0.5)

    print("Scores with 0.5 threshold")
    print(separator)
    print("f1        |" + str(f1))
    print("precision |" + str(precision))
    print("recall    |" + str(recall))
    print(separator)

    best_f1_label, best_t_label = get_best_thresholds(y_test, y_hat_test, labels)

    row_format = "{:<38}" * 3

    print("\n" + "F1 Score per Label")
    print(separator)
    print(row_format.format("Label", "F1", "Threshold"))
    print(separator)

    # Python 3: dict.items() and the print() function, replacing the
    # Python 2 iteritems() call and print statement.
    for label, index in labels.items():
        print(row_format.format(label, best_f1_label[index], best_t_label[index]))

    # We save the figure into a picture
    fig.savefig(fname=model.cnn_name + '.png', format='png')

In [None]:
# Plot and print the evaluation of the freshly trained model.
model_results(
    model=model_all,
    train_dataset=train_dataset,
    test_dataset=test_dataset,
    labels=labels,
)

With the following block of code we will find the per-label thresholds with which we obtain the best F1 scores.

In [None]:
# Best F1 score and matching threshold for every label, on the test set.
best_f1_score, best_t_label = get_best_thresholds(
    y_test=y_test,
    y_hat_test=y_hat_test,
    labels=labels,
)

Now we show the results for the best combination of thresholds

In [None]:
# We show the F1, precision and recall for the best threshold
f1, precision, recall = f1_score(y_test, y_hat_test, torch.tensor(best_t_label))

print("f1        |" + str(f1))

print("precision |" + str(precision))

print("recall    |" + str(recall))

We can also show how the F1 score changes for all possible thresholds in just one label

In [None]:
# Sweep the thresholds for one label and plot how its F1 score evolves.
threshold_list = np.arange(0.0, 1, 0.01)
label = 'User Choice/Control'

f1_scores_per_label = [
    f1_score_per_label(y_test, y_hat_test, t)[0][labels[label]].item()
    for t in threshold_list
]

plt.plot(threshold_list, f1_scores_per_label)
plt.title(label + " f1 score" + " vs threshold")
plt.show()

# Scores of the chosen label at a fixed threshold of 0.07: pick this label's
# entry out of each per-label result and convert it to a Python number.
f1_label, precision_label, recall_label = (
    metric[labels[label]].item()
    for metric in f1_score_per_label(y_test, y_hat_test, 0.07)
)

print("Label: " + label + "\n")
print("f1_label        |" + str(f1_label))
print("precision_label |" + str(precision_label))
print("recall_label    |" + str(recall_label))

## 5. Comparison between models

In [None]:
# Rebuild the kernel-size-2 model from its saved configuration and weights.
# The context manager closes the parameter file, which was previously left open.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('cnn_300_200_100_9_[2]__params.pkl', 'rb') as params2Ks_file:
    params2Ks = pickle.load(params2Ks_file)

model_2Ks = CNN(**params2Ks)
model_2Ks.load_state_dict(torch.load('cnn_300_200_100_9_[2]__state.pt'))

model_results(model_2Ks, train_dataset, test_dataset, labels)

In [None]:
# Rebuild the kernel-size-3 model from its saved configuration and weights.
# The context manager closes the parameter file, which was previously left open.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('cnn_300_200_100_9_[3]__params.pkl', 'rb') as params3Ks_file:
    params3Ks = pickle.load(params3Ks_file)

model_3Ks = CNN(**params3Ks)
model_3Ks.load_state_dict(torch.load('cnn_300_200_100_9_[3]__state.pt'))

model_results(model_3Ks, train_dataset, test_dataset, labels)

In [None]:
# Rebuild the kernel-size-5 model from its saved configuration and weights.
# The context manager closes the parameter file, which was previously left open.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('cnn_300_200_100_9_[5]__params.pkl', 'rb') as params5Ks_file:
    params5Ks = pickle.load(params5Ks_file)

model_5Ks = CNN(**params5Ks)
model_5Ks.load_state_dict(torch.load('cnn_300_200_100_9_[5]__state.pt'))

model_results(model_5Ks, train_dataset, test_dataset, labels)

In [None]:
# Rebuild the kernel-size-7 model from its saved configuration and weights.
# The context manager closes the parameter file, which was previously left open.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('cnn_300_200_100_9_[7]__params.pkl', 'rb') as params7Ks_file:
    params7Ks = pickle.load(params7Ks_file)

model_7Ks = CNN(**params7Ks)
model_7Ks.load_state_dict(torch.load('cnn_300_200_100_9_[7]__state.pt'))

model_results(model_7Ks, train_dataset, test_dataset, labels)

### Things to take into consideration

1. It seems that with the GloVe pretrained embeddings there are 1000 words that are missing and are initialized as random vectors.