In [1]:
import re
import numpy as np
import torch as th
import torch.autograd as ag
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import pandas as pd

import framework as fk
import metrics as M
import losses as L

import gc

In [2]:
# Tokenize a sentence
def clean_str(string, tolower=True):
    """
    Tokenization/string cleaning.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Every character outside ``A-Za-z0-9(),!?'`` and the backtick is replaced
    by a space, English contractions are split from their stem
    ("don't" -> "do n't", "it's" -> "it 's"), and ``, ! ( ) ?`` are surrounded
    by spaces so that a later ``split(" ")`` yields them as separate tokens.

    Args:
        string: raw text to clean.
        tolower: when True (default) the result is lower-cased.

    Returns:
        The cleaned string, with runs of whitespace collapsed and no
        leading/trailing whitespace.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)

    # Detach contraction suffixes from the preceding word (order matters:
    # it mirrors the original sequence of substitutions).
    for suffix in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(suffix, " " + suffix)

    # Isolate punctuation. The original used replacement strings such as
    # " \( ", which emitted a literal backslash into the output tokens and
    # are invalid escape sequences in a non-raw Python string.
    for mark in (",", "!", "(", ")", "?"):
        string = string.replace(mark, " " + mark + " ")

    string = re.sub(r"\s{2,}", " ", string)
    if tolower:
        string = string.lower()
    return string.strip()

## Creating our model

In [3]:
class LSTM_classifier(nn.Module):
    """Bidirectional-LSTM text classifier producing 6 outputs per sequence.

    Pipeline: embedding -> packed bidirectional LSTM -> LSTM output at the
    last non-padding timestep of every sequence -> dropout -> linear -> ReLU
    -> linear (6 outputs).

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        hidden_dim2: width of the intermediate fully-connected layer.
        pad_idx: id of the padding token. When omitted, the notebook-level
            global ``padding_idx`` is used, as in the original code.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, hidden_dim2, pad_idx=None):
        super(LSTM_classifier, self).__init__()
        if pad_idx is None:
            # Fall back to the global defined by the dictionary-building cell.
            pad_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.linear = nn.Linear(2*hidden_dim, hidden_dim2)
        self.linear2 = nn.Linear(hidden_dim2, 6)
        # Registered but not applied in forward(); kept so the module's
        # parameters/state_dict stay the same as before.
        self.bn1 = nn.BatchNorm1d(num_features=hidden_dim2)
        self.dropout = nn.Dropout(0.2)

    def last_timestep(self, unpacked, lengths):
        """Select, for each sequence, the LSTM output at index ``length - 1``.

        Args:
            unpacked: tensor of shape (batch, time, features).
            lengths: 1-D integer tensor with one length per batch row.

        Returns:
            Tensor of shape (batch, features).
        """
        idx = (lengths - 1).view(-1, 1).expand(unpacked.size(0),
                                               unpacked.size(2)).unsqueeze(1)
        # squeeze(1) rather than squeeze(): a bare squeeze() would also drop
        # the batch dimension when the batch holds a single sequence.
        return unpacked.gather(1, idx.to(unpacked.device)).squeeze(1)

    def forward(self, inputs):
        """Compute the 6 outputs for a batch of padded token-id sequences.

        Args:
            inputs: integer tensor of shape (batch, seq_len), right-padded
                with the padding id.

        Returns:
            Tensor of shape (batch, 6).
        """
        pad = self.embedding.padding_idx
        # Length of a row = number of tokens before its first padding id.
        # Rows without any padding use the full width (the previous
        # np.where(...)[0][0] lookup raised IndexError for them); the
        # minimum of 1 is kept because packing rejects empty sequences.
        seen_pad = inputs.eq(pad).long().cumsum(dim=1)
        input_lengths = seen_pad.eq(0).sum(dim=1).clamp(min=1)

        out = self.embedding(inputs)
        # pack_padded_sequence expects the lengths on the CPU.
        out = th.nn.utils.rnn.pack_padded_sequence(out, input_lengths.cpu(), batch_first=True, enforce_sorted=False)
        out, _ = self.lstm(out)
        out, _ = th.nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        out = self.last_timestep(out, input_lengths)
        out = th.relu(self.linear(self.dropout(out)))
        out = self.linear2(out)

        return out

    def make_input(self, sentences):
        """Right-pad a list of token-id sequences into one batch.

        Args:
            sentences: non-empty list of sequences of integer token ids.

        Returns:
            Tuple ``(padded, lengths)`` of long tensors: ``padded`` has shape
            (batch, longest sequence) and is filled with the padding id,
            ``lengths`` holds the original length of every sequence.
        """
        X_lengths = [len(sentence) for sentence in sentences]
        padding_token = self.embedding.padding_idx

        # Integer array from the start instead of a float array cast later.
        padded_X = np.full((len(sentences), max(X_lengths)), padding_token, dtype=np.int64)

        # copy over the actual sequences
        for i, sequence in enumerate(sentences):
            padded_X[i, 0:len(sequence)] = sequence

        return th.from_numpy(padded_X), th.tensor(X_lengths, dtype=th.long)

In [None]:
class LSTM_classifier_single(nn.Module):
    """Bidirectional-LSTM text classifier producing 2 outputs per sequence.

    Pipeline: embedding -> packed bidirectional LSTM -> LSTM output at the
    last non-padding timestep of every sequence -> dropout -> linear -> ReLU
    -> linear (2 outputs).

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        hidden_dim2: width of the intermediate fully-connected layer.
        pad_idx: id of the padding token. When omitted, the notebook-level
            global ``padding_idx`` is used, as in the original code.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, hidden_dim2, pad_idx=None):
        super(LSTM_classifier_single, self).__init__()
        if pad_idx is None:
            # Fall back to the global defined by the dictionary-building cell.
            pad_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.linear = nn.Linear(2*hidden_dim, hidden_dim2)
        self.linear2 = nn.Linear(hidden_dim2, 2)
        # Registered but not applied in forward(); kept so the module's
        # parameters/state_dict stay the same as before.
        self.bn1 = nn.BatchNorm1d(num_features=hidden_dim2)
        self.dropout = nn.Dropout(0.2)

    def last_timestep(self, unpacked, lengths):
        """Select, for each sequence, the LSTM output at index ``length - 1``.

        Args:
            unpacked: tensor of shape (batch, time, features).
            lengths: 1-D integer tensor with one length per batch row.

        Returns:
            Tensor of shape (batch, features).
        """
        idx = (lengths - 1).view(-1, 1).expand(unpacked.size(0),
                                               unpacked.size(2)).unsqueeze(1)
        # squeeze(1) rather than squeeze(): a bare squeeze() would also drop
        # the batch dimension when the batch holds a single sequence.
        return unpacked.gather(1, idx.to(unpacked.device)).squeeze(1)

    def forward(self, inputs):
        """Compute the 2 outputs for a batch of padded token-id sequences.

        Args:
            inputs: integer tensor of shape (batch, seq_len), right-padded
                with the padding id.

        Returns:
            Tensor of shape (batch, 2).
        """
        pad = self.embedding.padding_idx
        # Length of a row = number of tokens before its first padding id.
        # Rows without any padding use the full width (the previous
        # np.where(...)[0][0] lookup raised IndexError for them); the
        # minimum of 1 is kept because packing rejects empty sequences.
        seen_pad = inputs.eq(pad).long().cumsum(dim=1)
        input_lengths = seen_pad.eq(0).sum(dim=1).clamp(min=1)

        out = self.embedding(inputs)
        # pack_padded_sequence expects the lengths on the CPU.
        out = th.nn.utils.rnn.pack_padded_sequence(out, input_lengths.cpu(), batch_first=True, enforce_sorted=False)
        out, _ = self.lstm(out)
        out, _ = th.nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        out = self.last_timestep(out, input_lengths)
        out = th.relu(self.linear(self.dropout(out)))
        out = self.linear2(out)

        return out

    def make_input(self, sentences):
        """Right-pad a list of token-id sequences into one batch.

        Args:
            sentences: non-empty list of sequences of integer token ids.

        Returns:
            Tuple ``(padded, lengths)`` of long tensors: ``padded`` has shape
            (batch, longest sequence) and is filled with the padding id,
            ``lengths`` holds the original length of every sequence.
        """
        X_lengths = [len(sentence) for sentence in sentences]
        padding_token = self.embedding.padding_idx

        # Integer array from the start instead of a float array cast later.
        padded_X = np.full((len(sentences), max(X_lengths)), padding_token, dtype=np.int64)

        # copy over the actual sequences
        for i, sequence in enumerate(sentences):
            padded_X[i, 0:len(sequence)] = sequence

        return th.from_numpy(padded_X), th.tensor(X_lengths, dtype=th.long)

In [None]:
class LSTM_classifier_1LSTM(nn.Module):
    """One shared bidirectional LSTM feeding six independent 2-output heads.

    Pipeline: embedding -> packed bidirectional LSTM -> LSTM output at the
    last non-padding timestep of every sequence -> six heads
    (``class0`` ... ``class5``), each dropout -> linear -> linear (2 outputs).

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        hidden_dim2: width of the intermediate layer of every head.
        pad_idx: id of the padding token. When omitted, the notebook-level
            global ``padding_idx`` is used, as in the original code.
    """

    N_HEADS = 6

    def __init__(self, vocab_size, embedding_dim, hidden_dim, hidden_dim2, pad_idx=None):
        super(LSTM_classifier_1LSTM, self).__init__()
        if pad_idx is None:
            # Fall back to the global defined by the dictionary-building cell.
            pad_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        # Heads keep their original attribute names (class0..class5) so the
        # parameter names in the state_dict are unchanged.
        # NOTE(review): the two Linear layers of a head are stacked without
        # an activation in between — confirm this is intended.
        for k in range(self.N_HEADS):
            head = nn.Sequential(nn.Dropout(0.2), nn.Linear(2*hidden_dim, hidden_dim2), nn.Linear(hidden_dim2, 2))
            setattr(self, "class%d" % k, head)

    def last_timestep(self, unpacked, lengths):
        """Select, for each sequence, the LSTM output at index ``length - 1``.

        Args:
            unpacked: tensor of shape (batch, time, features).
            lengths: 1-D integer tensor with one length per batch row.

        Returns:
            Tensor of shape (batch, features).
        """
        idx = (lengths - 1).view(-1, 1).expand(unpacked.size(0),
                                               unpacked.size(2)).unsqueeze(1)
        # squeeze(1) rather than squeeze(): a bare squeeze() would also drop
        # the batch dimension when the batch holds a single sequence.
        return unpacked.gather(1, idx.to(unpacked.device)).squeeze(1)

    def forward(self, inputs):
        """Run the shared encoder and every head on a padded batch.

        Args:
            inputs: integer tensor of shape (batch, seq_len), right-padded
                with the padding id.

        Returns:
            List of six tensors of shape (batch, 2), ordered class0..class5.
        """
        pad = self.embedding.padding_idx
        # Length of a row = number of tokens before its first padding id.
        # Rows without any padding use the full width (the previous
        # np.where(...)[0][0] lookup raised IndexError for them); the
        # minimum of 1 is kept because packing rejects empty sequences.
        seen_pad = inputs.eq(pad).long().cumsum(dim=1)
        input_lengths = seen_pad.eq(0).sum(dim=1).clamp(min=1)

        out = self.embedding(inputs)
        # pack_padded_sequence expects the lengths on the CPU.
        out = th.nn.utils.rnn.pack_padded_sequence(out, input_lengths.cpu(), batch_first=True, enforce_sorted=False)
        # The debugging print(out) calls that dumped whole tensors on every
        # forward pass have been removed.
        out, _ = self.lstm(out)
        out, _ = th.nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        out = self.last_timestep(out, input_lengths)

        mult_out = [getattr(self, "class%d" % k)(out) for k in range(self.N_HEADS)]

        return mult_out

    def make_input(self, sentences):
        """Right-pad a list of token-id sequences into one batch.

        Args:
            sentences: non-empty list of sequences of integer token ids.

        Returns:
            Tuple ``(padded, lengths)`` of long tensors: ``padded`` has shape
            (batch, longest sequence) and is filled with the padding id,
            ``lengths`` holds the original length of every sequence.
        """
        X_lengths = [len(sentence) for sentence in sentences]
        padding_token = self.embedding.padding_idx

        # Integer array from the start instead of a float array cast later.
        padded_X = np.full((len(sentences), max(X_lengths)), padding_token, dtype=np.int64)

        # copy over the actual sequences
        for i, sequence in enumerate(sentences):
            padded_X[i, 0:len(sequence)] = sequence

        return th.from_numpy(padded_X), th.tensor(X_lengths, dtype=th.long)

## Running our computations
We load the data, build the dataset and the model, and then train the model.

In [None]:
# This function loads the data and returns the values for the label we specify
# -1 means all the labels are returned
def load_data(label=-1):
    """Read the zipped CSV files and return (train, dev, test) datasets.

    Each dataset is a tuple ``(tokenised_comments, labels)``. Column 1 of the
    train/test files is cleaned with ``clean_str`` and split on spaces; the
    remaining columns are converted to floats and used as labels. Test rows
    whose first label equals -1 are discarded.

    Args:
        label: index of the single label column to keep (each label is then
            wrapped in a one-element list); -1 keeps every label column.

    Returns:
        ``(train_dataset, dev_dataset, test_dataset)`` built from rows
        [0, 100000) and [152000, 153000) of the train file and the first
        2000 retained test rows.
    """
    def read_rows(path):
        return pd.read_csv(path, compression='zip').values.tolist()

    def tokenise(rows):
        return [clean_str(row[1]).split(" ") for row in rows]

    def as_floats(rows, first_col):
        return [[float(value) for value in row[first_col:]] for row in rows]

    # Train split
    train_rows = read_rows('data/train.csv.zip')
    train_data = tokenise(train_rows)
    train_labels = as_floats(train_rows, 2)

    # Test split: keep only the positions whose first label is not -1
    test_labels = as_floats(read_rows('data/test_labels.csv.zip'), 1)
    kept = [pos for pos, row in enumerate(test_labels) if row[0] != -1]
    test_labels = [test_labels[pos] for pos in kept]
    test_data = tokenise(read_rows('data/test.csv.zip'))
    test_data = [test_data[pos] for pos in kept]

    # Either pass the label rows through untouched or keep a single column
    if label == -1:
        def select(rows):
            return rows
    else:
        def select(rows):
            return [[row[label]] for row in rows]

    train_dataset = (train_data[:100000], select(train_labels[:100000]))
    dev_dataset = (train_data[152000:153000], select(train_labels[152000:153000]))
    test_dataset = (test_data[:2000], select(test_labels[:2000]))
    return train_dataset, dev_dataset, test_dataset

train_dataset, dev_dataset, test_dataset = load_data()
# For some losses, we can weight differently the classes
# This gives us the weight we should give to the positive class for each label
# (number of negative examples divided by number of positive examples).
# The number of training examples is taken from the data instead of being
# hard-coded, so the ratio stays correct if the size of the train split changes.
n_train = len(train_dataset[1])
for j in range(len(train_dataset[1][0])):
    weight = sum(i[j] for i in train_dataset[1])
    # A label with no positive example has no finite weight.
    ratio = (n_train - weight) / weight if weight else float("inf")
    print("Weight of label", j, "is", ratio)

### Creating our dictionary

In [None]:
from keras.preprocessing import text

# If tokenize is true, we use a tokenizer to normalize the words
# Otherwise, we leave them as they are
def build_dictionary(train_data, tokenize=False):
    """Map every word of the training comments to an integer id.

    Args:
        train_data: iterable of comments, each a list of word strings.
        tokenize: when True, the ids come from a Keras ``Tokenizer`` fitted on
            ``train_data``; when False (default), words receive consecutive
            ids in order of first appearance, starting at 0.

    Returns:
        ``(identifiers, padding_idx)``: the word -> id dictionary, which also
        contains the ``"<PAD>"`` entry, and the id of that padding token. In
        both modes the ids cover ``0 .. len(identifiers) - 1``.
    """
    if tokenize:
        tokenizer = text.Tokenizer()
        tokenizer.fit_on_texts(list(train_data))
        identifiers = dict(tokenizer.word_index)
        # Keras numbers words from 1 and never assigns 0, so 0 is free for
        # padding. The previous choice, len(word_index), is the id of the
        # last real word and therefore collided with it.
        padding_idx = 0
    else:
        identifiers = {}
        for comment in train_data:
            for word in comment:
                if word not in identifiers:
                    identifiers[word] = len(identifiers)
        # Ids 0..n-1 are taken by the words; padding gets the next free id.
        padding_idx = len(identifiers)
    identifiers["<PAD>"] = padding_idx
    return identifiers, padding_idx
        
# `identifiers` and `padding_idx` are notebook-level globals: `transform_data`
# and the model classes defined above look them up by name when they are
# called, so this cell has to run before any of those are used.
identifiers, padding_idx = build_dictionary(train_dataset[0], tokenize=False)

### Transforming our dataset
We replace the words by their identifiers if they exist and add padding so that all the sentences in a given dataset have the same size

In [None]:
# This function replaces each word by its index in the dictionary if it exists
# If not, the word is deleted
# Then, we pad each example so that they all have the same size of at least 1
def transform_data(dataset):
    """Encode the comments of ``dataset`` as padded lists of word ids.

    Words that are not keys of the global ``identifiers`` are dropped. Every
    encoded comment is then right-padded with the ``"<PAD>"`` id to one more
    than the longest encoded comment, so each row ends with at least one
    padding id.

    Args:
        dataset: tuple ``(comments, labels)``, comments being lists of words.

    Returns:
        ``(padded_id_lists, labels)``; the labels are passed through as is.
    """
    encoded = []
    for comment in dataset[0]:
        encoded.append([identifiers[word] for word in comment if word in identifiers])

    # +1 guarantees a trailing padding id even on the longest row
    width = max(map(len, encoded)) + 1
    pad_id = identifiers["<PAD>"]
    padded = [ids + [pad_id] * (width - len(ids)) for ids in encoded]
    return (padded, dataset[1])

# NOTE: this cell overwrites its own inputs, so it is not idempotent. Running
# it a second time would pass the already-encoded rows back through
# `transform_data`, which drops every item that is not a key of `identifiers`.
# Re-run the data-loading cell first if this cell has to be executed again.
train_dataset = transform_data(train_dataset)
dev_dataset = transform_data(dev_dataset)
test_dataset = transform_data(test_dataset)

### Building and running the model
We now build the dataset and model classes from our framework

In [None]:
# Wrap the three encoded splits in the framework's Dataset, then drop the
# notebook-level references to the original tuples to save memory.
dataset = fk.Dataset(
    name="toxic",
    train_data=train_dataset[0],
    train_labels=train_dataset[1],
    dev_data=dev_dataset[0],
    dev_labels=dev_dataset[1],
    test_data=test_dataset[0],
    test_labels=test_dataset[1],
    data_type='long',
    label_type='long',
    batch_size=128,
)
del train_dataset, dev_dataset, test_dataset

In [None]:
# model : LSTM_classifier / loss : L.Loss_1LSTM_1Linear6 / metric : M.BAC_6_classes_1_out
# model : LSTM_classifier_1LSTM / loss : L.Loss_1LSTM_6Linear / metric : M.BAC_6_classes_2_out 

In [None]:
# Build the network, its optimizer, loss and metric, hand them to the
# framework's Model wrapper, then force a garbage-collection pass.
model = LSTM_classifier(
    vocab_size=len(identifiers),
    embedding_dim=20,
    hidden_dim=20,
    hidden_dim2=20,
)
optimizer = optim.Adagrad(model.parameters(), lr=0.1, lr_decay=0.01)
loss = L.Loss_1LSTM_1Linear6()
metric = M.BAC_6_classes_1_out()
encaps = fk.Model(
    name="one_model",
    model=model,
    loss=loss,
    optimizer=optimizer,
    metric=metric,
    dataset=dataset,
)
gc.collect()

In [None]:
# Train the wrapped model for 3 epochs. What `verbose=2` reports is defined by
# `framework.Model.train` — TODO confirm there.
encaps.train(epochs=3, verbose=2)

In [None]:
# Score the model after `restore_best()`. Presumably this reloads the best
# state recorded during training — confirm in `framework.Model`.
encaps.restore_best()
encaps.score()

In [None]:
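# Scratch notes kept from earlier runs (not executable code).
# Presumably "<hyper-parameter value(s)> <score>" — TODO confirm which
# settings and which metric these numbers refer to.
# 40 0.608
# 30 0.636
# 20 0.636
#
# 10 20 0.629
# 30 20 0.624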