In [1]:
import re
import numpy as np
import torch as th
import torch.autograd as ag
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import pandas as pd

import framework as fk
import metrics as M
import losses as L

In [2]:
# Tokenize a sentence
def clean_str(string, tolower=True):
    """
    Tokenization/string cleaning.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Every character outside ``A-Za-z0-9(),!?'`` and the backtick is replaced by
    a space, English contractions are detached from their stem, and the
    punctuation marks ``, ! ( ) ?`` are surrounded by spaces so that a later
    ``split(" ")`` yields them as separate tokens.

    Unlike the original snippet, parentheses and question marks are emitted
    as-is: the old replacement strings (" \\( ", " \\) ", " \\? ") inserted a
    literal backslash in front of them, producing tokens such as ``\\(``.

    Args:
        string: Raw text to clean.
        tolower: When true (default), the result is lower-cased.

    Returns:
        The cleaned text with single spaces between tokens and no leading or
        trailing whitespace.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    # Detach contractions from their stem; the order matches the original code.
    for suffix in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(suffix, " " + suffix)
    # Isolate punctuation; the back-reference re-inserts the matched character
    # itself, so no escaping artefacts end up in the output.
    string = re.sub(r"([,!()?])", r" \1 ", string)
    string = re.sub(r"\s{2,}", " ", string)
    if tolower:
        string = string.lower()
    return string.strip()

In [3]:
# Index, within the label columns, of the single class to train on. It is passed
# to load_data() as `label`; -1 would keep every label column instead.
LABEL_OF_THE_CLASS = 1

def load_data(label=-1):
    """Load the zipped CSV files under ``data/`` and build the three splits.

    The second column of the train/test files is cleaned with ``clean_str`` and
    split on single spaces. Train labels are the columns from the third one on;
    test labels are the columns from the second one on of ``test_labels``.
    Test rows whose first label equals -1 are dropped from both the test texts
    and the test labels.

    Args:
        label: Index of the single label column to keep (each target becomes a
            one-element list). With -1 (default) all label columns are kept.

    Returns:
        ``(train_dataset, dev_dataset, test_dataset)``, each a
        ``(token_lists, label_lists)`` tuple. Train is rows ``[:100000]``, dev
        is train rows ``[152000:153000]`` and test is the first 2000 kept rows.
    """
    def read_rows(path):
        return pd.read_csv(path, compression='zip').values.tolist()

    def tokenize(rows):
        return [clean_str(row[1]).split(" ") for row in rows]

    train_rows = read_rows('data/train.csv.zip')
    train_data = tokenize(train_rows)
    train_labels = [[float(value) for value in row[2:]] for row in train_rows]

    test_labels = [[float(value) for value in row[1:]]
                   for row in read_rows('data/test_labels.csv.zip')]
    # Only rows whose first label is not -1 are usable for evaluation.
    kept = [pos for pos, row in enumerate(test_labels) if row[0] != -1]
    test_labels = [test_labels[pos] for pos in kept]
    test_tokens = tokenize(read_rows('data/test.csv.zip'))
    test_data = [test_tokens[pos] for pos in kept]

    if label == -1:
        select = lambda rows: rows
    else:
        select = lambda rows: [[row[label]] for row in rows]

    train_dataset = (train_data[:100000], select(train_labels[:100000]))
    dev_dataset = (train_data[152000:153000], select(train_labels[152000:153000]))
    test_dataset = (test_data[:2000], select(test_labels[:2000]))
    return train_dataset, dev_dataset, test_dataset

train_dataset, dev_dataset, test_dataset = load_data(label=LABEL_OF_THE_CLASS)

# Print the negative/positive ratio of every kept label; it is the natural
# weight for the positive class in a weighted loss. The sample count is taken
# from the data itself so the ratio stays correct if the train slice is shorter
# than expected.
n_train = len(train_dataset[1])
for j in range(len(train_dataset[1][0])):
    weight = sum(i[j] for i in train_dataset[1])
    # A label without any positive example has no finite ratio.
    ratio = (n_train - weight) / weight if weight else float("inf")
    print("Weight of label",j,"is",ratio)

Weight of label 0 is 99.10010010010011


## Creating our model

In [4]:
class LSTM_classifier(nn.Module):
    """Bidirectional-LSTM sentence classifier that emits 6 scores per sentence.

    Pipeline: embedding -> bidirectional LSTM -> output at the last valid
    timestep -> dropout -> linear -> ReLU -> linear (6 outputs).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, hidden_dim2):
        """
        Args:
            vocab_size: Number of rows of the embedding table.
            embedding_dim: Size of each embedding vector.
            hidden_dim: LSTM hidden size per direction.
            hidden_dim2: Width of the intermediate linear layer.
        """
        super(LSTM_classifier, self).__init__()
        # NOTE: `padding_idx` is a notebook-level global, not a parameter; it
        # has to be defined before this class is instantiated.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.linear = nn.Linear(2*hidden_dim, hidden_dim2)
        self.linear2 = nn.Linear(hidden_dim2, 6)
        # Not used by forward(); kept so the module's parameters are unchanged.
        self.bn1 = nn.BatchNorm1d(num_features=hidden_dim2)
        self.dropout = nn.Dropout(0.2)

    def last_timestep(self, unpacked, lengths):
        """Pick, for every sequence, the LSTM output at index ``length - 1``.

        Args:
            unpacked: Tensor indexed as (batch, time, features).
            lengths: 1-D integer tensor with one length per batch row.

        Returns:
            Tensor of shape (batch, features). The batch dimension is kept even
            when the batch holds a single sequence.
        """
        # Index of the last output for each sequence.
        idx = (lengths.to(unpacked.device) - 1).view(-1, 1).expand(
            unpacked.size(0), unpacked.size(2)).unsqueeze(1)
        # squeeze(1) removes only the gathered time axis; a bare squeeze()
        # would also drop a batch dimension of size 1.
        return unpacked.gather(1, idx).squeeze(1)

    def forward(self, inputs, input_lengths):
        """Compute the 6 output scores for a padded batch.

        Args:
            inputs: Integer tensor of token ids, (batch, seq_len).
            input_lengths: 1-D integer tensor with the true length of each row.

        Returns:
            Tensor of shape (batch, 6).
        """
        out = self.embedding(inputs)
        # pack_padded_sequence needs the lengths on the CPU.
        out = th.nn.utils.rnn.pack_padded_sequence(
            out, input_lengths.cpu(), batch_first=True, enforce_sorted=False)
        out, _ = self.lstm(out)
        out, _ = th.nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        out = self.last_timestep(out, input_lengths)
        out = th.relu(self.linear(self.dropout(out)))
        out = self.linear2(out)

        return out

    def make_input(self, sentences):
        """Pad a list of token-id sequences into one batch.

        Padding uses the embedding layer's own ``padding_idx`` so that padded
        positions always match what the embedding treats as padding.

        Args:
            sentences: Non-empty list of sequences of integer token ids.

        Returns:
            ``(padded, lengths)``: a long tensor of shape
            (batch, longest sentence) and a long tensor with the original
            length of every sentence.
        """
        lengths = [len(sentence) for sentence in sentences]
        padded_X = np.full((len(sentences), max(lengths)),
                           self.embedding.padding_idx, dtype=np.int64)

        # copy over the actual sequences
        for row, sequence in enumerate(sentences):
            padded_X[row, 0:len(sequence)] = sequence

        return th.from_numpy(padded_X).long(), th.tensor(lengths, dtype=th.long)

In [5]:
class LSTM_classifier_single(nn.Module):
    """Bidirectional-LSTM classifier for one label, emitting 2 class scores.

    The embedding table is initialised from a pretrained matrix. Sequence
    lengths are inferred inside ``forward`` from the position of the first
    padding id in every row.
    """

    def __init__(self, hidden_dim, hidden_dim2, emb_weights, pad_id):
        """
        Args:
            hidden_dim: LSTM hidden size per direction.
            hidden_dim2: Width of the intermediate linear layer.
            emb_weights: 2-D array (vocab_size, embedding_dim) of initial
                embedding values.
            pad_id: Token id used for padding.
        """
        super(LSTM_classifier_single, self).__init__()
        self.padding_id = pad_id
        vocab_size = emb_weights.shape[0]
        embedding_dim = emb_weights.shape[1]
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_id)
        self.embedding.weight = nn.Parameter(th.tensor(emb_weights, dtype=th.float32))
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.linear = nn.Linear(2*hidden_dim, hidden_dim2)
        self.linear2 = nn.Linear(hidden_dim2, 2)
        # Not used by forward(); kept so the module's parameters are unchanged.
        self.bn1 = nn.BatchNorm1d(num_features=hidden_dim2)
        self.dropout = nn.Dropout(0.2)

    def last_timestep(self, unpacked, lengths):
        """Pick, for every sequence, the LSTM output at index ``length - 1``.

        Args:
            unpacked: Tensor indexed as (batch, time, features).
            lengths: 1-D integer tensor with one length per batch row.

        Returns:
            Tensor of shape (batch, features). The batch dimension is kept even
            when the batch holds a single sequence.
        """
        # Index of the last output for each sequence.
        idx = (lengths.to(unpacked.device) - 1).view(-1, 1).expand(
            unpacked.size(0), unpacked.size(2)).unsqueeze(1)
        # squeeze(1) removes only the gathered time axis; a bare squeeze()
        # would also drop a batch dimension of size 1.
        return unpacked.gather(1, idx).squeeze(1)

    def forward(self, inputs):
        """Compute the 2 class scores for a padded batch of token ids.

        The length of a row is the index of its first padding id, with a
        minimum of 1. A row without any padding id is treated as full length.

        Args:
            inputs: Integer tensor of token ids, (batch, seq_len).

        Returns:
            Tensor of shape (batch, 2).
        """
        # Count the leading non-padding tokens of every row: the running sum of
        # the padding mask stays 0 until the first padding id is met.
        is_pad = inputs.eq(self.padding_id)
        input_lengths = is_pad.long().cumsum(dim=1).eq(0).long().sum(dim=1).clamp(min=1)
        out = self.embedding(inputs)
        # pack_padded_sequence needs the lengths on the CPU.
        out = th.nn.utils.rnn.pack_padded_sequence(
            out, input_lengths.cpu(), batch_first=True, enforce_sorted=False)
        out, _ = self.lstm(out)
        out, _ = th.nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        out = self.last_timestep(out, input_lengths)
        out = th.relu(self.linear(self.dropout(out)))
        out = self.linear2(out)

        return out

    def make_input(self, sentences):
        """Pad a list of token-id sequences into one batch.

        Padding uses ``self.padding_id``, the same id ``forward`` looks for.

        Args:
            sentences: Non-empty list of sequences of integer token ids.

        Returns:
            ``(padded, lengths)``: a long tensor of shape
            (batch, longest sentence) and a long tensor with the original
            length of every sentence.
        """
        lengths = [len(sentence) for sentence in sentences]
        padded_X = np.full((len(sentences), max(lengths)), self.padding_id, dtype=np.int64)

        # copy over the actual sequences
        for row, sequence in enumerate(sentences):
            padded_X[row, 0:len(sequence)] = sequence

        return th.from_numpy(padded_X).long(), th.tensor(lengths, dtype=th.long)

In [6]:
class LSTM_classifier_1LSTM(nn.Module):
    """One shared bidirectional LSTM feeding six independent output heads.

    Each head is dropout -> linear -> linear and yields a single score, so
    ``forward`` returns a list of six tensors.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, hidden_dim2):
        """
        Args:
            vocab_size: Number of rows of the embedding table.
            embedding_dim: Size of each embedding vector.
            hidden_dim: LSTM hidden size per direction.
            hidden_dim2: Width of the intermediate linear layer of every head.
        """
        super(LSTM_classifier_1LSTM, self).__init__()

        def make_head():
            return nn.Sequential(nn.Dropout(0.2), nn.Linear(2*hidden_dim, hidden_dim2), nn.Linear(hidden_dim2, 1))

        # NOTE: `padding_idx` is a notebook-level global, not a parameter; it
        # has to be defined before this class is instantiated.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        # Attribute names class0..class5 are kept so parameter names do not change.
        self.class0 = make_head()
        self.class1 = make_head()
        self.class2 = make_head()
        self.class3 = make_head()
        self.class4 = make_head()
        self.class5 = make_head()

    def last_timestep(self, unpacked, lengths):
        """Pick, for every sequence, the LSTM output at index ``length - 1``.

        Args:
            unpacked: Tensor indexed as (batch, time, features).
            lengths: 1-D integer tensor with one length per batch row.

        Returns:
            Tensor of shape (batch, features). The batch dimension is kept even
            when the batch holds a single sequence.
        """
        # Index of the last output for each sequence.
        idx = (lengths.to(unpacked.device) - 1).view(-1, 1).expand(
            unpacked.size(0), unpacked.size(2)).unsqueeze(1)
        # squeeze(1) removes only the gathered time axis; a bare squeeze()
        # would also drop a batch dimension of size 1.
        return unpacked.gather(1, idx).squeeze(1)

    def forward(self, inputs, input_lengths):
        """Run the shared encoder and every head.

        Args:
            inputs: Integer tensor of token ids, (batch, seq_len).
            input_lengths: 1-D integer tensor with the true length of each row.

        Returns:
            List of six tensors, one per head, each of shape (batch, 1).
        """
        out = self.embedding(inputs)
        # pack_padded_sequence needs the lengths on the CPU.
        out = th.nn.utils.rnn.pack_padded_sequence(
            out, input_lengths.cpu(), batch_first=True, enforce_sorted=False)
        out, _ = self.lstm(out)
        out, _ = th.nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        out = self.last_timestep(out, input_lengths)

        heads = [self.class0, self.class1, self.class2, self.class3, self.class4, self.class5]
        mult_out = [head(out) for head in heads]

        return mult_out

    def make_input(self, sentences):
        """Pad a list of token-id sequences into one batch.

        Padding uses the embedding layer's own ``padding_idx`` so that padded
        positions always match what the embedding treats as padding.

        Args:
            sentences: Non-empty list of sequences of integer token ids.

        Returns:
            ``(padded, lengths)``: a long tensor of shape
            (batch, longest sentence) and a long tensor with the original
            length of every sentence.
        """
        lengths = [len(sentence) for sentence in sentences]
        padded_X = np.full((len(sentences), max(lengths)),
                           self.embedding.padding_idx, dtype=np.int64)

        # copy over the actual sequences
        for row, sequence in enumerate(sentences):
            padded_X[row, 0:len(sequence)] = sequence

        return th.from_numpy(padded_X).long(), th.tensor(lengths, dtype=th.long)

## Evaluating

In [8]:
class Metric(M.Metric):
    """Accuracy averaged over (label, class) buckets.

    For label ``k`` the model output is read as two scores: position ``2*k``
    for class 0 and position ``2*k+1`` for class 1. Hits and counts are kept
    per bucket ``2*k + true_class`` (12 buckets, i.e. room for 6 binary
    labels), and the score is the mean of the per-bucket hit rates over the
    buckets that received at least one sample.
    """

    def reset(self):
        """Clear all per-bucket counters."""
        self.totals = [0] * 12
        self.corrects = [0] * 12

    def step(self, in_data, out_data, labels):
        """Accumulate the counters for one batch.

        Args:
            in_data: Unused.
            out_data: Per-sample score rows, two scores per label.
            labels: Per-sample rows of 0/1 targets, one entry per label.
        """
        for scores, truth in zip(out_data, labels):
            for k in range(len(truth)):
                bucket = 2*k + int(truth[k])
                self.totals[bucket] += 1
                # Correct when "class 1 scores higher" agrees with the target.
                if (scores[2*k+1] > scores[2*k]) == (truth[k]==1):
                    self.corrects[bucket] += 1

    def score(self):
        """Return the mean hit rate over non-empty buckets (0.0 if none)."""
        rates = [hit / seen for hit, seen in zip(self.corrects, self.totals) if seen > 0]
        # Without this guard a call before any step() would divide by zero.
        if not rates:
            return 0.0
        return sum(rates) / len(rates)
    
class Loss:
    """Class-weighted cross-entropy for the two-class, single-label setup.

    Class 1 is weighted 99 times more than class 0.
    """

    def __init__(self):
        self.l = nn.CrossEntropyLoss(weight=th.Tensor([1.,99.]))

    def compute(self, in_data, out_data, labels):
        """Return the weighted cross-entropy of a batch.

        Args:
            in_data: Unused.
            out_data: Class scores, (batch, 2); a single unbatched row of
                shape (2,) is accepted as a batch of one.
            labels: Integer class targets, one per sample, e.g. (batch, 1).

        Returns:
            Scalar loss tensor.
        """
        # Reshape explicitly instead of labels.squeeze(): squeeze() would turn
        # a batch of one into a 0-d target that no longer matches the logits.
        logits = out_data.reshape(-1, out_data.shape[-1])
        targets = labels.reshape(-1)
        return self.l(logits, targets)

## Running our computations

### Creating our dictionary

In [9]:
from keras.preprocessing import text

# Build the vocabulary from the (already tokenized) training sentences.
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(list(train_dataset[0]))

# Padding
# Keras numbers the real words 1..N (index 0 is reserved), so N already belongs
# to a word. The padding token therefore takes N + 1; using N would make that
# word indistinguishable from padding.
padding_idx = len(tokenizer.word_index) + 1
tokenizer.word_index["<PAD>"] = padding_idx

Using TensorFlow backend.


In [10]:
def transform_data(dataset):
    """Turn tokenized sentences into equally long lists of vocabulary ids.

    Tokens missing from ``tokenizer.word_index`` are dropped. Every sentence is
    then right-padded with the ``"<PAD>"`` id to the longest encoded sentence
    plus one, so each row ends with at least one padding id.

    Args:
        dataset: ``(sentences, labels)`` tuple; sentences are lists of tokens.

    Returns:
        ``(padded_ids, labels)`` with the labels passed through untouched.
    """
    vocab = tokenizer.word_index
    encoded = [[vocab[token] for token in sentence if token in vocab]
               for sentence in dataset[0]]
    width = max(map(len, encoded)) + 1
    pad_id = vocab["<PAD>"]
    padded = [ids + [pad_id] * (width - len(ids)) for ids in encoded]
    return (padded, dataset[1])

# Encode and pad the three splits; each split is padded to its own width.
train_dataset, dev_dataset, test_dataset = (
    transform_data(split) for split in (train_dataset, dev_dataset, test_dataset)
)

In [11]:
# Load pretrained embedding
def get_coefs(word, *arr):
    """Pair a word with its coefficients converted to a float32 array.

    Args:
        word: The token, returned unchanged.
        *arr: The coefficient values (numbers or numeric strings).

    Returns:
        ``(word, vector)`` where ``vector`` is a 1-D float32 numpy array.
    """
    vector = np.asarray(arr, dtype='float32')
    return (word, vector)

def load_embeddings(path):
    """Read a text embedding file into a ``{word: float32 vector}`` dict.

    Each line is expected to hold a word followed by its coefficients,
    separated by single spaces.

    Args:
        path: Path of the embedding file.

    Returns:
        Dict mapping every word to its numpy vector; for a word that occurs on
        several lines the last line wins.
    """
    # Read as UTF-8 explicitly: the default encoding depends on the platform
    # locale and can fail on, or garble, non-ASCII words.
    with open(path, encoding='utf-8') as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in f)

def build_matrix(word_index, path, emb_size):
    """Assemble the embedding matrix for a vocabulary.

    Args:
        word_index: Mapping from word to integer row index.
        path: Embedding file handed to ``load_embeddings``.
        emb_size: Number of columns of the matrix.

    Returns:
        Array of shape ``(len(word_index) + 1, emb_size)``. Row ``i`` holds the
        vector of the word indexed ``i``; rows of words that are absent from
        the embedding file stay all-zero.
    """
    vectors = load_embeddings(path)
    matrix = np.zeros((len(word_index) + 1, emb_size))

    for token, row in word_index.items():
        if token in vectors:
            matrix[row] = vectors[token]
    return matrix

# Initial embedding table for the tokenizer's vocabulary, 300 columns per word,
# read from the local embedding file './fte_300_100'.
emb_weights = build_matrix(tokenizer.word_index, './fte_300_100', 300)

In [12]:
# Hand the three encoded splits to the framework, then drop the notebook-level
# references to them.
dataset = fk.Dataset(
    name="toxic",
    train_data=train_dataset[0], train_labels=train_dataset[1],
    dev_data=dev_dataset[0], dev_labels=dev_dataset[1],
    test_data=test_dataset[0], test_labels=test_dataset[1],
    data_type='long', label_type='long', batch_size=128,
)
del train_dataset, dev_dataset, test_dataset

In [13]:
# Single-label classifier with pretrained embeddings, weighted loss and the
# bucketed accuracy metric, all bundled in the framework wrapper.
model = LSTM_classifier_single(hidden_dim=20, hidden_dim2=20, emb_weights=emb_weights, pad_id=padding_idx)
loss = Loss()
optimizer = optim.Adagrad(model.parameters(), lr=0.1, lr_decay=0.01)
metric = Metric()

encaps = fk.Model(
    name="lstm",
    model=model,
    loss=loss,
    optimizer=optimizer,
    metric=metric,
    dataset=dataset,
)

In [14]:
# Force a garbage-collection pass; presumably meant to reclaim memory after the
# raw split variables were deleted above.
import gc
gc.collect()

40

In [None]:
# Point the framework at the CPU, then train for one epoch.
# NOTE(review): verbose=2 appears to log the running average loss after every
# batch (see the output below) - confirm against the framework.
fk.device = th.device("cpu")
encaps.train(epochs=1, verbose=2)

Batch 1/782 done - Avg. loss = tensor(4.1559e-05, grad_fn=<DivBackward0>)
Batch 2/782 done - Avg. loss = tensor(6.9795e-05, grad_fn=<DivBackward0>)
Batch 3/782 done - Avg. loss = tensor(6.3574e-05, grad_fn=<DivBackward0>)
Batch 4/782 done - Avg. loss = tensor(6.2298e-05, grad_fn=<DivBackward0>)
Batch 5/782 done - Avg. loss = tensor(5.9775e-05, grad_fn=<DivBackward0>)
Batch 6/782 done - Avg. loss = tensor(5.6810e-05, grad_fn=<DivBackward0>)
Batch 7/782 done - Avg. loss = tensor(5.3790e-05, grad_fn=<DivBackward0>)
Batch 8/782 done - Avg. loss = tensor(5.1164e-05, grad_fn=<DivBackward0>)
Batch 9/782 done - Avg. loss = tensor(4.9047e-05, grad_fn=<DivBackward0>)
Batch 10/782 done - Avg. loss = tensor(5.8879e-05, grad_fn=<DivBackward0>)
Batch 11/782 done - Avg. loss = tensor(5.7022e-05, grad_fn=<DivBackward0>)
Batch 12/782 done - Avg. loss = tensor(5.5005e-05, grad_fn=<DivBackward0>)
Batch 13/782 done - Avg. loss = tensor(5.3416e-05, grad_fn=<DivBackward0>)
Batch 14/782 done - Avg. loss = te

Batch 110/782 done - Avg. loss = tensor(2.6326e-05, grad_fn=<DivBackward0>)
Batch 111/782 done - Avg. loss = tensor(2.6125e-05, grad_fn=<DivBackward0>)
Batch 112/782 done - Avg. loss = tensor(2.6034e-05, grad_fn=<DivBackward0>)
Batch 113/782 done - Avg. loss = tensor(2.6371e-05, grad_fn=<DivBackward0>)
Batch 114/782 done - Avg. loss = tensor(2.6820e-05, grad_fn=<DivBackward0>)
Batch 115/782 done - Avg. loss = tensor(2.6692e-05, grad_fn=<DivBackward0>)
Batch 116/782 done - Avg. loss = tensor(2.6537e-05, grad_fn=<DivBackward0>)
Batch 117/782 done - Avg. loss = tensor(2.6369e-05, grad_fn=<DivBackward0>)
Batch 118/782 done - Avg. loss = tensor(2.6200e-05, grad_fn=<DivBackward0>)
Batch 119/782 done - Avg. loss = tensor(2.6042e-05, grad_fn=<DivBackward0>)
Batch 120/782 done - Avg. loss = tensor(2.6198e-05, grad_fn=<DivBackward0>)
Batch 121/782 done - Avg. loss = tensor(2.6065e-05, grad_fn=<DivBackward0>)
Batch 122/782 done - Avg. loss = tensor(2.5940e-05, grad_fn=<DivBackward0>)
Batch 123/78

Batch 217/782 done - Avg. loss = tensor(2.2280e-05, grad_fn=<DivBackward0>)
Batch 218/782 done - Avg. loss = tensor(2.2244e-05, grad_fn=<DivBackward0>)
Batch 219/782 done - Avg. loss = tensor(2.2195e-05, grad_fn=<DivBackward0>)
Batch 220/782 done - Avg. loss = tensor(2.2158e-05, grad_fn=<DivBackward0>)
Batch 221/782 done - Avg. loss = tensor(2.2078e-05, grad_fn=<DivBackward0>)
Batch 222/782 done - Avg. loss = tensor(2.2017e-05, grad_fn=<DivBackward0>)
Batch 223/782 done - Avg. loss = tensor(2.1973e-05, grad_fn=<DivBackward0>)
Batch 224/782 done - Avg. loss = tensor(2.1943e-05, grad_fn=<DivBackward0>)
Batch 225/782 done - Avg. loss = tensor(2.1882e-05, grad_fn=<DivBackward0>)
Batch 226/782 done - Avg. loss = tensor(2.1828e-05, grad_fn=<DivBackward0>)
Batch 227/782 done - Avg. loss = tensor(2.1886e-05, grad_fn=<DivBackward0>)
Batch 228/782 done - Avg. loss = tensor(2.1834e-05, grad_fn=<DivBackward0>)
Batch 229/782 done - Avg. loss = tensor(2.1823e-05, grad_fn=<DivBackward0>)
Batch 230/78

In [16]:
# Report the metric score of the trained model.
# NOTE(review): which split the framework scores on is not visible here.
encaps.score()

Metric score is : 0.918404504898178


0.918404504898178

In [None]:
# Metric score obtained in the run above, kept for reference.
0.918