In [1]:
import os
import gc
import emoji
import re
import unicodedata
import torch
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
from typing import List
from keras.preprocessing import text as prep_text
from keras.utils.data_utils import pad_sequences
from scipy.stats import rankdata
from transformers import AutoTokenizer, AutoModelForSequenceClassification

This notebook was adapted from:

https://www.kaggle.com/code/chechir/bert-lstm-rank-blender/notebook

In [2]:
SEED = 14
DATA_PATH = '/'.join(os.getcwd().split("/")[:-1]) + '/data/jigsaw_unintended_bias/'
WORD_EMBEDDINGS = {
    'fasttext': '../word_vectors/crawl-300d-2M.vec',
    'glove': '../word_vectors/glove.840B.300d.txt'
}
TARGET_COLUMN = 'target'
IDENTITY_COLUMNS = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness'
]
NUM_MODELS = 1
EMBED_DIM = 300
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
MAX_LEN = 220  # max word embeddings per document
VOCAB_SIZE = 100000  # total distinct words or features - this limits the words to be embedded
EPOCHS = 1
BATCH_SIZE = 512

In [3]:
class MegaTextCleaner:
    def __init__(self, text: List[str]):
        self.text = text

    def translate_unicode_symbols(self, to: str = "utf-8"):
        self.text = [unicodedata.normalize('NFKD', t).encode(to, 'ignore').decode() for t in self.text]

    def remove_emojis(self):
        self.text = [emoji.replace_emoji(t, replace=' ') for t in self.text]
    
    def convert_to_lowercase(self):
        self.text = [t.lower() for t in self.text]

    def expand_contractions(self):
        contractions = {
            "ain't": "is not", 
            "aren't": "are not", 
            "can't": "cannot", 
            "'cause": "because", 
            "could've": "could have", 
            "couldn't": "could not", 
            "didn't": "did not", 
            "doesn't": "does not", 
            "don't": "do not", 
            "hadn't": "had not", 
            "hasn't": "has not", 
            "haven't": "have not",
            "he'd": "he would", 
            "he'd've": "he would have",
            "he'll": "he will", 
            "he'll've": "he will have",
            "he's": "he is", 
            "how'd": "how did", 
            "how'd'y": "how do you", 
            "how'll": "how will", 
            "how's": "how is", 
            "i'd": "i would", 
            "i'd've": "i would have", 
            "i'll": "i will", 
            "i'll've": "i will have", 
            "i'm": "i am", 
            "i've": "i have", 
            "isn't": "is not", 
            "it'd": "it would", 
            "it'd've": "it would have", 
            "it'll": "it will", 
            "it'll've": "it will have", 
            "it's": "it is", 
            "let's": "let us", 
            "ma'am": "madam", 
            "mayn't": "may not", 
            "might've": "might have", 
            "mightn't": "might not", 
            "mightn't've": "might not have", 
            "must've": "must have", 
            "mustn't": "must not", 
            "mustn't've": "must not have", 
            "needn't": "need not", 
            "needn't've": "need not have", 
            "o'clock": "of the clock", 
            "oughtn't": "ought not", 
            "oughtn't've": "ought not have", 
            "shan't": "shall not", 
            "sha'n't": "shall not", 
            "shan't've": "shall not have", 
            "she'd": "she would", 
            "she'd've": "she would have", 
            "she'll": "she will", 
            "she'll've": "she will have", 
            "she's": "she is", 
            "should've": "should have", 
            "shouldn't": "should not", 
            "shouldn't've": "should not have", 
            "so've": "so have", 
            "so's": "so as", 
            "this's": "this is", 
            "that'd": "that would", 
            "that'd've": "that would have", 
            "that's": "that is", 
            "there'd": "there would", 
            "there'd've": "there would have", 
            "there's": "there is", 
            "here's": "here is", 
            "they'd": "they would", 
            "they'd've": "they would have", 
            "they'll": "they will", 
            "they'll've": "they will have", 
            "they're": "they are", 
            "they've": "they have", 
            "to've": "to have", 
            "wasn't": "was not", 
            "we'd": "we would", 
            "we'd've": "we would have", 
            "we'll": "we will", 
            "we'll've": "we will have", 
            "we're": "we are", 
            "we've": "we have", 
            "weren't": "were not", 
            "what'll": "what will", 
            "what'll've": "what will have", 
            "what're": "what are", 
            "what's": "what is", 
            "what've": "what have", 
            "when's": "when is", 
            "when've": "when have", 
            "where'd": "where did", 
            "where's": "where is", 
            "where've": "where have", 
            "who'll": "who will", 
            "who'll've": "who will have", 
            "who's": "who is", 
            "who've": "who have", 
            "why's": "why is", 
            "why've": "why have", 
            "will've": "will have", 
            "won't": "will not", 
            "won't've": "will not have", 
            "would've": "would have", 
            "wouldn't": "would not", 
            "wouldn't've": "would not have", 
            "y'all": "you all", 
            "y'all'd": "you all would", 
            "y'all'd've": "you all would have", 
            "y'all're": "you all are", 
            "y'all've": "you all have", 
            "you'd": "you would", 
            "you'd've": "you would have", 
            "you'll": "you will", 
            "you'll've": "you will have", 
            "you're": "you are", 
            "you've": "you have",
        }
        for i in range(len(self.text)):
            for k, v in contractions.items():
                self.text[i] = self.text[i].replace(k, v)

    def clean_swear_words(self):
        swear_words = {
            "sh*t": "shit",
            "s**t": "shit",
            "sh*tty": "shitty",
            "sh**ty": "shitty",
            "sh*tting": "shitting",
            "f*ck": "fuck",
            "fu*k": "fuck",
            "f**k": "fuck",
            "f*cked": "fucked",
            "fu*ked": "fucked",
            "f***ed": "fucked",
            "effing": "fucking",
            "f*****g": "fucking",
            "f***ing": "fucking",
            "f**king": "fucking",
            "p*ssy": "pussy",
            "p***y": "pussy",
            "pu**y": "pussy",
            "p*ss": "piss",
            "b*tch": "bitch",
            "bit*h": "bitch",
            "h*ll": "hell",
            "h**l": "hell",
            "cr*p": "crap",
            "d*mn": "damn",
            "stu*pid": "stupid",
            "st*pid": "stupid",
            "n*gger": "nigger",
            "n***ga": "nigger",
            "f*ggot": "faggot",
            "f*g": "faggot",
            "scr*w": "screw",
            "pr*ck": "prick",
            "g*d": "god",
            "s*x": "sex",
            "a*s": "ass",
            "a$$": "ass",
            "a**hole": "asshole",
            "a***ole": "asshole",
            "a-hole": "asshole",
            "a**": "ass",
        }
        for i in range(len(self.text)):
            for k, v in swear_words.items():
                self.text[i] = self.text[i].replace(k, v)

    def remove_special_chars(self, special_chars: str = None):
        if special_chars is None:
            special_chars = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
        for i in range(len(self.text)):
            for sc in special_chars:
                self.text[i] = self.text[i].replace(sc, ' ' )

    def collapse_extra_whitespace(self):
        tab_newline_chars = "\n\r\t"
        for i in range(len(self.text)):
            for sc in tab_newline_chars:
                self.text[i] = self.text[i].replace(sc, ' ' )
            self.text[i] = re.compile(r'\s+').sub(" ", self.text[i]).strip()


In [4]:
def seed_everything(seed: int):
    """Ensures experiment will run deterministically for any given seed, even with CUDA"""
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


def preprocess(text: pd.Series):
    """Cleans and processes text.  Function execution order matters."""
    text = text.tolist()
    mtc = MegaTextCleaner(text=text)
    mtc.translate_unicode_symbols(to="ascii")
    mtc.remove_emojis()
    mtc.convert_to_lowercase()
    mtc.expand_contractions()
    mtc.clean_swear_words()
    mtc.remove_special_chars()
    mtc.collapse_extra_whitespace()
    return mtc.text


def get_coefs(word, *arr):
    """
    Converts a line from the embedding file to a tuple of (word, 32-bit numpy array)

    :param word: the first element in each line is the word
    :param arr: elements 2-n are the embedding dimensions
    """
    return word, np.asarray(arr, dtype='float32')


def load_embeddings(path: str):
    """
    Utility function to load word embeddings.  Each word embedding looks like:
    word 0.3 0.4 0.5 0.6 ...
    This function converts the embeddings to a dictionary of {word: numpy array}
    """
    with open(path, 'r', encoding='UTF-8') as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in tqdm(f))


def get_word_embeddings(word_index: dict, path: str):
    """
    Maps words fround in the text (word_index) to their corresponding word embeddings from the 
    pre-trained model loaded from (path).  If any words cannot be found in the pre-trained model, 
    they are tracked in unknown_words.
    """
    embedding_index = load_embeddings(path)
    # create an empty matrix of shape (nbr_words, embed_dim)
    embedding_matrix = np.zeros((len(word_index) + 1, EMBED_DIM))
    unknown_words = []
    
    # map all words from the text to their embeddings, if they exist in the embedding index
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = embedding_index[word]
        except KeyError:
            unknown_words.append(word)
    return embedding_matrix, unknown_words


def sigmoid(x: np.ndarray):
    """Sigmoid activation function"""
    return 1 / (1 + np.exp(-x))


def custom_loss(data, targets):
    bce_loss_1 = torch.nn.BCEWithLogitsLoss(weight=targets[:,1:2])(data[:,:1], targets[:,:1])
    bce_loss_2 = torch.nn.BCEWithLogitsLoss()(data[:,1:], targets[:,2:])
    return (bce_loss_1 * loss_weight) + bce_loss_2


class SpatialDropout(torch.nn.Dropout2d):
    """
    Implements the functionality of Keras' SpatialDropout1D.
    Randomly drop features, i.e. [[1, 1, 1], [2, 1, 2]] -> [[1, 0, 1], [2, 0, 2]]
    Compare this with ordinary dropout that drops by sample, i.e. [[1, 1, 1], [2, 1, 2]] -> [[1, 0, 1], [0, 1, 2]]
    """
    def forward(self, x):
        x = x.unsqueeze(2)    # add a dimension of size 1 at position 2, producing (N, T, 1, K)
        x = x.permute(0, 3, 2, 1)  # re-order dimensions to (N, K, 1, T)
        x = super(SpatialDropout, self).forward(x)  # (N, K, 1, T), some features are masked
        x = x.permute(0, 3, 2, 1)  # re-order dimensions to (N, T, 1, K)
        x = x.squeeze(2)  # remove dimension of size 1 at position 2, producing (N, T, K)
        return x


class NeuralNet(torch.nn.Module):
    def __init__(self, embedding_matrix: np.ndarray, num_aux_targets: int):
        """Sets up neural network architecture"""
        super(NeuralNet, self).__init__()
        
        # set up a non-trainable, pre-trained embedding layer from the provided embedding_matrix
        self.embedding = torch.nn.Embedding(VOCAB_SIZE, EMBED_DIM)
        self.embedding.weight = torch.nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.embedding_dropout = SpatialDropout(0.3)  # randomly drop this percent of features
        
        # each bidirectional layer outputs 2 sequences: 1 forward, 1 backward, and concatenates them
        # so stacking 2 enriches the sequence features
        self.lstm1 = torch.nn.LSTM(
            input_size=EMBED_DIM, 
            hidden_size=LSTM_UNITS, 
            bidirectional=True, 
            batch_first=True
        )
        self.lstm2 = torch.nn.LSTM(
            input_size=LSTM_UNITS * 2, 
            hidden_size=LSTM_UNITS, 
            bidirectional=True, 
            batch_first=True
        )
    
        # skip connections...
        # add a product of a dense layer with the hidden layer to the output of the the hidden layer
        self.linear1 = torch.nn.Linear(in_features=DENSE_HIDDEN_UNITS, out_features=DENSE_HIDDEN_UNITS, bias=True)
        self.linear2 = torch.nn.Linear(in_features=DENSE_HIDDEN_UNITS, out_features=DENSE_HIDDEN_UNITS, bias=True)
        
        self.linear_out = torch.nn.Linear(DENSE_HIDDEN_UNITS, 1)
        
        # auxiliary outputs to be predicted as an alternative to the main output
        self.linear_aux_out = torch.nn.Linear(DENSE_HIDDEN_UNITS, num_aux_targets)
        
    def forward(self, x):
        """Implements forward pass"""
        h_embedding = self.embedding(x)
        h_embedding = self.embedding_dropout(h_embedding)
        
        h_lstm1, _ = self.lstm1(h_embedding)
        h_lstm2, _ = self.lstm2(h_lstm1)

        avg_pool = torch.mean(h_lstm2, 1)  # global mean pooling
        max_pool, _ = torch.max(h_lstm2, 1)  # global max pooling

        # concatenate to reshape from (batch_size, MAX_LEN, LSTM_UNITS * 2) to h_conc (BATCH_SIZE, LSTM_UNITS * 4)
        h_conc = torch.cat((max_pool, avg_pool), 1)
        h_conc_linear1  = F.relu(self.linear1(h_conc))
        h_conc_linear2  = F.relu(self.linear2(h_conc))
        
        hidden = h_conc + h_conc_linear1 + h_conc_linear2
        
        result = self.linear_out(hidden)
        aux_result = self.linear_aux_out(hidden)
        out = torch.cat([result, aux_result], 1)
        
        return out


def train_model(
    model, 
    train, 
    test, 
    loss_fn, 
    output_dim: int, 
    lr: float = 0.001,
    batch_size: int = 512, 
    n_epochs: int = 4, 
    enable_checkpoint_ensemble: bool = True
):
    """
    Trains a model on the training set.  
    """
    param_lrs = [{'params': param, 'lr': lr} for param in model.parameters()]
    optimizer = torch.optim.Adam(param_lrs, lr=lr)

    # decay the learning rate using a schedule of 0.6^epoch
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch)

    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)
    all_test_preds = []
    checkpoint_weights = [2 ** epoch for epoch in range(n_epochs)]
    
    for epoch in range(n_epochs):
        start_time = time.time()
        
        # increment learning rate schedule
        scheduler.step()
        
        model.train()
        avg_loss = 0.
        
        for data in tqdm(train_loader, disable=False):
            # the target is the final column in data
            x_batch = data[:-1]
            y_batch = data[-1]

            # forward pass and calculate loss
            y_pred = model(*x_batch)            
            loss = loss_fn(y_pred, y_batch)

            # zero out the gradients for the optimizer, now that the loss has been calculated
            optimizer.zero_grad()
            
            # backpropagate the loss
            loss.backward()

            # update the weights using the optimizer
            optimizer.step()
            
            # track the mean loss over all batches in this epoch
            avg_loss += loss.item() / len(train_loader)
            
        model.eval()
        test_preds = np.zeros((len(test), output_dim))

        # run each batch of the test data through the model
        for i, x_batch in enumerate(test_loader):
            y_pred = sigmoid(model(*x_batch).detach().cpu().numpy())
            test_preds[i * batch_size:(i+1) * batch_size, :] = y_pred

        # append the batch test_preds to the epoch's all_test_preds
        all_test_preds.append(test_preds)
        elapsed_time = time.time() - start_time
        print(f'Epoch {epoch + 1}/{n_epochs} \t loss={avg_loss:.4f} \t time={elapsed_time:.2f}s')

    if enable_checkpoint_ensemble:
        test_preds = np.average(all_test_preds, weights=checkpoint_weights, axis=0)    
    else:
        test_preds = all_test_preds[-1]
        
    return test_preds


def ensemble_predictions(predictions, weights, type_="linear"):
    """
    Combines predictions from several models.
    
    :param weights: sample weights
    """
    assert np.isclose(np.sum(weights), 1.0)  # weights should be close to 1.0
    
    # linear blend by taking the average between the models, weighted by sample
    if type_ == "linear":
        res = np.average(predictions, weights=weights, axis=0)

    # harmonic blend by taking the average of the reciprocals (handles outliers better than straight avg)
    elif type_ == "harmonic":
        res = np.average([1 / p for p in predictions], weights=weights, axis=0)
        return 1 / res

    # geometric blend handles discrepancies between the scales of the preds between the models
    elif type_ == "geometric":
        numerator = np.average(
            [np.log(p) for p in predictions], weights=weights, axis=0
        )
        res = np.exp(numerator / sum(weights))
        return res

    # ranks the predictions from the models and takes the mean rank, weighted by sample
    elif type_ == "rank":
        res = np.average([rankdata(p) for p in predictions], weights=weights, axis=0)
        return res / (len(res) + 1)

    return res


In [5]:
# expect ~2 Gb RAM for the data
train_df = pd.read_csv(DATA_PATH + 'train.csv')
test_df = pd.read_csv(DATA_PATH + 'test.csv')

x_train = preprocess(train_df['comment_text'])
# y_train to be defined after weighting the samples
y_aux_train = train_df[['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']]
x_test = preprocess(test_df['comment_text'])

In [6]:
# weight the training samples
weights = np.ones((len(x_train),)) / 4
print(weights.mean(), weights.shape)

# sum the binary identity columns to give rows with identity values more weight (they contain more information)
weights += (train_df[IDENTITY_COLUMNS].fillna(0).values >= 0.5).sum(axis=1).astype(bool).astype(int) / 4

# Background Positive, Subgroup Negative
# when the target is 1, increase the weights further by counting the identity columns with 0 values
# this increases the weights for the positive class (toxic comment)
weights += (
    (
        (train_df['target'].values >= 0.5).astype(bool).astype(int) 
        + (train_df[IDENTITY_COLUMNS].fillna(0).values < 0.5).sum(axis=1).astype(bool).astype(int) 
    ) > 1
).astype(bool).astype(int) / 4

# Background Negative, Subgroup Positive
# when the target is 0, increase the weights by counting the identity columns with value of 1
# multiply by some constant (5) to give this weighting more impact
# this increases the weights for the negative class (not a toxic comment)
weights += (
    (
        (train_df['target'].values < 0.5).astype(bool).astype(int) 
        + (train_df[IDENTITY_COLUMNS].fillna(0).values >= 0.5).sum(axis=1).astype(bool).astype(int) 
    ) > 1 
).astype(bool).astype(int) / 4

# normalize the weights
loss_weight = 1.0 / weights.mean()

print("\n", loss_weight.shape, loss_weight.min(), loss_weight.max())

0.25 (1804874,)

 () 3.209226860170181 3.209226860170181


In [7]:
y_train = np.vstack([(train_df[TARGET_COLUMN].values >= 0.5).astype(int), weights]).T
y_train = torch.tensor(np.hstack([y_train, y_aux_train]), dtype=torch.float32)
print(y_train.shape)

torch.Size([1804874, 8])


In [8]:
tokenizer = prep_text.Tokenizer(num_words=VOCAB_SIZE, filters='', lower=True)
tokenizer.fit_on_texts(list(x_train) + list(x_test))

x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)
x_train = pad_sequences(x_train, maxlen=MAX_LEN)
x_test = pad_sequences(x_test, maxlen=MAX_LEN)

max_features = min(VOCAB_SIZE, len(tokenizer.word_index) + 1)
print(f"Max Features (Vocab Size) {max_features}")

Max Features (Vocab Size) 100000


In [9]:
# create word embeddings

fasttext_embeddings, fasttext_unknown_words = get_word_embeddings(tokenizer.word_index, WORD_EMBEDDINGS['fasttext'])
print('Unknown words (fast text): ', len(fasttext_unknown_words))

glove_embeddings, glove_unknown_words = get_word_embeddings(tokenizer.word_index, WORD_EMBEDDINGS['glove'])
print('Unknown words (glove): ', len(glove_unknown_words))

1999996it [01:01, 32658.55it/s]


Unknown words (fast text):  165824


2196017it [01:07, 32570.67it/s]


Unknown words (glove):  162077


In [10]:
embedding_matrix = np.concatenate([fasttext_embeddings, glove_embeddings], axis=-1)
print("Embedding matrix shape: ", embedding_matrix.shape)

del fasttext_embeddings
del glove_embeddings
gc.collect()

Embedding matrix shape:  (318636, 600)


0

In [None]:
# move data to CUDA
x_train_torch = torch.tensor(x_train, dtype=torch.long)#.cuda()
x_test_torch = torch.tensor(x_test, dtype=torch.long)#.cuda()
y_train_torch = torch.tensor(np.hstack([y_train[:, np.newaxis], y_aux_train]), dtype=torch.float32)#.cuda()

# convert to tensor datasets
train_dataset = data.TensorDataset(x_train_torch, y_train_torch)
test_dataset = data.TensorDataset(x_test_torch)

In [None]:
all_test_preds = []

# train NUM_MODELS models and average their output for the final predictions
for model_idx in range(NUM_MODELS):
    print('\nModel ', model_idx)

    # fit each model with a different seed, otherwise they will be identical
    seed_everything(SEED + model_idx)
    
    model = NeuralNet(embedding_matrix, y_aux_train.shape[-1])
    #model.cuda()
    
    test_preds = train_model(
        model, 
        train_dataset, 
        test_dataset, 
        output_dim=y_train_torch.shape[-1],
        loss_fn=nn.BCEWithLogitsLoss(reduction='mean')
    )
    all_test_preds.append(test_preds)

In [None]:
submission_lstm = pd.DataFrame.from_dict({
    'id': test_df['id'],
    'prediction': np.mean(all_test_preds, axis=0)[:, 0]
})

In [11]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

### Dataset vs DataLoader vs DataCollator

#### Dataset
A tensor dataset is similar to a Pandas dataframe.  It just stores the data.

#### DataLoader
A Pytorch iterator over a tensor dataset.  This is used to form batches.  It has an argument, collate_fn, that can be used for on-the-fly transformations of the batches.

#### DataCollator
A HuggingFace class for forming batches and performing on-the-fly padding/truncation of the batches (or other transformations).  Under the hood, HuggingFace passes the DataCollator to Pytorch's DataLoader as the collate_fn argument. If you perform padding/truncation on the data before creating a tensor dataset, then you do not need a DataCollator, although a DataCollator can be more efficient.

In [13]:
def convert_lines(example, max_seq_length, tokenizer):
    """
    Tokenizes text to uniform shape using padding and truncation.  
    This function (and the DataLoader) could be replaced by a DataCollator object that would 
    do everything on the fly, rather than ahead of time.
    """
    max_seq_length -=2
    all_tokens = []
    nbr_texts_longer_than_max_seq_length = 0
    for text in tqdm(example):
        tokens_a = tokenizer.tokenize(text)
        # restrict tokenized text to max_seq_length
        if len(tokens_a) > max_seq_length:
            tokens_a = tokens_a[:max_seq_length]
            nbr_texts_longer_than_max_seq_length += 1
        # insert CLS and SEP tokens around the tokenized text, and zero pad if the length < max_seq_length
        one_token = tokenizer.convert_tokens_to_ids(
            ["[CLS]"] + tokens_a + ["[SEP]"]
        ) + [0] * (max_seq_length - len(tokens_a))
        all_tokens.append(one_token)
    return np.array(all_tokens)

In [14]:
# BERT is pre-trained so it is ready to make predictions for the test set
# the test set must be tokenized first...
X_test = convert_lines(test_df["comment_text"].fillna("DUMMY_VALUE"), MAX_LEN, tokenizer)

test_preds = np.zeros((len(X_test)))

# convert the tokenized test set to a TensorDataset (the kind of dataset HuggingFace likes)
X_test_dataset = torch.utils.data.TensorDataset(torch.tensor(X_test))

# now pass the dataset to a DataLoader (an iterator wrapper for a TensorDataset), which will load it by batch
X_test_dataloader = torch.utils.data.DataLoader(X_test_dataset, batch_size=512, shuffle=False)

100%|██████████| 97320/97320 [00:18<00:00, 5399.16it/s]


In [42]:
device = torch.device('cpu')
all_logits = np.empty([0, 2])
for batch in X_test_dataloader:
    
    input_ids = tuple(b.to(device) for b in batch)[0]
    attention_masks = tuple((b > 0).to(device) for b in batch)[0]  # False where zero-padded

    with torch.no_grad():        
        outputs = model(input_ids, attention_masks)

    logits = outputs[0]

    all_logits = np.vstack([all_logits, torch.softmax(logits, dim=1).detach().cpu().numpy()])
    # all_logits = torch.sigmoid(torch.tensor(all_logits)).numpy().ravel()

KeyboardInterrupt: 

In [None]:
submission_bert = pd.DataFrame.from_dict({
    'id': test_df['id'],
    'prediction': all_logits
})

In [None]:
weights = [0.333, 0.667]
submission["prediction"] = ensemble_predictions(
    [submission_bert.prediction.values, submission_lstm.prediction.values],
    weights,
    type_="rank",
)
submission.to_csv("submission.csv", index=False)