# Official example
## Neural network baseline

Neural networks require a lot more compute time. It is much faster to train them on GPU. This is how to enable GPU at Kernel: https://www.kaggle.com/dansbecker/running-kaggle-kernels-with-a-gpu

In [1]:
import sys
import os
import csv
import time
csv.field_size_limit(sys.maxsize)  # needed for torchtext

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import TensorDataset, RandomSampler, DataLoader, SequentialSampler

import torchtext
from tqdm import tqdm

import sklearn.metrics as skm

from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig
from pytorch_pretrained_bert import BertModel
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear

from apex.optimizers import FP16_Optimizer
from apex.optimizers import FusedAdam

import logging
logger = logging.getLogger(__name__)

from tqdm import tqdm_notebook as tqdm

import pandas as pd
import hashlib
import json

In [2]:
import pytorch_pretrained_bert

In [3]:
pytorch_pretrained_bert.__version__

'0.6.1'

In [2]:
print(hashlib.algorithms_available)
print(hashlib.algorithms_guaranteed)

{'sha512-256', 'md4', 'sha1', 'shake_128', 'sha3_512', 'md5-sha1', 'sha512-224', 'shake256', 'sm3', 'mdc2', 'sha3-384', 'sha384', 'sha3-512', 'blake2b512', 'md5', 'sha512', 'sha3_224', 'shake128', 'shake_256', 'blake2s256', 'sha3-256', 'ripemd160', 'blake2b', 'sha224', 'sha256', 'blake2s', 'sha3-224', 'whirlpool', 'sha3_256', 'sha3_384'}
{'md5', 'sha512', 'sha1', 'sha256', 'sha3_224', 'shake_128', 'blake2s', 'shake_256', 'sha384', 'sha3_512', 'sha3_384', 'sha3_256', 'blake2b', 'sha224'}


In [3]:
assert torch.cuda.is_available(), 'We strongly reccomend using GPU for this kernel'

In [4]:
train = pd.read_csv("./data/train.csv")
valid = pd.read_csv("./data/valid.csv")
test  = pd.read_csv("./data/test.csv")

In [5]:
train = train.dropna()
valid = valid.dropna()

In [6]:
class InputExample(object):
    """A single training/test example for sequence classification."""

    def __init__(self, guid, text_a, text_b=None, labels=None):
        """Constructs a InputExample.
        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            labels: (Optional) [string]. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = labels

In [7]:
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id

In [8]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [9]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

In [10]:
def convert_examples_to_features(examples, label_list, max_seq_length,
                                 tokenizer, output_mode):
    """Loads a data file into a list of `InputBatch`s."""

    label_map = {label : i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in tqdm(enumerate(examples)):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))

        #print(example.text_a)
        try:
            tokens_a = tokenizer.tokenize(example.text_a)
        except Exception as e:
            print(example.text_a)
            print(e)
            continuenue

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0   0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        if tokens_b:
            tokens += tokens_b + ["[SEP]"]
            segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        if example.label:
            if output_mode == "classification":
                label_id = label_map[example.label]
            elif output_mode == "regression":
                label_id = float(example.label)
            else:
                raise KeyError(output_mode)
        else:
            label_id = None

        features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_id=label_id))
    return features

In [11]:
def to_dataset(texts_a, texts_b=[], labels=[], tokenizer=tokenizer, seq_len=20):
    examples = []
    if len(labels) == 0:
        if len(texts_b) == 0:
            for text in texts_a:
                examples.append(
                    InputExample(
                        guid=hashlib.sha1(text.encode('utf-8')),
                        text_a=text, text_b=None,
                        labels=None)
                )
        else:
            for text_a, text_b in zip(texts_a, texts_b):
                examples.append(
                    InputExample(
                        guid=hashlib.sha1(text_a.encode('utf-8')),
                        text_a=text_a, text_b=text_b,
                        labels=None
                    )
                )
    else:
        if len(texts_b) == 0:
            for text, label in zip(texts_a, labels):
                examples.append(
                    InputExample(
                        guid=hashlib.sha1(text.encode('utf-8')),
                        text_a=text, text_b=None,
                        labels=label
                    )
                )
        else:
            for text_a, text_b, label in zip(texts_a, texts_b, labels):
                examples.append(
                    InputExample(
                        guid=hashlib.sha1(text_a.encode('utf-8')),
                        text_a=text_a, text_b=text_b,
                        labels=label
                    )
                )

    features = convert_examples_to_features(
        examples,
        label_list=['news', 'clickbait', 'other'],
        tokenizer=tokenizer,
        max_seq_length=seq_len,
        output_mode='classification'
    )
    
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if len(labels) == 0:
        return TensorDataset(all_input_ids, all_input_mask, all_segment_ids)
    else:
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
        return TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

In [12]:
def save_bert(model, model_dir='./models/bert-tmp'):
    WEIGHTS_NAME = "pytorch_model.bin"
    CONFIG_NAME = "bert_config.json"
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

     # If we save using the predefined names, we can load using `from_pretrained`
    output_model_file = os.path.join(model_dir, WEIGHTS_NAME)
    output_config_file = os.path.join(model_dir, CONFIG_NAME)

    torch.save(model_to_save.state_dict(), output_model_file)
    with open(output_config_file, 'w') as file:
        json.dump(vars(model_to_save.config), file)

In [13]:
device = torch.device('cuda')

In [14]:
#train_dataset = to_dataset(train['title'].astype(str), train['text'].astype(str), train['label'], seq_len=128)
#valid_dataset = to_dataset(valid['title'].astype(str), valid['text'].astype(str), valid['label'], seq_len=128)
train_dataset = torch.load("./train_dataset_128_plus_title.pth")
valid_dataset = torch.load("./valid_dataset_128_plus_title.pth")

#torch.save(train_dataset, "./train_dataset_128_plus_title.pth")
#torch.save(valid_dataset, "./valid_dataset_128_plus_title.pth")

In [15]:
n_train_samples = len(train_dataset)
print("Num of train examples: ", n_train_samples)

Num of train examples:  19877


In [54]:
params = {
    'lr': 1e-5,
    'batch_size': 128,
    'n_epochs': 60,
    'gradient_acc_steps': 2
}
num_train_optimization_steps: int = int(n_train_samples / params['batch_size'] / params['gradient_acc_steps']) * params['n_epochs']

In [55]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

model.half()
model = model.to(device)

model = model.train()

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = FusedAdam(optimizer_grouped_parameters, lr=params['lr'], bias_correction=False, max_grad_norm=1.0)
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)

In [56]:
torch.cuda.manual_seed_all(42)
torch.manual_seed(42)
np.random.seed(42)

In [57]:
train_sampler = RandomSampler(train_dataset)
valid_sampler = SequentialSampler(valid_dataset)

train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=params['batch_size'])
valid_dataloader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=params['batch_size'])

In [58]:
global_step = 0
nb_tr_steps = 0
tr_loss = 0

In [61]:
print(fscore)

[-1, 0.7050575828880619, 0.7288942833750354, 0.7455899921442185, 0.7639141211423398, 0.7563283836293252, 0.10684904351273665, 0.10684904351273665, 0.746050024811984, 0.6714329586038107, 0.7684720301922203, 0.7600649691462161, 0.7652313903893949, 0.7183338308270851, 0.7931381813930866, 0.7671942446043166, 0.7718973100202647, 0.7840931471736914, 0.7727988471989565, 0.7709787924484569, 0.7924420559886504, 0.7580348975647717, 0.7870048360273341, 0.795169479786562, 0.7937993927423835, 0.7888781889188001, 0.7898780878717666, 0.7654579737281438, 0.7972890592036626, 0.7837152248089295, 0.7874673517498657, 0.7829934278061739, 0.7885180062084767, 0.7836741901351701, 0.7775151992508751, 0.7856038269269051, 0.7741642509814469, 0.7858336192322547, 0.7874412769408526, 0.7898636579597996, 0.7821672480914458, 0.7906254487620631, 0.7897861142348958, 0.7893968347479698, 0.7893950342757776, 0.7888775588029681, 0.7888129465202233, 0.7888129465202233, 0.7885544069815088, 0.788353065683301, 0.78809363082851

In [62]:
max(fscore)

0.7972890592036626

In [38]:
fscore = [-1]

In [None]:
for ep_num in tqdm(range(params['n_epochs']), desc="Epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(tqdm(train_dataloader, desc="Training")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        
        logits = model(input_ids, segment_ids, input_mask, labels=None)
        #print(logits.shape)
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, 3), label_ids.view(-1))

        optimizer.backward(loss)

        tr_loss += loss.item()
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        
        if (step + 1) % params['gradient_acc_steps'] == 0:
            lr_this_step = 1e-4 * warmup_linear(global_step/num_train_optimization_steps, 0.1)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1
        
    val_loss = 0
    val_steps = 0
    val_examples = 0
    model.eval()
    labels = []
    predictions = []
    with torch.no_grad():
        for step, batch in enumerate(tqdm(valid_dataloader, desc='Validating')):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch

            logits = model(input_ids, segment_ids, input_mask, labels=None)
            #print(logits.shape)
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, 3), label_ids.view(-1))
            y_pred = torch.argmax(logits, dim=-1)
            # move from GPU to CPU and convert to numpy array
            y_pred_numpy = y_pred.cpu().numpy()

            predictions = np.concatenate([predictions, y_pred_numpy])
            labels = np.concatenate([labels, label_ids.cpu().numpy()])
            
            val_loss += float(loss)
            val_steps += 1
            val_examples += input_ids.size(0)
            
    fsc = skm.f1_score(labels, predictions, average='macro')
    if fsc > max(fscore):
        print(f'f-score improved, saving model')
        save_bert(model)
        fscore.append(fsc)
    else:
        fscore.append(fsc)

    print(f"Mean Train Loss: {tr_loss/nb_tr_steps}")
    print(f"Mean Valid Loss: {val_loss/val_steps}")
    print(f"Valid F1:        {skm.f1_score(labels, predictions, average='macro')}")
    print("="*40)

HBox(children=(IntProgress(value=0, description='Epoch', max=60, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='Training', max=156, style=ProgressStyle(description_width='in…


Grad overflow on iteration 0
Using dynamic loss scale of 65536


HBox(children=(IntProgress(value=0, description='Validating', max=23, style=ProgressStyle(description_width='i…

Mean Train Loss: 0.7269506209935898
Mean Valid Loss: 0.40956978175951086
Valid F1:        0.7183338308270851


HBox(children=(IntProgress(value=0, description='Training', max=156, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='Validating', max=23, style=ProgressStyle(description_width='i…

f-score improved, saving model
Mean Train Loss: 0.3642632900140224
Mean Valid Loss: 0.36037809952445654
Valid F1:        0.7931381813930866


HBox(children=(IntProgress(value=0, description='Training', max=156, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='Validating', max=23, style=ProgressStyle(description_width='i…

Mean Train Loss: 0.2173794477413862
Mean Valid Loss: 0.4279380466627038
Valid F1:        0.7671942446043166


HBox(children=(IntProgress(value=0, description='Training', max=156, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='Validating', max=23, style=ProgressStyle(description_width='i…

Mean Train Loss: 0.0903628911727514
Mean Valid Loss: 0.6319354513417119
Valid F1:        0.7718973100202647


HBox(children=(IntProgress(value=0, description='Training', max=156, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='Validating', max=23, style=ProgressStyle(description_width='i…

Mean Train Loss: 0.059649002857697316
Mean Valid Loss: 0.7130889892578125
Valid F1:        0.7840931471736914


HBox(children=(IntProgress(value=0, description='Training', max=156, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='Validating', max=23, style=ProgressStyle(description_width='i…

Mean Train Loss: 0.05261426094250801
Mean Valid Loss: 0.8022059565005095
Valid F1:        0.7727988471989565


HBox(children=(IntProgress(value=0, description='Training', max=156, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='Validating', max=23, style=ProgressStyle(description_width='i…

Mean Train Loss: 0.04683087422297551
Mean Valid Loss: 0.6753173496412195
Valid F1:        0.7709787924484569


HBox(children=(IntProgress(value=0, description='Training', max=156, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='Validating', max=23, style=ProgressStyle(description_width='i…

Mean Train Loss: 0.029655927266830053
Mean Valid Loss: 0.6775183884993844
Valid F1:        0.7924420559886504


HBox(children=(IntProgress(value=0, description='Training', max=156, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='Validating', max=23, style=ProgressStyle(description_width='i…

Mean Train Loss: 0.02651399526840601
Mean Valid Loss: 0.947800346042799
Valid F1:        0.7580348975647717


HBox(children=(IntProgress(value=0, description='Training', max=156, style=ProgressStyle(description_width='in…

In [63]:
# Load a trained model and vocabulary that you have fine-tuned
model = BertForSequenceClassification.from_pretrained("./models/bert-tmp/", num_labels=3)

In [64]:
skm.f1_score(labels, predictions, average='macro')

0.7886128171902739

It can be really hard to beat SVM at text classification, but it is almost always possible with neural network.
This neural network may have slightly worse metrics than SVM. However, you can tweak hyperparameters and number of epochs or to change network architecture to get better results.

In [None]:
test_dataset = to_dataset(test['text'].astype(str), test['title'].astype(str), seq_len=128)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [None]:
test_sampler = SequentialSampler(test_dataset)
test_loader = DataLoader(test_dataset, sampler=test_sampler, batch_size=128)

In [None]:
index2label = ['news', 'clickbait', 'other']

## Make predictions on test set

In [None]:
predictions = []

model.eval()
model.to(device)
with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids, input_mask, segment_ids = batch
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        logits = model(input_ids, segment_ids, input_mask, labels=None)

        y_pred = torch.argmax(logits, dim=1)
        # move from GPU to CPU and convert to numpy array
        y_pred_numpy = y_pred.cpu().numpy()

        predictions = np.concatenate([predictions, y_pred_numpy])


In [None]:
predictions_str = [index2label[int(p)] for p in predictions]

# test.csv index in a contiguous integers from 0 to len(test_set)
# to this should work fine
submission = pd.DataFrame({'id': list(range(len(predictions_str))), 'label': predictions_str})
submission.to_csv('submission_bert_uncased_plus_title-0.csv', index=False)
submission.head()

In [31]:
test.head()

Unnamed: 0.1,Unnamed: 0,title,text
0,0,Amazon CEO Jeff Bezos is now the second riches...,More Try Yahoo Finance on Firefox » Amazon CEO...
1,1,Does Laura Dern Handle a Lightsaber in the New...,More Laura Dern seems to be everywhere these d...
2,2,"In this photographer’s home town, stepping out...",Kirkuk is a city of Northern Iraq in the Kurdi...
3,3,"8 Ways To Get Your Spouse To Open Up More, Acc...",Experts say that communication is the cornerst...
4,4,US says claim it supported IS in Syria is 'lud...,Share this with Email Facebook Messenger Messe...


In [32]:
submission.label.value_counts()

news         5180
other         384
clickbait      83
Name: label, dtype: int64