# Spacy PyTorch Transformers Demo

In [1]:
# Show the GPU attached to this runtime (model, driver/CUDA versions, memory in use).
!nvidia-smi

Tue May  5 21:01:10 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    25W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

# Set-Up

Set up the Colab environment used for the experiments below. Note that the CUDA build of spacy-pytorch-transformers is installed.

In [0]:
# Install GPUtil, PyTorch and the CUDA build of spacy-pytorch-transformers.
# NOTE(review): the trailing "#==<version>" fragments are shell comments, so no
# version is actually pinned here; pip resolves whatever is current.
!pip install gputil
!pip install torch #==1.1.0
!pip install spacy-pytorch-transformers[cuda100] #==0.2.0
# Upgrade spaCy and the transformers extension after the initial install.
!pip install --upgrade spacy
!pip install --upgrade spacy-pytorch-transformers

# Alternative BERT model package (disabled; the XLNet package is downloaded in the next cell).
# !python -m spacy download en_pytt_bertbaseuncased_lg

Collecting spacy
[?25l  Downloading https://files.pythonhosted.org/packages/55/2e/ac00f5c9d01e66cc6ab75eb2a460c9b0dc21ad99a12f810c86a58309e63c/spacy-2.2.4-cp36-cp36m-manylinux1_x86_64.whl (10.6MB)
[K     |████████████████████████████████| 10.6MB 2.6MB/s 
Collecting preshed<3.1.0,>=3.0.2
[?25l  Downloading https://files.pythonhosted.org/packages/db/6b/e07fad36913879757c90ba03d6fb7f406f7279e11dcefc105ee562de63ea/preshed-3.0.2-cp36-cp36m-manylinux1_x86_64.whl (119kB)
[K     |████████████████████████████████| 122kB 53.3MB/s 
Collecting thinc==7.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/73/ed/8e4559f1090fb05c0fa982a8a2caaa315967e7b460652be479d13fd1c813/thinc-7.4.0-cp36-cp36m-manylinux1_x86_64.whl (2.2MB)
[K     |████████████████████████████████| 2.2MB 55.4MB/s 
Collecting blis<0.5.0,>=0.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/41/19/f95c75562d18eb27219df3a3590b911e78d131b68466ad79fdf5847eaac4/blis-0.4.1-cp36-cp36m-manylinux1_x86_64.whl (3.7M

In [0]:
# Download the XLNet model package; its name matches the value assigned to model_choice later on.
!python -m spacy download en_pytt_xlnetbasecased_lg

You will need to **restart the runtime after these installs** to reinitialise the environment/directory so that the newly installed packages are picked up.

In [0]:
import spacy
import GPUtil
import torch
import numpy
from numpy.testing import assert_almost_equal
from scipy.spatial import distance
import cupy
import numpy as np

Check whether a GPU is available and, if it is, switch PyTorch's default tensor type to CUDA.

In [4]:
# Ask spaCy to prefer the GPU; the returned flag tells us whether one is in use.
is_using_gpu = spacy.prefer_gpu()
if is_using_gpu:
    print("Using GPU!")
    # Make newly created torch tensors default to CUDA float tensors.
    torch.set_default_tensor_type("torch.cuda.FloatTensor")
    print("GPU Usage")
    # Print GPUtil's utilisation table (ID / GPU % / MEM %).
    GPUtil.showUtilization()

Using GPU!
GPU Usage
| ID | GPU | MEM |
------------------
|  0 |  0% |  0% |


**Restart the kernel before running this section: GPU memory still allocated by the previous sections will otherwise cause the code to fail.**

Loading in additional libraries for this example

In [0]:
import thinc
import random
import spacy
import GPUtil
import torch
from spacy.util import minibatch
from tqdm.auto import tqdm
import unicodedata
import wasabi
import numpy
from collections import Counter

Ensuring GPU is in use: 
To run this example, ensure GPU MEM ~ 1% at start

In [6]:
# Fix the random seed (0) through spaCy's helper before any training work.
spacy.util.fix_random_seed(0)
# Ask spaCy to prefer the GPU; the returned flag tells us whether one is in use.
is_using_gpu = spacy.prefer_gpu()
if is_using_gpu:
    # Make newly created torch tensors default to CUDA float tensors.
    torch.set_default_tensor_type("torch.cuda.FloatTensor")
    print("GPU Usage")
    # Print GPUtil's utilisation table; MEM should be close to 1% before training starts.
    GPUtil.showUtilization()

GPU Usage
| ID | GPU | MEM |
------------------
|  0 |  1% |  2% |


In [0]:
def _prepare_partition(text_label_tuples, *, preprocess=False):
    texts, labels = zip(*text_label_tuples)
    cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
    return texts, cats

def load_data(*, limit=0, dev_size=2000):
    """Load the IMDB training data and carve off a held-out partition.

    Args:
        limit: value forwarded to ``thinc.extra.datasets.imdb``; when it is
            non-zero it is first increased by ``dev_size`` so the held-out
            examples do not eat into the requested number of training examples.
        dev_size: number of shuffled examples reserved for evaluation; must be
            non-zero and smaller than the number of loaded examples.

    Returns:
        ``((train_texts, train_cats), (dev_texts, dev_cats))`` as produced by
        ``_prepare_partition``.
    """
    fetch_limit = limit + dev_size if limit != 0 else limit
    assert dev_size != 0
    examples, _ = thinc.extra.datasets.imdb(limit=fetch_limit)
    assert len(examples) > dev_size
    # Shuffle in place, then take the first dev_size examples as the held-out set.
    random.shuffle(examples)
    held_out, remaining = examples[:dev_size], examples[dev_size:]
    train_partition = _prepare_partition(remaining, preprocess=False)
    dev_partition = _prepare_partition(held_out, preprocess=False)
    return train_partition, dev_partition

We can call the above functions to generate our training and evaluation (held-out) data.

In [0]:
# Load IMDB with the default arguments (limit=0, dev_size=2000) and unpack the
# training and held-out evaluation partitions.
(train_texts, train_cats), (eval_texts, eval_cats) = load_data()

Next, we'll select the PyTT model that we want to load into spaCy.

In [0]:
# Name of the installed model package to load; the trailing #@param annotation
# lists the two options handled by the loading cell below.
model_choice = "en_pytt_xlnetbasecased_lg" #@param ["en_pytt_bertbaseuncased_lg", "en_pytt_xlnetbasecased_lg"]

In [10]:
# Validate the selection up front. Previously an unsupported choice only printed
# a hint and left `textcat` undefined, so the failure surfaced later as a
# NameError in a different cell.
supported_models = ("en_pytt_bertbaseuncased_lg", "en_pytt_xlnetbasecased_lg")
if model_choice not in supported_models:
    raise ValueError(
        f"Choose a supported PyTT model: got {model_choice!r}, "
        f"expected one of {supported_models}"
    )

# Load the pretrained transformer pipeline and show its components.
nlp = spacy.load(model_choice)
print(nlp.pipe_names)
print(f"Loaded model '{model_choice}'")

# Both supported models use the same text-classifier architecture, so a single
# create_pipe call replaces the two identical branches.
textcat = nlp.create_pipe(
    "pytt_textcat", config={"architecture": "softmax_class_vector"}
)

['sentencizer', 'pytt_wordpiecer', 'pytt_tok2vec']
Loaded model 'en_pytt_xlnetbasecased_lg'


In [11]:
 # Register both sentiment labels on the text classifier component.
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

1

In [12]:
# Show the labels registered on the classifier.
print("Labels:", textcat.labels)
# Append the classifier as the last pipeline component.
# NOTE(review): re-running this cell against the same `nlp` object presumably
# fails because the component is already in the pipeline - confirm.
nlp.add_pipe(textcat, last=True)
# Report the size of the training and evaluation partitions.
print(f"Using {len(train_texts)} training docs, {len(eval_texts)} evaluation")

Labels: ('POSITIVE', 'NEGATIVE')
Using 23000 training docs, 2000 evaluation


In [0]:
# Pair every training text with its annotation dict in the {"cats": ...} form
# that is later passed to nlp.update.
train_data = [
    (text, {"cats": cats}) for text, cats in zip(train_texts, train_cats)
]

In [0]:
# Training hyperparameters.
n_iter=4  # NOTE(review): not referenced by the visible training loop, which stops on `patience` instead
n_texts=1000  # NOTE(review): not referenced in the visible cells; load_data() is called without a limit
batch_size=8  # examples per minibatch (also read by evaluate() for nlp.pipe); lower it to relieve GPU memory pressure
learn_rate=2e-5  # base rate; the cyclic schedule runs between learn_rate / 3 and learn_rate * 3
max_wpb=1000  # NOTE(review): not referenced in the visible cells
pos_label="POSITIVE"  # label that evaluate() scores as the positive class

In [0]:
def cyclic_triangular_rate(min_lr, max_lr, period):
    """Yield an endless triangular cyclic learning-rate schedule.

    The rate climbs linearly from ``min_lr`` to ``max_lr`` over ``period``
    iterations, falls back to ``min_lr`` over the next ``period`` iterations,
    and then repeats. Counting starts at iteration 1.

    Reference:
    https://towardsdatascience.com/adaptive-and-cyclical-learning-rates-using-pytorch-2bf904d18dee
    """
    iteration = 1
    while True:
        # 1-based index of the full up/down cycle this iteration belongs to.
        cycle_index = numpy.floor(1 + iteration / (2 * period))
        # Normalised distance from the peak of the current cycle.
        distance_from_peak = numpy.abs(iteration / period - 2 * cycle_index + 1)
        fraction = max(0, 1 - distance_from_peak)
        yield min_lr + (max_lr - min_lr) * fraction
        iteration += 1

In [0]:
def evaluate(nlp, texts, cats, pos_label):
    """Compute precision, recall and F-score for ``pos_label``.

    Every text is run through ``nlp.pipe`` (batched with the notebook-level
    ``batch_size``). For each document, the predicted score of ``pos_label``
    is thresholded at 0.5 and compared with the gold value stored at the same
    index in ``cats``. A tqdm bar tracks progress in whitespace-separated words.

    Args:
        nlp: pipeline whose documents expose a ``cats`` mapping of label -> score.
        texts: sequence of raw texts to classify.
        cats: sequence of gold dicts (label -> truth value), aligned with ``texts``.
        pos_label: the label treated as the positive class.

    Returns:
        Dict with keys ``"textcat_p"``, ``"textcat_r"`` and ``"textcat_f"``.
    """
    true_pos = 0.0
    false_pos = 0.0
    false_neg = 0.0
    true_neg = 0.0  # tallied for completeness; not part of the returned metrics
    word_total = sum(len(text.split()) for text in texts)
    with tqdm(total=word_total, leave=False) as progress:
        for idx, doc in enumerate(nlp.pipe(texts, batch_size=batch_size)):
            truth = cats[idx]
            for name, prob in doc.cats.items():
                # Only the positive label is scored, and only when it has a gold value.
                if name not in truth or name != pos_label:
                    continue
                if prob >= 0.5:
                    if truth[name] >= 0.5:
                        true_pos += 1.0
                    elif truth[name] < 0.5:
                        false_pos += 1.0
                elif prob < 0.5:
                    if truth[name] < 0.5:
                        true_neg += 1
                    elif truth[name] >= 0.5:
                        false_neg += 1
            progress.update(len(doc.text.split()))
    # The small epsilon avoids division by zero when nothing was predicted/labelled positive.
    precision = true_pos / (true_pos + false_pos + 1e-8)
    recall = true_pos / (true_pos + false_neg + 1e-8)
    combined = precision + recall
    f_score = 0.0 if combined == 0 else 2 * (precision * recall) / combined
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}

In [17]:
# Initialize the TextCategorizer, and create an optimizer.
optimizer = nlp.resume_training()
optimizer.alpha = 0.001
optimizer.pytt_weight_decay = 0.005
optimizer.L2 = 0.0
learn_rates = cyclic_triangular_rate(
    learn_rate / 3, learn_rate * 3, 2 * len(train_data) // batch_size
    )
print("Training the model...")
print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))

pbar = tqdm(total=100, leave=False)
results = []
epoch = 0
step = 0
eval_every = 100
patience = 3
while True:
    # Train and evaluate
    losses = Counter()
    random.shuffle(train_data)
    batches = minibatch(train_data, size=batch_size)
    for batch in batches:
        optimizer.pytt_lr = next(learn_rates)
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, sgd=optimizer, drop=0.1, losses=losses)
        pbar.update(1)
        if step and (step % eval_every) == 0:
            pbar.close()
            with nlp.use_params(optimizer.averages):
                scores = evaluate(nlp, eval_texts, eval_cats, pos_label)
            results.append((scores["textcat_f"], step, epoch))
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(
                    losses["pytt_textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )
            pbar = tqdm(total=eval_every, leave=False)
        step += 1
    epoch += 1
    print(f"epoch {epoch}")
    # Stop if no improvement in HP.patience checkpoints
    if results:
        best_score, best_step, best_epoch = max(results)
        print(f"best score: {best_score}  best_step : {best_step}  best epoch : {best_epoch} ")
        print(f"break clause: {((step - best_step) // eval_every)}")
        if ((step - best_step) // eval_every) >= patience:
            break

    msg = wasabi.Printer()
    table_widths = [2, 4, 6]
    msg.info(f"Best scoring checkpoints")
    msg.row(["Epoch", "Step", "Score"], widths=table_widths)
    msg.row(["-" * width for width in table_widths])
    for score, step, epoch in sorted(results, reverse=True)[:10]:
        msg.row([epoch, step, "%.2f" % (score * 100)], widths=table_widths)

    # Test the trained model
    test_text = eval_texts[0]
    doc = nlp(test_text)
    print(test_text, doc.cats)

Training the model...
LOSS 	  P  	  R  	  F  


HBox(children=(IntProgress(value=0), HTML(value='')))



HBox(children=(IntProgress(value=0, max=477016), HTML(value='')))

0.614	0.825	0.965	0.889


HBox(children=(IntProgress(value=0), HTML(value='')))



HBox(children=(IntProgress(value=0, max=477016), HTML(value='')))



KeyboardInterrupt: ignored

In [0]:
# Save the pipeline to the directory "xlnet_sentiment", relative to the current working directory.
nlp.to_disk("xlnet_sentiment")

In [1]:
# Archive the saved model directory.
# NOTE(review): assumes the working directory is /content, so the relative
# to_disk path above resolves to /content/xlnet_sentiment - confirm.
!zip -r /content/xlnet_sentiment.zip /content/xlnet_sentiment

  adding: content/xlnet_sentiment/ (stored 0%)
  adding: content/xlnet_sentiment/pytt_wordpiecer/ (stored 0%)
  adding: content/xlnet_sentiment/pytt_wordpiecer/cfg (deflated 11%)
  adding: content/xlnet_sentiment/pytt_wordpiecer/model (deflated 49%)
  adding: content/xlnet_sentiment/tokenizer (deflated 79%)
  adding: content/xlnet_sentiment/meta.json (deflated 47%)
  adding: content/xlnet_sentiment/vocab/ (stored 0%)
  adding: content/xlnet_sentiment/vocab/vectors (deflated 45%)
  adding: content/xlnet_sentiment/vocab/strings.json (deflated 67%)
  adding: content/xlnet_sentiment/vocab/key2row (stored 0%)
  adding: content/xlnet_sentiment/vocab/lexemes.bin (deflated 75%)
  adding: content/xlnet_sentiment/pytt_textcat/ (stored 0%)
  adding: content/xlnet_sentiment/pytt_textcat/cfg (deflated 23%)
  adding: content/xlnet_sentiment/pytt_textcat/model (deflated 9%)
  adding: content/xlnet_sentiment/pytt_tok2vec/ (stored 0%)
  adding: content/xlnet_sentiment/pytt_tok2vec/cfg (deflated 54%)
  

In [4]:
# Mount Google Drive at /content/drive (Colab only), forcing a remount.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [0]:
# Copy the archive into Drive.
# NOTE(review): assumes the "spacy_models" folder already exists under My Drive - confirm.
!cp /content/xlnet_sentiment.zip /content/drive/My\ Drive/spacy_models/xlnet_sentiment.zip