# Text classification with spaCy

Train a convolutional neural network text classifier on the
IMDB dataset, using the TextCategorizer component. The dataset will be loaded
automatically via Thinc's built-in dataset loader. The model is added to
spacy.pipeline, and predictions are available via `doc.cats`.

Requires:
* spacy: `pip install -U spacy`
* English model 'en': `python -m spacy download en`

## Setting up Verta

In [None]:
# Make sure the Verta client package is available; it is installed only
# when the import fails.
# restart your notebook if prompted
try:
    import verta
except ModuleNotFoundError:
    # `!` is IPython/Jupyter shell-escape syntax, so this line is only valid inside a notebook cell
    !pip install verta

In [1]:
# setting up: connection settings and names used by the Verta client cell

HOST = 'demo.verta.ai'  # passed as `host` to Client
EMAIL = 'email@gmail.com'  # passed as `email` to Client; looks like a placeholder — presumably replace with your own
DEV_KEY= 'dev-key'  # passed as `dev_key` to Client; looks like a placeholder — presumably replace with your own
PROJECT_NAME = 'spaCy'  # passed to client.set_project()
EXPERIMENT_NAME = 'text-clf'  # passed to client.set_experiment()

In [None]:
from verta import Client
from verta.utils import ModelAPI

# Build the Verta client from the constants defined in the setup cell.
client = Client(host=HOST,
                email=EMAIL, 
                dev_key=DEV_KEY,
                use_git=False)

# creating a project and experiment, then an experiment run.
# `run` is the handle the later cells log hyperparameters, observations,
# the model and the summary artifact to.

proj = client.set_project(PROJECT_NAME)
expt = client.set_experiment(EXPERIMENT_NAME)
run = client.set_experiment_run()

## Imports

In [None]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import thinc.extra.datasets

import spacy
from spacy.util import minibatch, compounding

## Helper functions

In [None]:
def load_data(limit=0, split=0.8):
    """Load IMDB examples and divide them into a training and an evaluation part.

    Only the first element returned by ``thinc.extra.datasets.imdb()`` is
    used. It is shuffled in place, cut down to its last ``limit`` entries
    (``limit=0`` keeps everything, because ``seq[-0:]`` is the full sequence)
    and then divided at ``int(len(data) * split)``.

    Returns ``((train_texts, train_cats), (dev_texts, dev_cats))`` where each
    ``cats`` entry is a ``{"POSITIVE": bool, "NEGATIVE": bool}`` dict.
    """
    samples, _ = thinc.extra.datasets.imdb()
    random.shuffle(samples)
    samples = samples[-limit:]

    texts, labels = zip(*samples)

    # Turn each raw label into a pair of mutually exclusive category flags.
    cats = []
    for raw_label in labels:
        is_positive = bool(raw_label)
        cats.append({"POSITIVE": is_positive, "NEGATIVE": not is_positive})

    # Everything before the cut-off is training data, the remainder is held out.
    cutoff = int(len(samples) * split)
    train_part = (texts[:cutoff], cats[:cutoff])
    dev_part = (texts[cutoff:], cats[cutoff:])
    return train_part, dev_part

In [None]:
def evaluate(tokenizer, textcat, texts, cats):
    """Score ``textcat`` on ``texts`` and return precision, recall and F-score.

    Each text is passed through ``tokenizer`` and the results are streamed
    through ``textcat.pipe``. The i-th yielded doc is compared with
    ``cats[i]``. Labels missing from the gold dict and the "NEGATIVE" label
    are ignored, and 0.5 is the decision threshold for both the predicted
    score and the gold value.

    Returns a dict with the keys "textcat_p", "textcat_r" and "textcat_f".
    """
    true_pos = 0.0
    false_pos = 1e-8  # small non-zero start keeps the precision division defined
    false_neg = 1e-8  # small non-zero start keeps the recall division defined
    true_neg = 0.0  # counted, but not part of the returned scores

    tokenized = (tokenizer(text) for text in texts)
    for position, doc in enumerate(textcat.pipe(tokenized)):
        gold = cats[position]
        for label, score in doc.cats.items():
            if label == "NEGATIVE" or label not in gold:
                continue
            predicted_pos = score >= 0.5
            predicted_neg = score < 0.5
            gold_pos = gold[label] >= 0.5
            gold_neg = gold[label] < 0.5
            if predicted_pos and gold_pos:
                true_pos += 1.0
            elif predicted_pos and gold_neg:
                false_pos += 1.0
            elif predicted_neg and gold_neg:
                true_neg += 1
            elif predicted_neg and gold_pos:
                false_neg += 1

    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    total = precision + recall
    # With no true positives both ratios are zero; report an F-score of 0.0.
    f_score = 0.0 if total == 0 else 2 * (precision * recall) / total
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}

## Training model

In [None]:
# Training configuration; the whole dict is logged to the Verta run right after it is built.
hyperparams = {
    'model':'en',  # name passed to spacy.load()
    'n_iter':10, # epochs
    'n_texts':500, # num of training samples — NOTE(review): not read by any cell visible here; 'num_samples' is what load_data() receives
    'architecture': 'simple_cnn',  # used as the textcat "architecture" config value
    'num_samples':1000,  # passed as `limit` to load_data()
    'train_test_split':0.8,  # passed as `split` to load_data()
    'dropout':0.2  # passed as `drop` to nlp.update()
  }
run.log_hyperparameters(hyperparams)

In [None]:
# Optional output directory. No cell visible here defines `output_dir`, so
# referencing it directly raised NameError. Default it to None (no directory
# is created) while keeping any value an earlier cell may have assigned.
try:
    output_dir
except NameError:
    output_dir = None

if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()

# using the basic en model
nlp = spacy.load(hyperparams['model'])  # load en spaCy model
print("Loaded model '%s'" % hyperparams['model'])

# add the text classifier to the pipeline if it doesn't exist
if "textcat" not in nlp.pipe_names:
    textcat = nlp.create_pipe(
        "textcat",
        config={
            "exclusive_classes": True,
            "architecture": hyperparams['architecture'],
        }
    )
    nlp.add_pipe(textcat, last=True)
# otherwise, get it, so we can add labels to it
else:
    textcat = nlp.get_pipe("textcat")

# add label to text classifier; the return values are bound to `_` so the
# notebook cell does not echo them
_= textcat.add_label("POSITIVE")
_= textcat.add_label("NEGATIVE")

In [None]:
# load the IMDB dataset
print("Loading IMDB data...")
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(
    limit=hyperparams['num_samples'],
    split=hyperparams['train_test_split'],
)
# Report the number of examples actually loaded rather than the requested
# limit: the two differ when the limit is 0 (load_data keeps everything) or
# larger than the dataset.
print(
    "Using {} examples ({} training, {} evaluation)".format(
        len(train_texts) + len(dev_texts), len(train_texts), len(dev_texts)
    )
)
# Pair each training text with its annotations in the {"cats": ...} form
# that the training loop unpacks and hands to nlp.update().
train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))

In [None]:
# sample train data: the first (text, {"cats": ...}) pair, shown as the cell output
train_data[:1]

In [None]:
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
print("other pipes:", other_pipes)
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    print("Training the model...")
    print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
    # One batch-size generator is shared by all epochs, so the size keeps
    # growing across epochs instead of restarting at 4 each time.
    batch_sizes = compounding(4.0, 32.0, 1.001)
    # The epoch count comes from the logged hyperparameters; the bare name
    # `n_iter` used previously was never defined and raised NameError.
    for epoch in range(hyperparams['n_iter']):
        losses = {}
        # batch up the examples using spaCy's minibatch
        random.shuffle(train_data)
        batches = minibatch(train_data, size=batch_sizes)
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=hyperparams['dropout'], losses=losses)
        with textcat.model.use_params(optimizer.averages):
            # evaluate on the dev data split off in load_data()
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        print(
            "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                losses["textcat"],
                scores["textcat_p"],
                scores["textcat_r"],
                scores["textcat_f"],
            )
        )
        # Record this epoch's metrics on the Verta run.
        run.log_observation('loss', losses['textcat'])
        run.log_observation('precision', scores['textcat_p'])
        run.log_observation('recall', scores['textcat_r'])
        run.log_observation('f_score', scores['textcat_f'])

## Saving models

In [None]:
# test the trained model: run one example through the full pipeline and
# print the predicted category scores from doc.cats
test_text = 'The Lion King was very entertaining. The movie was visually spectacular.'
doc = nlp(test_text)
print(test_text, doc.cats)
# store the pipeline on the Verta run under the key 'final_model'
run.log_model('final_model', nlp)

In [None]:
# test the logged model: fetch it back from the run under the key it was
# logged with, then run the same `test_text` through it and print doc2.cats
print("Loading from verta - ")
nlp2 = run.get_model('final_model')
doc2 = nlp2(test_text)
print(test_text, doc2.cats)

In [None]:
# logging model details
# `json` is imported here because no other cell imports it; without this
# the json.dumps() call below raised NameError.
import json

# Write the model info returned by spacy.info() to a text file as JSON, then
# attach that file to the Verta run under the key 'model_summary'.
summary = spacy.info(hyperparams['model'])
with open("model_summary.txt", "w") as text_file:
    text_file.write(json.dumps(summary))
run.log_artifact('model_summary', 'model_summary.txt')