In [0]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Snippet: Self-made 3 Versions of Preprocessing Text

1. Bare Token: Sans grammer, sans semantics

```python
def token_cleaner(text):
    text = strip_multiple_whitespaces(text)
    text = remove_stopwords(text)
    text = strip_numeric(text) 
    text = strip_non_alphanum(text)
    text = strip_punctuation(text)
    text = strip_short(text, minsize=3)
    text = [ tok.lemma_.lower().strip() for tok in nlp(text, disable=['tagger', 'parser', 'ner']) ]
    text = [ tok for tok in text if tok not in SYMBOLS and tok not in STOPLIST ]
    return ' '.join(text)
```

2. Lemmas: Retain grammar and semantcis

```python
def token_cleaner(text):
    text = strip_multiple_whitespaces(text)
    text = strip_non_alphanum(text)
    text = strip_punctuation(text)
    text = strip_short(text, minsize=3) # optional
    text = [ tok.lemma_.lower().strip() for tok in nlp(text, disable=['tagger', 'parser', 'ner']) ]
    text = [ tok for tok in text if tok not in SYMBOLS ]
    return ' '.join(text)
```

3. Clean Text: Remove non-text only

```python
def token_cleaner(text):
    text = strip_multiple_whitespaces(text)
    text = strip_non_alphanum(text)
    text = strip_punctuation(text)
    text = strip_short(text, minsize=3) # optional
    text = [ tok.text.lower().strip() for tok in nlp(text, disable=['tagger', 'parser', 'ner']) ]
    text = [ tok for tok in text if tok not in SYMBOLS ]
    return ' '.join(text)
```

### SpaCy Trick to speed up above by applying on whole text

```python
def doc_to_spans(list_of_texts, join_string=' ||| '):
    all_docs = nlp(' ||| '.join(list_of_texts))
    split_inds = [i for i, token in enumerate(all_docs) if token.text == '|||'] + [len(all_docs)]
    new_docs = [all_docs[(i + 1 if i > 0 else i):j] for i, j in zip([0] + split_inds[:-1], split_inds)]
    return new_docs 
```

# Snippet: Train Text Classification (TextCat pipe)

### Step by step guide

1. Load the model you want to start with, or create an empty model using spacy.blank with the ID of your language. If you’re using an existing model, make sure to disable all other pipeline components during training using nlp.disable_pipes. This way, you’ll only be training the text classifier.
2. Add the text classifier to the pipeline, and add the labels you want to train – for example, POSITIVE.
3. Load and pre-process the dataset, shuffle the data and split off a part of it to hold back for evaluation. This way, you’ll be able to see results on each training iteration.
4. Loop over the training examples and partition them into batches using spaCy’s minibatch and compounding helpers.
5. Update the model by calling nlp.update, which steps through the examples and makes a prediction. It then consults the annotations to see whether it was right. If it was wrong, it adjusts its weights so that the correct prediction will score higher next time.
6. Optionally, you can also evaluate the text classifier on each iteration, by checking how it performs on the development data held back from the dataset. This lets you print the precision, recall and F-score.
7. Save the trained model using nlp.to_disk.
8. Test the model to make sure the text classifier works as expected.

In [0]:
import plac
import random
from pathlib import Path
import thinc.extra.datasets

import spacy
from spacy.util import minibatch, compounding

In [0]:
!python -m spacy download en_core_web_md

Collecting en_core_web_md==2.1.0a7 from https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.1.0a7/en_core_web_md-2.1.0a7.tar.gz#egg=en_core_web_md==2.1.0a7
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.1.0a7/en_core_web_md-2.1.0a7.tar.gz (95.4MB)
[K    100% |████████████████████████████████| 95.4MB 2.5MB/s 
[?25hInstalling collected packages: en-core-web-md
  Found existing installation: en-core-web-md 2.0.0
    Uninstalling en-core-web-md-2.0.0:
      Successfully uninstalled en-core-web-md-2.0.0
  Running setup.py install for en-core-web-md ... [?25ldone
[?25hSuccessfully installed en-core-web-md-2.1.0a7
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [0]:
!python -m spacy validate

⠙ Loading compatibility table...[2K[38;5;2m✔ Loaded compatibility table[0m
[1m
[38;5;4mℹ spaCy installation: /usr/local/lib/python3.6/dist-packages/spacy[0m

TYPE      NAME                MODEL               VERSION                              
package   en-vectors-web-lg   en_vectors_web_lg   [38;5;2m2.1.0a0[0m   [38;5;2m✔[0m
package   en-core-web-sm      en_core_web_sm      [38;5;1m2.0.0[0m     --> 2.1.0a7   
package   en-core-web-md      en_core_web_md      [38;5;2m2.1.0a7[0m   [38;5;2m✔[0m
package   en-core-web-lg      en_core_web_lg      [38;5;2m2.1.0a7[0m   [38;5;2m✔[0m
link      en                  en_core_web_sm      [38;5;1m2.0.0[0m     --> 2.1.0a7   
link      en_core_web_md      en_core_web_md      [38;5;2m2.1.0a7[0m   [38;5;2m✔[0m
link      en_vectors_web_lg   en_vectors_web_lg   [38;5;2m2.1.0a0[0m   [38;5;2m✔[0m

[1m
Use the following commands to update the model packages:
python -m spacy download en_core_web_sm

You may also want to overw

In [0]:
!python -m spacy download en_vectors_web_lg

Collecting en_vectors_web_lg==2.1.0a0 from https://github.com/explosion/spacy-models/releases/download/en_vectors_web_lg-2.1.0a0/en_vectors_web_lg-2.1.0a0.tar.gz#egg=en_vectors_web_lg==2.1.0a0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_vectors_web_lg-2.1.0a0/en_vectors_web_lg-2.1.0a0.tar.gz (661.8MB)
[K    100% |████████████████████████████████| 661.8MB 123.4MB/s 
[?25hInstalling collected packages: en-vectors-web-lg
  Found existing installation: en-vectors-web-lg 2.0.0
    Uninstalling en-vectors-web-lg-2.0.0:
      Successfully uninstalled en-vectors-web-lg-2.0.0
  Running setup.py install for en-vectors-web-lg ... [?25ldone
[?25hSuccessfully installed en-vectors-web-lg-2.1.0a0
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_vectors_web_lg')


### GITHUB EXAMPLE

In [0]:
# SAMPLE DATA LOADING

def load_data(limit=0, split=0.8):
    """Load data from the IMDB dataset."""
    # Partition off part of the train data for evaluation
    train_data, _ = thinc.extra.datasets.imdb()
    random.shuffle(train_datam)
    train_data = train_data[-limit:]
    texts, labels = zip(*train_data)
    cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])

  
# EVAL FUNC

def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 0.0  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 0.0  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if label == "NEGATIVE":
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.0
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.0
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    if (precision + recall) == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}

  # WRAPPING WHOLE PROCESS AS MAIN
# Key steps are commented

def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None):
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # KEY: Use or Make "textcat" pipe
    
    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat",
            config={
                "exclusive_classes": True,
                "architecture": "simple_cnn",
            }
        )
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe("textcat")

    # KEY: Add Label
    
    # add label to text classifier
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")

    # load the IMDB dataset
    print("Loading IMDB data...")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data()
    train_texts = train_texts[:n_texts]
    train_cats = train_cats[:n_texts]
    print(
        "Using {} examples ({} training, {} evaluation)".format(
            n_texts, len(train_texts), len(dev_texts)
        )
    )
    train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))

    
    # KEY: Disabling other pipes
    # Note logic of nlp.begin_training()
    # Load model.tok2vec
    # Batching using minibatch
    # nlp.update() is main training func
    # extract params using use_param()
    
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        if init_tok2vec is not None:
            with init_tok2vec.open("rb") as file_:
                textcat.model.tok2vec.from_bytes(file_.read())
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )

    # Test Snippet
    test_text = "This movie sucked"
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is not None:
        with nlp.use_params(optimizer.averages):
            nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)

In [0]:
main() # not ran

### OWN EXAMPLE

In [0]:
# Kaggle Project Example

# Params
n_iter=20
n_texts=2000
init_tok2vec=None

In [0]:
nlp = spacy.load('en_vectors_web_lg')

In [0]:
nlp.pipe_names # assert no built-in textcat pipeline (actually nothing as blank model)

[]

In [0]:
# create pipeline
textcat = nlp.create_pipe("textcat", config={"exclusive_classes": True, "archtecture": "simple_cnn"})

nlp.add_pipe(textcat, last=True) # place at last in pipeline

# add custom label for clf
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

1

In [0]:
# prepare kaggle text - shuffle and split

kaggle_df = pd.read_csv('kaggle_train.csv')
kaggle_df = kaggle_df.sample(frac=1).reset_index(drop=True)
kaggle_df.head(), kaggle_df.shape[0] * 0.8, kaggle_df.shape[0] * 0.9

In [0]:
train_text, train_cat = kaggle_df['sentences'][:6824].tolist(), kaggle_df['sentiment'][:6824].tolist()
train_cat = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in train_cat]
dev_text, dev_cat = kaggle_df['sentences'][6824:7677].tolist(), kaggle_df['sentiment'][6824:7677].tolist()
dev_cat = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in dev_cat]

In [0]:
# leave test as binary for prediction metrics
test_text, test_cat = kaggle_df['sentences'][7677:].tolist(), kaggle_df['sentiment'][7677:].tolist()

In [0]:
# bundle train as tuple for later
train_data = list(zip(train_text, [{'cats': cat} for cat in train_cat]))

In [0]:
# EVAL func: Precision, Recall, F1-score

def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 0.0  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 0.0  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if label == "NEGATIVE":
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.0
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.0
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    if (precision + recall) == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}

In [0]:
# Main training code
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    
    # DID NOT WORK (due to mismatch parameters between 
    # how .bin was trained and this custom model
    # already pretrained binary vector weight to use 
    # tok2vec_bin = 'spacy-pretrain-polyaxon/lmao-imdb-1k/weights/model832.bin'
    # with open(tok2vec_bin, 'rb') as file_:
    #    textcat.model.tok2vec.from_bytes(file_.read())
    
    print("Training the model...")
    print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
    batch_sizes = compounding(4.0, 32.0, 1.001)
    for i in range(n_iter):
        losses = {}
        # batch up the examples using spaCy's minibatch
        random.shuffle(train_data)
        batches = minibatch(train_data, size=batch_sizes)
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
        with textcat.model.use_params(optimizer.averages):
            # evaluate on the dev data split off in load_data()
            scores = evaluate(nlp.tokenizer, textcat, dev_text, dev_cat)
        print(
            "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                losses["textcat"],
                scores["textcat_p"],
                scores["textcat_r"],
                scores["textcat_f"],
            )
        )

Training the model...
LOSS 	  P  	  R  	  F  
15.183	0.681	0.670	0.675
0.937	0.730	0.693	0.711
0.209	0.749	0.719	0.734
0.069	0.746	0.743	0.745
0.040	0.729	0.736	0.732
0.031	0.726	0.726	0.726
0.028	0.721	0.708	0.714
0.021	0.723	0.722	0.723
0.019	0.719	0.724	0.722
0.015	0.725	0.745	0.735
0.013	0.719	0.750	0.734
0.011	0.720	0.752	0.736
0.011	0.704	0.745	0.724
0.009	0.708	0.738	0.723
0.007	0.717	0.745	0.731
0.007	0.716	0.748	0.731
0.006	0.716	0.750	0.733
0.005	0.711	0.762	0.736
0.004	0.709	0.759	0.733
0.004	0.702	0.745	0.723


In [0]:
# save and load model

with nlp.use_params(optimizer.averages):
    nlp.to_disk('./')

nlp2 = spacy.load('./')

In [0]:
# re-merge test text and cat into dataframe

test_df = pd.DataFrame(list(zip(test_text, test_cat)), columns=['sentences', 'sentiment'])

In [0]:
test_df.head()

Unnamed: 0,sentences,sentiment
0,i weep for the future when a good portion of t...,0
1,"it's a trifle of a movie , with a few laughs s...",0
2,a disoriented but occasionally disarming saga ...,1
3,the film's 45-minute running time stops shy of...,1
4,haneke challenges us to confront the reality o...,1


In [0]:
# sample prediction result

nlp2(test_df.iloc[0, 0]).cats

{'NEGATIVE': 0.0034787263721227646, 'POSITIVE': 0.9965212345123291}

In [0]:
# predict and convert text

test_df['pred'] = test_df['sentences'].apply(lambda text: 1 if nlp2(text).cats.get('POSITIVE') > 0.5 else 0)

In [0]:
from sklearn.metrics import f1_score, accuracy_score

print('Accuracy =======>> {}'.format(accuracy_score(test_df['sentiment'], test_df['pred'])))
print('F1 Score =======>> {}'.format(f1_score(test_df['sentiment'], test_df['pred'])))



### Official: Pretrain Vectors for Textcat

In case needing python3.7

```shell
sudo apt-get update
sudo apt-get install build-essential libpq-dev libssl-dev openssl libffi-dev zlib1g-dev
sudo apt-get install python3-pip python3-dev
sudo apt-get install python3.7
```

In [0]:
# debugger - breakpoint
from pdb import set_trace as bp

In [0]:
# Pretrain experimental in spacy-nightly
!pip install spacy-nightly



In [0]:
"""This script is experimental.

Try pre-training the CNN component of the text categorizer using a cheap
language modelling-like objective. Specifically, we load pre-trained vectors
(from something like word2vec, GloVe, FastText etc), and use the CNN to
predict the tokens' pre-trained vectors. This isn't as easy as it sounds:
we're not merely doing compression here, because heavy dropout is applied,
including over the input words. This means the model must often (50% of the time)
use the context in order to predict the word.

To evaluate the technique, we're pre-training with the 50k texts from the IMDB
corpus, and then training with only 100 labels. Note that it's a bit dirty to
pre-train with the development data, but also not *so* terrible: we're not using
the development labels, after all --- only the unlabelled text.

@plac.annotations(
    width=("Width of CNN layers", "positional", None, int),
    embed_size=("Embedding rows", "positional", None, int),
    pretrain_iters=("Number of iterations to pretrain", "option", "pn", int),
    train_iters=("Number of iterations to pretrain", "option", "tn", int),
    train_examples=("Number of labelled examples", "option", "eg", int),
    vectors_model=("Name or path to vectors model to learn from"),
)

"""
import plac
import random
import spacy
import thinc.extra.datasets
from spacy.util import minibatch, use_gpu, compounding
import tqdm
from spacy._ml import Tok2Vec
from spacy.pipeline import TextCategorizer
import numpy

In [0]:
pretrain_iters=30
train_iters=30
train_examples=1000

In [0]:
# Using md model as base
!python -m spacy download en_core_web_md

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [0]:
!python -m spacy link en_core_web_md en

[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_md -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [0]:
!python -m spacy info

[1m

spaCy version    2.1.0a13                      
Location         /usr/local/lib/python3.6/dist-packages/spacy
Platform         Linux-4.14.79+-x86_64-with-Ubuntu-18.04-bionic
Python version   3.6.7                         
Models           en                            



In [0]:
!python -m spacy validate

⠙ Loading compatibility table...[2K[38;5;2m✔ Loaded compatibility table[0m
[1m
[38;5;4mℹ spaCy installation: /usr/local/lib/python3.6/dist-packages/spacy[0m

TYPE      NAME             MODEL            VERSION                              
package   en-core-web-sm   en_core_web_sm   [38;5;1m2.0.0[0m     --> 2.1.0a7   
package   en-core-web-md   en_core_web_md   [38;5;2m2.1.0a7[0m   [38;5;2m✔[0m
link      en               en_core_web_md   [38;5;2m2.1.0a7[0m   [38;5;2m✔[0m

[1m
Use the following commands to update the model packages:
python -m spacy download en_core_web_sm



In [0]:
# Load pretrain data - un-labelled

def load_texts(limit=0):
  train, dev = thinc.extra.datasets.imdb()
  train_texts, train_labels = zip(*train)
  dev_texts, dev_labels = zip(*train)
  train_texts = list(train_texts)
  dev_texts = list(dev_texts)
  random.shuffle(train_texts)
  random.shuffle(dev_texts)
  if limit >= 1:
      return train_texts[:limit]
  else:
      return list(train_texts) + list(dev_texts)

In [0]:
temp_text = load_texts(limit=0)

In [0]:
temp_text[:5]

['Just finished watching this movie for maybe the 7th or 8th time, picked it up one night previously viewed at Blockbuster and absolutely loved it, I\'ve shown it to 4 people so far and they have enjoyed it as well. Avoid of all the Hollywood glamour, special effects and stress on the "shock factor", this independent film by Paul F. Ryan hits the nail on the head in dealing with the after affects of traumatic situations. Taking place after a high school shooting, two characters Alicia (Busy Philipps) and Deanna (Erika Christensen) form an unlikely bond. Alicia, the girl with the stone heart, the Goth who has a pessimistic attitude to life assists Deanna to overcome the issues of life and death and living in the aftermath. Meanwhile Deanna attempts to help Alicia to see some of the softness and light in the world again. Not stressing on the shocking event of the shooting, but on the interpersonal relationships amongst those who survived it sets this movie apart. Despite its low-budget a

In [0]:
# Load Textcat pipe train-dev data - LABELLED 

def load_textcat_data(limit=0):
    """Load data from the IMDB dataset."""
    # Partition off part of the train data for evaluation
    train_data, eval_data = thinc.extra.datasets.imdb()
    random.shuffle(train_data)
    train_data = train_data[-limit:]
    texts, labels = zip(*train_data)
    eval_texts, eval_labels = zip(*eval_data)
    cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
    eval_cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in eval_labels]
    return (texts, cats), (eval_texts, eval_cats)

In [0]:
temp_train, temp_eval = load_textcat_data()

In [0]:
# labels 
temp_train[1][:5]

[{'NEGATIVE': True, 'POSITIVE': False},
 {'NEGATIVE': True, 'POSITIVE': False},
 {'NEGATIVE': False, 'POSITIVE': True},
 {'NEGATIVE': False, 'POSITIVE': True},
 {'NEGATIVE': True, 'POSITIVE': False}]

In [0]:
def prefer_gpu():
    used = spacy.util.use_gpu(0)
    if used is None:
        return False
    else:
        import cupy.random

        cupy.random.seed(0)
        return True

In [0]:
random.seed(0)
numpy.random.seed(0)
use_gpu = prefer_gpu()
print("Using GPU?", use_gpu)

Using GPU? True


In [0]:
# Textcat model construct

def build_textcat_model(tok2vec, nr_class, width):
    from thinc.v2v import Model, Softmax, Maxout
    from thinc.api import flatten_add_lengths, chain
    from thinc.t2v import Pooling, sum_pool, mean_pool, max_pool
    from thinc.misc import Residual, LayerNorm
    from spacy._ml import logistic, zero_init

    with Model.define_operators({">>": chain}):
        model = (
            tok2vec
            >> flatten_add_lengths
            >> Pooling(mean_pool)
            >> Softmax(nr_class, width)
        )
    model.tok2vec = tok2vec
    return model

In [0]:
# Create NLP or model object

def create_pipeline(width, embed_size, vectors_model):
    print("Load vectors")
    nlp = spacy.load(vectors_model)
    print("Start training")
    textcat = TextCategorizer(
        nlp.vocab,
        labels=["POSITIVE", "NEGATIVE"],
        model=build_textcat_model(
            Tok2Vec(width=width, embed_size=embed_size), 2, width
        ),
    )

    nlp.add_pipe(textcat)
    return nlp

In [0]:
nlp = create_pipeline(width=300, embed_size=7500, vectors_model='en')

Load vectors
Start training


In [0]:
nlp

<spacy.lang.en.English at 0x7fa0e5ee5eb8>

In [0]:
# no idea what for this FN
def block_gradients(model):
    from thinc.api import wrap

    def forward(X, drop=0.0):
        Y, _ = model.begin_update(X, drop=drop)
        return Y, None

    return wrap(forward, model)

# Main FN for pretraining "tensorizer" pipeline using texts
def train_tensorizer(nlp, texts, dropout, n_iter):
    tensorizer = nlp.create_pipe("tensorizer")
    nlp.add_pipe(tensorizer)
    optimizer = nlp.begin_training()
    for i in range(n_iter):
        losses = {}
        for i, batch in enumerate(minibatch(tqdm.tqdm(texts))):
            docs = [nlp.make_doc(text) for text in batch]
            tensorizer.update(docs, None, losses=losses, sgd=optimizer, drop=dropout)
        print(losses)
    return optimizer

For GPU support, we're grateful to use the work of Chainer's cupy module, which provides a numpy-compatible interface for GPU arrays. However, installing Chainer when no GPU is available currently causes an error. We therefore do not list Chainer as an explicit dependency — so building Thinc for GPU requires some extra steps:



In [0]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sat_Aug_25_21:08:01_CDT_2018
Cuda compilation tools, release 10.0, V10.0.130


In [0]:
# Seems right version of CuPy needed
pip install cupy-cuda100



In [0]:
# Bunch of CLI for asserting the right CUDA for THINIC GPU implementation
# Optional?? 

!ls /usr/local/cuda -a
!export CUDA_HOME=/usr/local/cuda # Or wherever your CUDA is
!export PATH=$PATH:$CUDA_HOME/bin
!pip install chainer
!python -c "import cupy; assert cupy" # Check it installed
!pip install thinc_gpu_ops thinc # Or `thinc[cuda]`
!python -c "import thinc_gpu_ops" # Check the GPU ops were built

**ERROR**

- CuPy dtype error
  - seems to be Colab env-dep issues
  - But dimension error still occurs 
  - Perhaps due to incorrect dimension or width and embed_size hyperparams 
  - These unknown as not given in GitHub source
  
- Solution
  - Not able to use this snippet
  - Resort to only TextCat training as above without pretrain this way
  - Could still pretrain using CLI model method (see later)

In [0]:
optimizer = train_tensorizer(nlp, temp_text, dropout=0.2, n_iter=pretrain_iters)

  0%|          | 0/50000 [00:00<?, ?it/s]


ValueError: ignored

In [0]:
# in case above failed on "Unsupported dtype object"
# remove "tensorizer" from pipeline
# redo above

nlp.pipeline
nlp.remove_pipe('tensorizer')
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7f12f504f860>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7f12f4f5fd68>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7f12f4f5fdc8>),
 ('textcat', <spacy.pipeline.pipes.TextCategorizer at 0x7f12ca155940>)]

In [0]:
def train_textcat(nlp, n_texts, n_iter=10):
    textcat = nlp.get_pipe("textcat")
    tok2vec_weights = textcat.model.tok2vec.to_bytes()
    (train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)
    print(
        "Using {} examples ({} training, {} evaluation)".format(
            n_texts, len(train_texts), len(dev_texts)
        )
    )
    train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        textcat.model.tok2vec.from_bytes(tok2vec_weights)
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        for i in range(n_iter):
            losses = {"textcat": 0.0}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(tqdm.tqdm(train_data), size=2)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )

In [0]:
def evaluate_textcat(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 1e-8
    fp = 1e-8
    tn = 1e-8
    fn = 1e-8
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.0
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.0
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}

In [0]:
train_textcat(nlp, train_examples, n_iter=train_iters)

Using 1000 examples (1000 training, 25000 evaluation)


  0%|          | 0/1000 [00:00<?, ?it/s]

Training the model...
LOSS 	  P  	  R  	  F  


100%|██████████| 1000/1000 [00:33<00:00, 34.55it/s]
  0%|          | 4/1000 [00:00<00:30, 32.31it/s]

64.268	0.728	0.728	0.728


100%|██████████| 1000/1000 [00:29<00:00, 34.93it/s]
  0%|          | 4/1000 [00:00<00:28, 34.91it/s]

35.027	0.770	0.770	0.770


100%|██████████| 1000/1000 [00:29<00:00, 34.88it/s]
  0%|          | 4/1000 [00:00<00:28, 35.21it/s]

13.984	0.770	0.770	0.770


100%|██████████| 1000/1000 [00:29<00:00, 34.20it/s]
  0%|          | 4/1000 [00:00<00:29, 33.81it/s]

4.842	0.766	0.766	0.766


100%|██████████| 1000/1000 [00:29<00:00, 34.99it/s]
  0%|          | 4/1000 [00:00<00:32, 30.72it/s]

5.686	0.767	0.767	0.767


100%|██████████| 1000/1000 [00:30<00:00, 33.11it/s]
  0%|          | 4/1000 [00:00<00:28, 35.10it/s]

3.089	0.767	0.767	0.767


100%|██████████| 1000/1000 [00:29<00:00, 34.44it/s]
  0%|          | 4/1000 [00:00<00:27, 35.60it/s]

2.700	0.766	0.766	0.766


100%|██████████| 1000/1000 [00:28<00:00, 34.72it/s]
  0%|          | 4/1000 [00:00<00:27, 35.61it/s]

1.260	0.767	0.767	0.767


100%|██████████| 1000/1000 [00:28<00:00, 35.56it/s]
  0%|          | 4/1000 [00:00<00:29, 34.24it/s]

1.039	0.769	0.769	0.769


100%|██████████| 1000/1000 [00:29<00:00, 33.53it/s]
  0%|          | 4/1000 [00:00<00:31, 32.07it/s]

1.055	0.767	0.767	0.767


100%|██████████| 1000/1000 [00:28<00:00, 34.68it/s]
  0%|          | 4/1000 [00:00<00:30, 32.68it/s]

0.283	0.766	0.766	0.766


100%|██████████| 1000/1000 [00:29<00:00, 33.89it/s]
  0%|          | 4/1000 [00:00<00:27, 35.87it/s]

0.133	0.763	0.763	0.763


100%|██████████| 1000/1000 [00:29<00:00, 33.53it/s]
  0%|          | 4/1000 [00:00<00:28, 35.28it/s]

0.000	0.762	0.762	0.762


100%|██████████| 1000/1000 [00:30<00:00, 32.50it/s]
  0%|          | 4/1000 [00:00<00:28, 35.41it/s]

0.532	0.763	0.763	0.763


100%|██████████| 1000/1000 [00:29<00:00, 33.54it/s]
  0%|          | 4/1000 [00:00<00:27, 35.81it/s]

0.530	0.764	0.764	0.764


100%|██████████| 1000/1000 [00:29<00:00, 33.52it/s]
  0%|          | 4/1000 [00:00<00:29, 34.20it/s]

1.864	0.763	0.763	0.763


100%|██████████| 1000/1000 [00:29<00:00, 33.56it/s]
  0%|          | 4/1000 [00:00<00:28, 35.54it/s]

0.946	0.763	0.763	0.763


100%|██████████| 1000/1000 [00:30<00:00, 33.02it/s]
  0%|          | 4/1000 [00:00<00:28, 35.28it/s]

1.888	0.763	0.763	0.763


100%|██████████| 1000/1000 [00:29<00:00, 34.06it/s]
  0%|          | 4/1000 [00:00<00:29, 34.27it/s]

1.233	0.763	0.763	0.763


100%|██████████| 1000/1000 [00:28<00:00, 34.53it/s]
  0%|          | 4/1000 [00:00<00:28, 34.59it/s]

1.355	0.763	0.763	0.763


100%|██████████| 1000/1000 [00:29<00:00, 35.41it/s]
  0%|          | 4/1000 [00:00<00:27, 35.91it/s]

1.162	0.764	0.764	0.764


100%|██████████| 1000/1000 [00:29<00:00, 33.76it/s]
  0%|          | 4/1000 [00:00<00:29, 34.02it/s]

2.624	0.764	0.764	0.764


100%|██████████| 1000/1000 [00:28<00:00, 35.65it/s]
  0%|          | 4/1000 [00:00<00:28, 35.06it/s]

2.251	0.766	0.766	0.766


100%|██████████| 1000/1000 [00:28<00:00, 35.99it/s]
  0%|          | 4/1000 [00:00<00:27, 35.58it/s]

2.028	0.765	0.765	0.765


100%|██████████| 1000/1000 [00:28<00:00, 34.81it/s]
  0%|          | 4/1000 [00:00<00:30, 32.50it/s]

1.879	0.764	0.764	0.764


100%|██████████| 1000/1000 [00:28<00:00, 35.65it/s]
  0%|          | 4/1000 [00:00<00:27, 35.95it/s]

1.419	0.763	0.763	0.763


100%|██████████| 1000/1000 [00:28<00:00, 34.83it/s]
  0%|          | 4/1000 [00:00<00:27, 35.71it/s]

2.702	0.762	0.762	0.762


100%|██████████| 1000/1000 [00:28<00:00, 34.78it/s]
  0%|          | 4/1000 [00:00<00:31, 31.65it/s]

1.412	0.761	0.761	0.761


100%|██████████| 1000/1000 [00:28<00:00, 34.53it/s]
  0%|          | 4/1000 [00:00<00:30, 32.45it/s]

1.339	0.762	0.762	0.762


100%|██████████| 1000/1000 [00:29<00:00, 35.76it/s]


2.509	0.763	0.763	0.763


In [0]:
    # test the trained model
    test_text = "This movie sucked"
    doc = nlp(test_text)
    print(test_text, doc.cats)

KeyError: ignored

# Snippet:  2.0 custom pipelines and extensions

> Available: download, link, info, train, evaluate, convert, package,
    vocab, init-model, profile, validate

In [0]:
import spacy
from spacy.tokens import Doc

In [0]:
Doc.set_extension('is_greeting', default=False)
nlp = spacy.load('en')
doc = nlp(u'hello world')
doc._.doc_extensions

# ._ create extensibility and distinction to built-ins, code-break resilient upon update
doc._.is_greeting = True 

In [0]:
# Customise Processing Pipeline (same nlp() as above)

component = MyComponent() # See below for INIT

nlp.add_pipe(component, after='tagger')

doc = nlp(u'This is a sentence')

**The nlp object is an instance of Language, which contains the data and annotation scheme of the language you're using and a pre-defined pipeline of components, like the tagger, parser and entity recognizer. If you're loading a model, the Language instance also has access to the model's binary data. All of this is specific to each model, and defined in the model's meta.json – for example, a Spanish NER model requires different weights, language data and pipeline components than an English parsing and tagging model. This is also why the pipeline state is always held by the Language class. spacy.load() puts this all together and returns an instance of Language with a pipeline set and access to the binary data.**

```python
doc = nlp.make_doc(u'This is a sentence')   # create a Doc from raw text
for name, proc in nlp.pipeline:             # iterate over components in order
    doc = proc(doc)                         # call each component on the Doc
```

**spaCy 2.0 simply list of (name, function) tuple**

```python
nlp.pipeline
[('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>),
 ('ner', <spacy.pipeline.EntityRecognizer>)]
```

To make it more convenient to modify the pipeline, there are several built-in methods to get, add, replace, rename or remove individual components. spaCy's default pipeline components, like the tagger, parser and entity recognizer now all follow the same, consistent API and are subclasses of `Pipe`. If you're developing your own component, using the Pipe API will make it fully trainable and serializable. At a minimum, a component needs to be a callable that takes a Doc and returns it:

```python
def my_component(doc):
    print("The doc is {} characters long and has {} tokens."
          .format(len(doc.text), len(doc))
    return doc
```

The component can then be added at any position of the pipeline using the `nlp.add_pipe()` method. The arguments `before, after, first, and last` let you specify component names to insert the new component before or after, or tell spaCy to insert it first (i.e. directly after tokenization) or last in the pipeline.

```python
nlp = spacy.load('en')
nlp.add_pipe(my_component, name='print_length', last=True)
doc = nlp(u"This is a sentence.")
```

**Extension attributes on Doc, Token and Span**

When you implement your own pipeline components that modify the `Doc`, you often want to extend the API, so that the information you're adding is conveniently accessible. spaCy v2.0 introduces a new mechanism that lets you register your own attributes, properties and methods that become available in the `._` namespace, for example, `doc._.my_attr`. There are mostly three types of extensions that can be registered via the `set_extension()`` method:
**Why ._?**
Writing to a ._ attribute instead of to the Doc directly keeps a clearer separation and makes it easier to ensure backwards compatibility. For example, if you've implemented your own .coref property and spaCy claims it one day, it'll break your code. Similarly, just by looking at the code, you'll immediately know what's built-in and what's custom – for example, doc.sentiment is spaCy, while doc._.sent_score isn't.

1. Attribute extensions. Set a default value for an attribute, which can be overwritten.
2. Property extensions. Define a `getter` and an optional `setter` function.
3. Method extensions. Assign a function that becomes available as an object method.

```python
Doc.set_extension('hello_attr', default=True)
Doc.set_extension('hello_property', getter=get_value, setter=set_value)
Doc.set_extension('hello_method', method=lambda doc, name: 'Hi {}!'.format(name))

doc._.hello_attr            # True
doc._.hello_property        # return value of get_value
doc._.hello_method('Ines')  # 'Hi Ines!'
```

**WHY Extensions?**

Being able to easily write custom data to the `Doc, Token and Span` means that applications using spaCy can take full advantage of the built-in data structures and the benefits of Doc objects as the **single source of truth** containing all information:

- No information is lost during tokenization and parsing, so you can always relate annotations to the original string.
- The Token and Span are views of the Doc, so they're always up-to-date and consistent.
- Efficient C-level access is available to the underlying TokenC* array via doc.c.
- APIs can standardise on passing around Doc objects, reading and writing from them whenever necessary. Fewer signatures makes functions more reusable and composable.

**TODO - learn these examples of custom componets**

- https://explosion.ai/blog/spacy-v2-pipelines-extensions
- https://github.com/explosion/spaCy/blob/develop/examples/pipeline/custom_component_countries_api.py
- https://github.com/explosion/spaCy/blob/develop/examples/pipeline/custom_component_entities.py
- https://github.com/explosion/spaCy/blob/develop/examples/pipeline/custom_attr_methods.py
- https://github.com/explosion/spaCy/blob/develop/examples/pipeline/custom_sentence_segmentation.py
- https://github.com/explosion/spaCy/blob/develop/examples/pipeline/fix_space_entities.py
- https://github.com/explosion/spaCy/blob/develop/examples/pipeline/multi_processing.py

# Snippet: 2.1 Pretrain Overview (experimental)

Scaling down these language models to the sizes we use in spaCy posed an interesting research challenge. Language models typically use a large output layer, with one neuron per word in the vocabulary. If you're predicting over a 10,000 word vocabulary, this means you're predicting a vector with 10,000 elements. spaCy v2.1's token vectors are 96 elements wide, so a naive softmax approach would be unlikely to work: we'd be trying to predict 100 elements of output for every 1 element of input. We could make the vocabulary somewhat smaller, but every word that's out of vocabulary is a word the pretraining process will be unable to learn. Stepping back a little, the problem of so-called "one hot" representations posing representational issues for neural networks is actually quite familiar. This is exactly what algorithms like word2vec, GloVe and FastText set out to solve. Instead of a binary vector with one dimension per entry in the vocabulary, we can have a much denser real-valued representation of the same information.

> The spacy pretrain command requires a **word vectors model as part of the input**, which it uses as the target output for each token. Instead of predicting a token's ID as a classification problem, we learn to predict the token's word vector. Inspired by names such as ELMo and BERT, we've termed this trick Language Modelling with Approximate Outputs (LMAO). Our first implementation is probably a good way to get acquainted with the idea – it's extremely short.

As is often the case in research, it seems that LMAO is an idea whose time had come. Several other researchers have been working on related ideas independently. So far we've been using L2 loss in our experiments, but Kumar and Tsvetkov (2018), who were simultaneously working on a similar idea for machine translation, have developed a novel probabilistic loss using the von Mises-Fisher distribution, which they show performs significantly better than L2 in their experiments. Even more recently, Li et al. (2019) report experiments using an LMAO objective in place of the softmax layer in the ELMo pretraining system, with promising results. In our own preliminary experiments, we've found pretraining especially effective when limited training data is available. It helps most for text categorization and parsing, but is less effective for named entity recognition. We expect the pretraining to be increasingly important as we add more abstract semantic prediction models to spaCy, for tasks such as semantic role labelling, coreference resolution and named entity linking.

### Official: Pretrain 100,000 Reddit Comments on Core_sm and Core_lg

```shell
# Pretrain for the en_core_web_sm model. The sm model doesn't require the word vectors
# at runtime, while the lg model does.
python -m spacy pretrain /input/reddit-100k.jsonl en_vectors_web_lg /output

# Pretrain for the en_core_web_lg model
python -m spacy pretrain /input/reddit-100k.jsonl en_vectors_web_lg /output --use-vectors
```

We ran both pretraining jobs simultaneously on a Tesla V100, with each task training at around 50,000 tokens per second. We pretrained for 3 billion words (making several passes over the 100k comments), which took around 17 hours. The total cost of both jobs came out to about 40USD on Google Compute Engine. We haven't implemented resume logic yet, which will help decrease the cost of large scale jobs further, as it would allow the use of pre-emptible instances. This would take pretraining costs down to around 4USD per billion words of training. The spacy pretrain command saves out a weights file after each pass over the data. To use the pretrained weights, we can simply pass them as an argument to spacy train


```shell
python -m spacy train en /models/ /corpora/PTB_SD_3_3_0/train.gold.json
/corpora/PTB_SD_3_3_0/dev.gold.json --n-examples 100 --pipeline parser
--init-tok2vec pretrain-nv-model999.bin
```

### Honnibal: How to Use Pretain resulting .bin weights

If you're using the spacy train command, you can pass the trained weights with the -t2v argument. Here's an example command: 

```shell
python -m spacy train en tmp/ ~/data/PTB_SD_3_3_0/train.gold.json ~/data/PTB_SD_3_3_0/dev.gold.json --pipeline parser -t2v nv-cosine-reddit17-99.bin
```

More fundamentally, the weights files being produced by the spacy pretrain command are the result of calling something like **textcat.model.tok2vec.to_bytes().** There's a couple of things to keep in mind about this. Let's say you first create a blank **TextCategorizer** object, like this:

```python
textcat = nlp.create_pipe("textcat")
```

The new textcat object will start out with **textcat.model == True**. So, you won't immediately be able to call something like **textcat.model.tok2vec**. You need to create a model object first, which usually happens during the call to **nlp.begin_training()**. So, somewhere in your script after the call to begin_training(), but before you start making updates, you can add lines like this:

```python
with open("/path/to/model999.bin", "rb") as file_:
    textcat.model.tok2vec.from_bytes(file_.read())
```

NOTE: One more thing to be aware of. You need to make sure that the textcat.model.tok2vec instance you're loading the weights into has the same architecture and hyper-parameters as the model you used during pre-training. With the default textcat hyper-parameters, that's not quite true. **The easiest solution is to pass the flag architecture="simple_cnn".**

```python
# full code

nlp = spacy.blank("en")
textcat = nlp.create_pipe("textcat", config={"architecture": "simple_cnn", "exclusive_classes": True}))
textcat.add_label("LABEL1")
textcat.add_label("LABEL2")
# Alternatively, instead of adding all your labels explicitly, you could pass all your examples
# into textcat.begin_training, like this: textcat.begin_training(get_gold_tuples=lambda: my_data)
# It's fine to add the labels and not pass in the data, though. The nlp.begin_training() method will
# work the same as well, if you have other components in your pipeline you want to train.
optimizer = textcat.begin_training()
# Now that we have our model, we can load in the pretrained weights.
with open(path_to_pretrained_weights, "rb") as file_:
    textcat.model.tok2vec.from_bytes(file_.read())
# Now we can proceed with training
for epoch in range(nr_epoch):
    random.shuffle(train_data)
    for batch in minibatch(train_data, size=batch_size):
        X, y = zip(*batch)
        textcat.update(X, y, sgd=optimizer)
```

**NOTE LIMITATION of LMAO** by Honnibal

> For dialogue generation, I'm actually not sure the spacy pretrain command will perform well. One weakness of the "LMAO" trick we're using is that at each word, the model only gets to predict one vector. Normally, a language model will be predicting a probability distribution over the words in the vocabulary. We're only predicting a single point in vector-space. For the purpose of getting a token representation, I think the LMAO trade-off is pretty good. But if you really do want to generate words, it's probably not so good.

### DEMO: PRETRAIN (CLI)

- See above 2.1 snippet for example

- Input
  - raw text in JOSNL format (see separate note)
  - language model (pure vector.txt may not work?? tutorials are all based on spaCy-style model files)
  - parameters
  
```python
import srsly
data = [{"text": "Some text"}, {"text": "More..."}]
srsly.write_jsonl("/path/to/text.jsonl", data)
```

**HONNIBAL NOTE**
https://github.com/explosion/spaCy/issues/3448#issuecomment-475046413

**Excellent Tutorial**
https://tienduccao.github.io/posts/spacy_pretrain/

**generate_josonl.py** # handy csv-jsonl converter

```python
with open('corpus.jsonl', 'a+') as out:
    with open('dataset.csv') as f:
        for line in f.read().splitlines():
            sentence = line[line.index(',') + 1:]
            sentence = sentence.replace('"', '')
            out.write('{"text": ' + '"' + sentence + '"}\n')
```

In [0]:
# Test using IMDB raw text to pretran "en_core_web_sm"

# First convert text into right JSONL for `spacy pretrain`

# method-1 using above script (need editing)

with open('imdb.jsonl', 'a+') as out:
    with open('imdb_master.csv', encoding='latin-1') as f:
        for line in f.read().splitlines():
            sentence = line[line.index(',') + 1:]
            sentence = sentence.replace('"', '')
            out.write('{"text": ' + '"' + sentence + '"}\n')

In [0]:
# method-2 using spacy srsly
import pandas as pd

imdb_df = pd.read_csv('imdb_master.csv', usecols=[1,2,3], encoding='latin-1')
imdb_df.head()
import srsly
imdb_jsonl = [{"text": sent} for sent in imdb_df['review'].values]
imdb_jsonl[:5]
srsly.write_jsonl("imdb_jsonl.jsonl", imdb_jsonl)

**BELOW PRETRAIN OOM**

- Honnibal explained to a OOM problem

> Unfortunately a single 130kb text could be causing OOM on your card. The vectors are already take 1gb, and the intermediate representations can get pretty big. For instance, a single convolutional layer has to build a matrix of shape (n, 3, 3, 96) where n is the number of words in the batch. There are several of these layers, and then the parser's hidden layer ends up large as well.

> There are lots of places in the forward pass I think I can save memory if the backward pass isn't being run. But currently it can take a surprising amount of temporary memory during parsing. This is especially true on GPU, as the cupy library has its own allocator.

In [0]:
# need nightly
!pip install spacy-nightly

In [0]:
!python -m spacy download en_vectors_web_lg # lg has 1m vectors

Collecting en_vectors_web_lg==2.0.0 from https://github.com/explosion/spacy-models/releases/download/en_vectors_web_lg-2.0.0/en_vectors_web_lg-2.0.0.tar.gz#egg=en_vectors_web_lg==2.0.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_vectors_web_lg-2.0.0/en_vectors_web_lg-2.0.0.tar.gz (661.8MB)
[K    100% |████████████████████████████████| 661.8MB 59.2MB/s 
[?25hInstalling collected packages: en-vectors-web-lg
  Running setup.py install for en-vectors-web-lg ... [?25ldone
[?25hSuccessfully installed en-vectors-web-lg-2.0.0

[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/en_vectors_web_lg -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/en_vectors_web_lg

    You can now load the model via spacy.load('en_vectors_web_lg')



In [0]:
!python -m spacy validate # check version of vector model compatibility to pretrain

In [0]:
!python -m spacy pretrain imdb_jsonl.jsonl en_vectors_web_lg weights 

[38;5;4mℹ Using GPU[0m
[38;5;2m✔ Saved settings to config.json[0m
[2K[38;5;2m✔ Loaded input texts[0m
⠙ Loading model 'en_vectors_web_lg'...tcmalloc: large alloc 1285169152 bytes == 0x27944000 @  0x7fe0eb1711e7 0x7fe0e8cfbe51 0x7fe0e8d65b25 0x7fe0e8d666be 0x7fe0e8dff6ee 0x5030d5 0x507641 0x504c28 0x502540 0x502f3d 0x507641 0x504c28 0x502540 0x502f3d 0x506859 0x504c28 0x506369 0x7fe058245bc8 0x7fe05825338d 0x5a730c 0x503073 0x506859 0x7fe0582437e0 0x7fe058245cdc 0x7fe0582521c3 0x566103 0x7fe0594f871d 0x5030d5 0x506859 0x504c28 0x502540
⠴ Loading model 'en_vectors_web_lg'...
Traceback (most recent call last):
  File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.6/dist-packages/spacy/__main__.py", line 38, in <module>
    plac.call(commands[command], sys.argv[1:])
  File "/usr/local/lib/python3.6/dist-packages/plac_core.py

# DEMO

**Key demo (Norwagian language model creation) https://github.com/explosion/spaCy/issues/3082**

## TRAIN Language Model
https://spacy.io/usage/training

Flow of Training
- Creating a vocabulary file
  - spaCy expects that common words will be cached in a Vocab instance. The vocabulary caches lexical features. spaCy loads the vocabulary from binary data, in order to keep loading efficient. The easiest way to save out a new binary vocabulary file is to use the spacy init-model command, which expects a JSONL file with words and their lexical attributes. See the docs on the vocab JSONL format for details.
- Training the word vectors
  - Word2vec and related algorithms let you train useful word similarity models from unlabeled text. This is a key part of using deep learning for NLP with limited labeled data. The vectors are also useful by themselves – they power the .similarity methods in spaCy. For best results, you should pre-process the text with spaCy before training the Word2vec model. This ensures your tokenization will match. You can use our word vectors training script, which pre-processes the text with your language-specific tokenizer and trains the model using Gensim. The vectors.bin file should consist of one word and vector per line.
  - https://github.com/explosion/spacy/tree/master/bin/train_word_vectors.py
  - If you don’t have a large sample of text available, you can also convert word vectors produced by a variety of other tools into spaCy’s format. See the docs on converting word vectors for details.
- Creating or converting a training corpus
  - The easiest way to train spaCy’s tagger, parser, entity recognizer or text categorizer is to use the spacy train command-line utility. In order to use this, you’ll need training and evaluation data in the JSON format spaCy expects for training.
  - You can now train the model using a corpus for your language annotated with If your data is in one of the supported formats, the easiest solution might be to use the spacy convert command-line utility. This supports several popular formats, including the IOB format for named entity recognition, the JSONL format produced by our annotation tool Prodigy, and the CoNLL-U format used by the Universal Dependencies corpus.
  - One thing to keep in mind is that spaCy expects to train its models from whole documents, not just single sentences. If your corpus only contains single sentences, spaCy’s models will never learn to expect multi-sentence documents, leading to low performance on real text. To mitigate this problem, you can use the -N argument to the spacy convert command, to merge some of the sentences into longer pseudo-documents.
- Training the tagger and parser
  - Once you have your training and evaluation data in the format spaCy expects, you can train your model use the using spaCy’s train command. Note that training statistical models still involves a degree of trial-and-error. You may need to tune one or more settings, also called “hyper-parameters”, to achieve optimal performance. See the usage guide on training for more details.
  


1. From scratch 
2. Update on existing model


> Both can be preceded by **Pretrain**

### (1) From Scratch (CLI or Code)

**CLI method**
- Input
  - **Annotated format - supports several popular formats, including the IOB format for named entity recognition, the JSONL format produced by our annotation tool Prodigy, and the CoNLL-U format used by the Universal Dependencies corpus.**
  - `spacy convert` into spaCy JSON format
- Example:

```shell
git clone https://github.com/UniversalDependencies/UD_Spanish-AnCora
mkdir ancora-json
python -m spacy convert UD_Spanish-AnCora/es_ancora-ud-train.conllu ancora-json
python -m spacy convert UD_Spanish-AnCora/es_ancora-ud-dev.conllu ancora-json
mkdir models
python -m spacy train es models ancora-json/es_ancora-ud-train.json ancora-json/es_ancora-ud-dev.json
```

**Simple code method (Preferred)**

> Instead of sequences of `Doc and GoldParse` objects, you can also use the “simple training style” and **pass raw texts and dictionaries of annotations to nlp.update.** The dictionaries can have the **keys entities, heads, deps, tags and cats.** This is generally recommended, as it removes one layer of abstraction, and avoids unnecessary imports. It also makes it easier to structure and load your training data.

- Example Annotations

```json
{
   "entities": [(0, 4, "ORG")],
   "heads": [1, 1, 1, 5, 5, 2, 7, 5],
   "deps": ["nsubj", "ROOT", "prt", "quantmod", "compound", "pobj", "det", "npadvmod"],
   "tags": ["PROPN", "VERB", "ADP", "SYM", "NUM", "NUM", "DET", "NOUN"],
   "cats": {"BUSINESS": 1.0},
}
```

- Simple Training Loop

```python
TRAIN_DATA = [
        (u"Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}),
        (u"Google rebrands its business apps", {"entities": [(0, 6, "ORG")]})]

nlp = spacy.blank('en')
optimizer = nlp.begin_training()
for i in range(20):
    random.shuffle(TRAIN_DATA)
    for text, annotations in TRAIN_DATA:
        nlp.update([text], [annotations], sgd=optimizer)
nlp.to_disk("/model")
```

> The above training loop leaves out a few details that can really improve accuracy – but the principle really is that simple. Once you’ve got your pipeline together and you want to tune the accuracy, you usually want to process your training examples in batches, and experiment with minibatch sizes and dropout rates, set via the drop keyword argument. See the Language and Pipe API docs for available options.





#### NER



**(1) BUILT-IN ENTITY**

**Blank Model or Load Built-in**

**(2) CUSTOM ENTITY**

**Training an additional entity type** \

> **In practice, you’ll need many more — a few hundred would be a good start. You will also likely need to mix in examples of other entity types, which might be obtained by running the entity recognizer over unlabelled sentences, and adding their annotations to the training set.**

In [0]:
# Training text

# Sample article from BBC

from goose3 import Goose

In [0]:
g = Goose()

In [0]:
article = g.extract(url='https://www.bbc.com/news/world-africa-47997729')

In [0]:
clean_text = article.cleaned_text

In [0]:
# Using sentencizer component (rule-based matcher)

sent_nlp = spacy.load('en_core_web_sm')
sentencizer = sent_nlp.create_pipe('sentencizer')
sent_nlp.add_pipe(sentencizer)
sent_doc = sent_nlp(clean_text)
sent_text = [sent for sent in sent_doc.sents]

In [34]:
sent_text[0].text

"A large hoard of cash has been found at the home of Sudan's ousted president Omar al-Bashir and he is now being investigated for money laundering, prosecutors say."

In [0]:
sent1 = sent_text[0].text

In [72]:
# finding loc of words
sent1.find('Sudan') # start 
print()
sent1.find('Sudan') + len('Sudan') # end
print()

# finding loc of words
sent1.find('Omar al-Bashir') # start 
print()
sent1.find('Omar al-Bashir') + len('Omar al-Bashir') # end
print()

# finding loc of words
sent1.find('cash') # start 
print()
sent1.find('cash') + len('cash') # end
print()

# finding loc of words
sent1.find('money') # start 
print()
sent1.find('money') + len('money') # end

52




57




77




91




17




21




129




134

In [0]:
sent2 = sent_text[4].text

In [49]:
# finding loc of words
sent2.find('Bashir') # start 
print()
sent2.find('Bashir') + len('Bashir') # end

250




256

In [0]:
sent3 = sent_text[6].text

In [68]:
# finding loc of words
sent3.find('Dabanga') # start 
print()
sent3.find('Dabanga') + len('Dabanga') # end
print()

# finding loc of words
sent3.find('cash') # start 
print()
sent3.find('cash') + len('cash') # end

64




71




154




158

In [0]:
sent4 = sent_text[-8].text

In [77]:
# finding loc of words
sent4.find('Bashir') # start 
print()
sent4.find('Bashir') + len('Bashir') # end
print()

# finding loc of words
sent4.find('cash') # start 
print()
sent4.find('cash') + len('cash') # end
print()

# finding loc of words
sent4.find('money') # start 
print()
sent4.find('money') + len('money') # end

55




61




111




115




88




93

In [0]:
sent5 = sent_text[7].text

In [76]:
# finding loc of words
sent5.find('Dabanga') # start 
print()
sent5.find('Dabanga') + len('Dabanga') # end
print()

# finding loc of words
sent5.find('money') # start 
print()
sent5.find('money') + len('money') # end

25




32




6




11

In [0]:
TRAIN_DATA = [
    # instead u'raw text' using sent1 made above
    (sent1, {'entities': [
        (17, 21, 'MONEY'),
        (52, 57, 'LOC'),
        (77, 91, 'PERSON'),
        (129, 134, 'MONEY')
    ]}),
    (sent2, {'entities': [
        (250, 256, 'PERSON')
    ]}),
    (sent3, {'entities': [
        (64, 71, 'ORG'),
        (154, 158, 'MONEY')
    ]}),
    (sent4, {'entities': [
        (55, 61, 'PERSON'),
        (88, 93, 'MONEY'),
        (111, 115, 'MONEY')
    ]}),
    (sent5, {'entities': [
        (6, 11, 'MONEY'),
        (25, 32, 'ORG')
    ]}),
]

In [0]:
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

In [0]:
def main(TRAIN_DATA, new_model_name, model=None, output_dir=None, n_iter=100):
  
    """Load the model, set up the pipeline and train the entity recognizer."""
    random.seed(0)
    
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")


    # loop add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
            
    # add separately
    # ner.add_label(LABEL)
    # Adding extraneous labels shouldn't mess anything up
    # ner.add_label("VEGETABLE")        
    
    # RESET & INIT weights randomly ONLY if training New Model
    if model is None:
        optimizer = nlp.begin_training()
    #else:
        #optimizer = nlp.resume_training() # only added V2.1 so if 2.0 delete this line
    move_names = list(ner.move_names) # for asserting consistency later

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    
    with nlp.disable_pipes(*other_pipes):  # only train NER
      sizes = compounding(1.0, 4.0, 1.001) # make compounding iterator
      for itn in range(n_iter):
          random.shuffle(TRAIN_DATA)
          losses = {}
          # batch up the examples using spaCy's minibatch
          batches = minibatch(TRAIN_DATA, size=sizes)
          for batch in batches:
              texts, annotations = zip(*batch)
              nlp.update(
                  texts,  # batch of texts
                  annotations,  # batch of annotations
                  # sgd=optimizer, # use new or resumed weights (not used here given 2.0)
                  drop=0.5,  # dropout - make it harder to memorise data
                  losses=losses,
              )
          print("Losses", losses)

    #return nlp



    # OPTIONAL (if not returning nlp)
    
    # TEST the trained model (same text??)
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # SAVE and LOAD
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)


        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        # Check the classes have loaded back consistently
        assert nlp2.get_pipe("ner").move_names == move_names
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

In [91]:
# test run without returning nlp (auto-save-load-model and test on same text)

main(TRAIN_DATA, new_model_name='money', model='en_core_web_sm', output_dir='./test_NER', n_iter=100)

Loaded model 'en_core_web_sm'
Losses {'ner': 38.339719703242835}
Losses {'ner': 28.49583713815015}
Losses {'ner': 25.69930442805708}
Losses {'ner': 23.29914193543904}
Losses {'ner': 23.915318146356157}
Losses {'ner': 13.630135039003822}
Losses {'ner': 13.909836814990843}
Losses {'ner': 11.130936764457838}
Losses {'ner': 8.164399355363376}
Losses {'ner': 7.844817149682352}
Losses {'ner': 8.527891957652677}
Losses {'ner': 3.7605356525216154}
Losses {'ner': 2.2046004479703805}
Losses {'ner': 0.6606404264126979}
Losses {'ner': 2.087620396172873}
Losses {'ner': 0.5788417879586264}
Losses {'ner': 1.4673347812180273}
Losses {'ner': 1.6129887940178806}
Losses {'ner': 2.5845509727339437}
Losses {'ner': 1.3948534087868443}
Losses {'ner': 2.5414286015773406}
Losses {'ner': 0.0008206936510949034}
Losses {'ner': 1.334384932249169}
Losses {'ner': 0.0755024031798681}
Losses {'ner': 0.2669488861907792}
Losses {'ner': 2.0548288943347335}
Losses {'ner': 1.8820700421148364}
Losses {'ner': 0.4520683824751