# Train spacy text classifier

In [5]:
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

In [6]:
# Change path to your data dir
data_dir = '../data/'

In [7]:

def load_data(train_data, limit=0, split=0.8):
    """Load data from the IMDB dataset."""
    # Partition off part of the train data for evaluation
    #if train_data is None:
    #    print('using imdb')
    #    train_data, _ = thinc.extra.datasets.imdb()

    random.shuffle(train_data)
    
    print(train_data[0])
    
    train_data = train_data[-limit:]
    texts, labels = zip(*train_data)
    cats = [{'POSITIVE': bool(y)} for y in labels]
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])


def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 0.0   # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 0.0   # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = 2 * (precision * recall) / (precision + recall)
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}

In [8]:
from spacy.lang.de.stop_words import STOP_WORDS

In [9]:
# Build training data

# format: 
# - positive: ('some text', 1)
# - negative: ('some text', 0)

with open('Definition.txt') as f:
    definition = f.readlines()

_train_data = [(text, 1) for text in definition]

# Definition.txt  Ergebnis.txt  Rechtsansicht.txt Streitbezogen.txt  Subsumtion.txt  Tatbestand.txt
negative_files = 'Ergebnis.txt,Rechtsansicht.txt,Streitbezogen.txt,Subsumtion.txt,Tatbestand.txt'.split(',')

for fn in negative_files:
    with open(fn) as f:
        rows = f.readlines()
        
        _train_data.extend([(text, 0) for text in rows])

(train_texts, train_cats), (dev_texts, dev_cats) = load_data(_train_data, limit=0)

('Mit diesem Bebauungsplanauszug stimmt die dem Verwaltungsgericht vorgelegte, mit dem Vermerk „überholter Stand“ bezeichnete „(teilweise schlecht bzw. nicht lesbare) Kopie des Bebauungsplans“ (vgl. S. 10 der Beschlussgründe) jedenfalls insoweit nicht überein, als auf ihr unter „Zeichenerklärung“ für das streitgegenständliche Bauquartier die Zeile „28°-32°, Traufhöhe talseits bis 6,00 m“ fehlt.\n', 0)


In [51]:
train_cats

[{'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': True},
 {'POSITIVE': True},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': True},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': True},
 {'POSITIVE': True},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': True},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': True},
 {'POSITIVE': True},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': True},
 {'POSITIVE': False

In [49]:
dev_cats

[{'POSITIVE': False}, {'POSITIVE': False}]

In [31]:
dev_texts

('noticed over the years that when a rock star makes his final album before his death',
 'voted a 10 out of 10 for this movie seems pretty concerning to me')

In [10]:
n_iter=20
nlp = spacy.load('de_core_news_sm')

In [11]:
# add the text classifier to the pipeline if it doesn't exist
# nlp.create_pipe works for built-ins that are registered with spaCy
if 'textcat' not in nlp.pipe_names:
    textcat = nlp.create_pipe('textcat')
    nlp.add_pipe(textcat, last=True)
# otherwise, get it, so we can add labels to it
else:
    textcat = nlp.get_pipe('textcat')

# add label to text classifier
textcat.add_label('POSITIVE')

train_data = list(zip(train_texts,
                      [{'cats': cats} for cats in train_cats]))

print(nlp.pipe_names)

# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    print("Training the model...")
    print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
    for i in range(n_iter):
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(train_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
        #continue
        with textcat.model.use_params(optimizer.averages):
            # evaluate on the dev data split off in load_data()
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
              .format(losses['textcat'], scores['textcat_p'],
                      scores['textcat_r'], scores['textcat_f']))


['tagger', 'parser', 'ner', 'textcat']
Training the model...
LOSS 	  P  	  R  	  F  


ZeroDivisionError: float division by zero

In [54]:
test_text = "Das ist nur der Fall, wenn die Beweiswürdigung des Strafgerichts den Gesetzen der Logik oder allgemein anerkannten Erfahrungssätzen widerspricht."
doc = nlp(test_text)
print(test_text, doc.cats)

Das ist nur der Fall, wenn die Beweiswürdigung des Strafgerichts den Gesetzen der Logik oder allgemein anerkannten Erfahrungssätzen widerspricht. {'POSITIVE': 0.9998307228088379}


In [55]:
test_text = "Die Klägerin trägt die Kosten des Verfahrens."
doc = nlp(test_text)
print(test_text, doc.cats)

Die Klägerin trägt die Kosten des Verfahrens. {'POSITIVE': 4.539787187241018e-05}


In [56]:
output_dir = 'autocom_model'

if output_dir is not None:
    with nlp.use_params(optimizer.averages):
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

Saved model to autocom_model
