In [1]:
import plac
import random
from pathlib import Path
import thinc.extra.datasets

import spacy
from spacy.util import minibatch, compounding

In [20]:

def load_data(limit=0, split=0.8, train_data=None):
    """Shuffle labelled examples and split them into train and dev portions.

    Args:
        limit: Keep only the last ``limit`` examples after shuffling. ``0``
            (the default) keeps every example.
        split: Fraction of the kept examples that goes into the training
            portion; the rest is returned as the evaluation portion.
        train_data: Sequence of ``(text, label)`` pairs; a truthy label is
            treated as positive. When ``None``, the IMDB training set is
            fetched via ``thinc.extra.datasets.imdb()``.

    Returns:
        ``((train_texts, train_cats), (dev_texts, dev_cats))``. The texts are
        tuples and the cats are lists of ``{'POSITIVE': bool}`` dicts. Empty
        input yields empty splits.
    """
    if train_data is None:
        print('using imdb')
        train_data, _ = thinc.extra.datasets.imdb()

    # Shuffle a private copy: shuffling in place would silently reorder the
    # caller's list and make repeated calls depend on earlier ones.
    train_data = list(train_data)
    random.shuffle(train_data)

    # Nothing to preview or split; zip(*[]) below could not be unpacked.
    if not train_data:
        return ((), []), ((), [])

    # Preview one (shuffled) example as a quick sanity check
    print(train_data[0])

    # -0 == 0, so limit=0 slices from the start and keeps everything
    train_data = train_data[-limit:]
    texts, labels = zip(*train_data)
    cats = [{'POSITIVE': bool(y)} for y in labels]
    # Partition off part of the data for evaluation
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])


def evaluate(tokenizer, textcat, texts, cats):
    """Compute precision, recall and F-score of the text classifier.

    Args:
        tokenizer: Callable turning a text into a doc object.
        textcat: Component whose ``pipe(docs)`` yields docs carrying a
            ``cats`` mapping of label -> score.
        texts: Iterable of raw texts to classify.
        cats: Gold annotations, one ``{label: value}`` dict per text, in the
            same order as ``texts``.

    Returns:
        Dict with the keys ``'textcat_p'``, ``'textcat_r'`` and
        ``'textcat_f'``. A score or gold value of at least 0.5 counts as
        positive. Labels missing from the gold dict are ignored.
    """
    docs = (tokenizer(text) for text in texts)
    tp = 0.0   # True positives
    fp = 1e-8  # False positives (epsilon keeps precision's denominator non-zero)
    fn = 1e-8  # False negatives (epsilon keeps recall's denominator non-zero)
    # True negatives are not counted: none of the returned metrics uses them.
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            predicted_positive = score >= 0.5
            gold_positive = gold[label] >= 0.5
            if predicted_positive and gold_positive:
                tp += 1.
            elif predicted_positive:
                fp += 1.
            elif gold_positive:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    # Without any true positive both values are exactly 0.0, and the harmonic
    # mean below would divide by zero; define the F-score as 0.0 in that case.
    if precision + recall == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}

In [9]:
# Smoke test of load_data(): no train_data is passed, so it falls back to the
# IMDB data, and limit=10 keeps only 10 examples after shuffling.
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=10)


("the movie sucked, it wasn't funny, it wasn't exciting. they tried to make it so bad that it would be good, but failed. and thinking it's cool to like this movie, next to the hype, are the only reasons that this movie is a success...\n\n\n\nthe fact that at this moment 50% voted a 10 out of 10 for this movie seems pretty concerning to me, either the movie going public is going insane or this vote is unrealistic which can have numerous causes, and should be dealt with. anyway it is a less than average movie which bloomed through mouth to mouth advertising. It's success can only be described as a marketing marvel.", 0)


In [50]:

# Build the labelled training set from the annotated sentence files in
# 'autocom/'. Every line of a file becomes one example.
# format:
# - positive: ('some text', 1)
# - negative: ('some text', 0)

# Read with an explicit encoding so the German umlauts decode the same way on
# every platform instead of depending on the locale default.
# NOTE(review): assumes the files are UTF-8 encoded - confirm.
with open('autocom/Definition.txt', encoding='utf-8') as f:
    definition = f.readlines()

# Lines from Definition.txt form the positive class
_train_data = [(text, 1) for text in definition]

# Definition.txt  Ergebnis.txt  Rechtsansicht.txt Streitbezogen.txt  Subsumtion.txt  Tatbestand.txt
# Every category other than Definition.txt serves as negative examples
negative_files = 'Ergebnis.txt,Rechtsansicht.txt,Streitbezogen.txt,Subsumtion.txt,Tatbestand.txt'.split(',')

for fn in negative_files:
    with open('autocom/' + fn, encoding='utf-8') as f:
        rows = f.readlines()

    _train_data.extend((text, 0) for text in rows)

# limit=0 keeps every example; load_data() shuffles and splits train/dev
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=0, train_data=_train_data)

(' Zudem legt das Gericht dem Arzt einseitig die Beweislast für den Vorgang des Aushandelns auf, obwohl es keine Möglichkeit zu vertraglicher Fixierung des Vorgangs gibt.\n', 0)


In [51]:
# Inspect the training labels returned by the most recent load_data() call
train_cats

[{'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': True},
 {'POSITIVE': True},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': True},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': True},
 {'POSITIVE': True},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': True},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': True},
 {'POSITIVE': True},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': False},
 {'POSITIVE': True},
 {'POSITIVE': False

In [49]:
# Inspect the evaluation labels returned by the most recent load_data() call
dev_cats

[{'POSITIVE': False}, {'POSITIVE': False}]

In [31]:
# Inspect the evaluation texts returned by the most recent load_data() call
dev_texts

('noticed over the years that when a rock star makes his final album before his death',
 'voted a 10 out of 10 for this movie seems pretty concerning to me')

In [25]:
# Number of passes over the training data made by the training loop
n_iter=20
# Load the pipeline named 'de_core_news_sm'
# (presumably spaCy's small German model; it must be installed - confirm)
nlp = spacy.load('de_core_news_sm')

In [53]:
# Make sure the pipeline has a 'textcat' component: create and append it when
# missing (nlp.create_pipe works for built-ins that are registered with spaCy)
if 'textcat' not in nlp.pipe_names:
    textcat = nlp.create_pipe('textcat')
    nlp.add_pipe(textcat, last=True)
# otherwise, get it, so we can add labels to it
else:
    textcat = nlp.get_pipe('textcat')

# Register the single label used by load_data() and evaluate()
textcat.add_label('POSITIVE')

# Pair every training text with its annotation dict {'cats': {...}}, the
# form in which the examples are handed to nlp.update() below
train_data = list(zip(train_texts,
                      [{'cats': cats} for cats in train_cats]))

# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    print("Training the model...")
    # Header of the per-epoch results table
    print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
    for i in range(n_iter):
        # Fresh loss accumulator for every epoch
        losses = {}
        # batch up the examples using spaCy's minibatch; batch sizes are
        # drawn from compounding(4., 32., 1.001)
        # NOTE(review): train_data is iterated in the same order every epoch;
        # consider reshuffling it at the start of each pass.
        batches = minibatch(train_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
        # Evaluate with optimizer.averages applied to the model
        # (presumably the averaged weights - confirm against the spaCy docs)
        with textcat.model.use_params(optimizer.averages):
            # evaluate on the dev data split off in load_data()
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
              .format(losses['textcat'], scores['textcat_p'],
                      scores['textcat_r'], scores['textcat_f']))


Training the model...
LOSS 	  P  	  R  	  F  
9.599	0.733	0.478	0.579
7.579	0.619	0.565	0.591
2.638	0.684	0.565	0.619
1.214	0.667	0.609	0.636
1.089	0.565	0.565	0.565
1.146	0.643	0.391	0.486
0.579	0.714	0.435	0.541
1.265	0.714	0.435	0.541
1.164	0.611	0.478	0.537
0.145	0.571	0.522	0.545
0.866	0.579	0.478	0.524
1.045	0.650	0.565	0.605
0.172	0.650	0.565	0.605
0.094	0.722	0.565	0.634
0.014	0.722	0.565	0.634
0.006	0.687	0.478	0.564
0.218	0.737	0.609	0.667
0.006	0.700	0.609	0.651
0.007	0.667	0.522	0.585
0.012	0.684	0.565	0.619
This movie sucked {'POSITIVE': 4.539787187241018e-05}


In [54]:
# Spot check: run the trained pipeline on one sentence and print the
# category scores stored in doc.cats
test_text = "Das ist nur der Fall, wenn die Beweiswürdigung des Strafgerichts den Gesetzen der Logik oder allgemein anerkannten Erfahrungssätzen widerspricht."
doc = nlp(test_text)
print(test_text, doc.cats)

Das ist nur der Fall, wenn die Beweiswürdigung des Strafgerichts den Gesetzen der Logik oder allgemein anerkannten Erfahrungssätzen widerspricht. {'POSITIVE': 0.9998307228088379}


In [55]:
# Spot check with a second sentence: run the trained pipeline and print the
# category scores stored in doc.cats
test_text = "Die Klägerin trägt die Kosten des Verfahrens."
doc = nlp(test_text)
print(test_text, doc.cats)

Die Klägerin trägt die Kosten des Verfahrens. {'POSITIVE': 4.539787187241018e-05}


In [56]:
# Directory (relative to the working directory) the pipeline is written to
output_dir = 'autocom_model'

# output_dir is hard-coded above, so this guard is always true in this cell
if output_dir is not None:
    # Serialize while optimizer.averages is applied to the pipeline
    # (presumably so the averaged weights are the ones saved - confirm)
    with nlp.use_params(optimizer.averages):
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

Saved model to autocom_model
