In [2]:
import spacy
# Load the 'en_core_web_sm' English pipeline; `nlp` is reused by the cells below.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Run the pipeline over a single sentence and keep the result in `doc`.
doc = nlp(u'This is a sentence.')

In [4]:
# Bare expression so the notebook echoes `doc`; the output below shows its text.
doc

This is a sentence.

In [5]:
# Process a multi-sentence passage in one call; the adjacent string literals
# are concatenated into a single text before it is passed to `nlp`.
text = (u"When Sebastian Thrun started working on self-driving cars at "
        u"Google in 2007, few people outside of the company took him "
        u"seriously. “I can tell you very senior CEOs of major American "
        u"car companies would shake my hand and turn away because I wasn’t "
        u"worth talking to,” said Thrun, now the co-founder and CEO of "
        u"online higher education startup Udacity, in an interview with "
        u"Recode earlier this week.")
doc = nlp(text)

# Print every named-entity span found in the passage together with its label
# (only `doc.ents` is inspected here).
for entity in doc.ents:
    print(entity.text, entity.label_)

# Score how similar two short reviews are.
# NOTE(review): `nlp` was loaded from 'en_core_web_sm' above; spaCy's docs say
# the small models ship without word vectors, so this score may be less
# meaningful than with a md/lg model -- confirm before relying on it.
doc1 = nlp(u"my fries were super gross")
doc2 = nlp(u"such disgusting fries")
similarity = doc1.similarity(doc2)
print(doc1.text, doc2.text, similarity)

Sebastian Thrun PERSON
Google ORG
2007 DATE
American NORP
Thrun PERSON
Recode ORG
earlier this week DATE
my fries were super gross such disgusting fries 0.7139702518721635


In [1]:
#!/usr/bin/env python
# coding: utf8
"""Train a convolutional neural network text classifier on the
IMDB dataset, using the TextCategorizer component. The dataset will be loaded
automatically via Thinc's built-in dataset loader. The model is added to
spacy.pipeline, and predictions are available via `doc.cats`. For more details,
see the documentation:
* Training: https://spacy.io/usage/training

Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import thinc.extra.datasets

import spacy
from spacy.util import minibatch, compounding


In [2]:
# Start from a blank English Language object rather than a pretrained model.
# This rebinds `nlp`, so later cells operate on the blank pipeline.
nlp = spacy.blank('en')
print("Created blank 'en' model")

Created blank 'en' model


In [3]:
def load_data(limit=0, split=0.8):
    """Load a shuffled sample of the IMDB training data and split it in two.

    Args:
        limit: Maximum number of examples to keep after shuffling. ``0``
            (the default) or any non-positive value keeps every example.
        split: Fraction of the kept examples placed in the first (training)
            partition; the remainder forms the evaluation partition.

    Returns:
        ``((train_texts, train_cats), (dev_texts, dev_cats))``. Each texts
        value is a tuple and each cats value is a list of
        ``{'POSITIVE': bool}`` dicts aligned with the texts. If the dataset
        is empty, both partitions are empty.
    """
    # Only the training portion of the dataset is used; the evaluation
    # partition is carved out of it below.
    train_data, _ = thinc.extra.datasets.imdb()
    random.shuffle(train_data)
    # Apply the cap explicitly instead of relying on `[-0:]` returning the
    # whole list; a negative limit would otherwise drop leading examples.
    if limit > 0:
        train_data = train_data[-limit:]
    # zip(*[]) yields nothing to unpack, so short-circuit the empty case.
    if not train_data:
        return ((), []), ((), [])
    texts, labels = zip(*train_data)
    cats = [{'POSITIVE': bool(y)} for y in labels]
    # Keep the `split` fraction intact and use a separate name for the index.
    cut = int(len(train_data) * split)
    return (texts[:cut], cats[:cut]), (texts[cut:], cats[cut:])

In [5]:
n_texts = 2000  # number of IMDB examples to sample; 0 makes load_data keep all

# Add the text classifier to the pipeline if it is not there yet;
# otherwise fetch the existing component so labels can be added to it.
if 'textcat' not in nlp.pipe_names:
    textcat = nlp.create_pipe('textcat')
    nlp.add_pipe(textcat, last=True)
else:
    textcat = nlp.get_pipe('textcat')

# add label to text classifier
textcat.add_label('POSITIVE')

# load the IMDB dataset
print("Loading IMDB data...")
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
# Report the number of examples actually loaded rather than the requested
# limit: the two differ when n_texts is 0 or exceeds the dataset size.
print("Using {} examples ({} training, {} evaluation)"
      .format(len(train_texts) + len(dev_texts),
              len(train_texts), len(dev_texts)))
# Pair each training text with its annotations wrapped as {'cats': ...}.
train_data = list(zip(train_texts,
                      [{'cats': cats} for cats in train_cats]))

# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']

Loading IMDB data...


NameError: name 'n_texts' is not defined

In [8]:
# NOTE(review): these statements repeat the tail of the previous cell (the
# summary message, `train_data` and `other_pipes`); this cell depends on
# `train_texts`, `train_cats`, `dev_texts` and `nlp` already being defined.
# Report the number of examples actually loaded rather than the requested
# limit, so the message stays correct when the two differ.
print("Using {} examples ({} training, {} evaluation)"
      .format(len(train_texts) + len(dev_texts),
              len(train_texts), len(dev_texts)))
# Pair each training text with its annotations wrapped as {'cats': ...}.
train_data = list(zip(train_texts,
                      [{'cats': cats} for cats in train_cats]))

# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']

Using 2000 examples (1600 training, 400 evaluation)


In [14]:
# Spot-check one training pair: a (text, {'cats': {...}}) tuple.
# NOTE(review): load_data shuffles without a visible seed, so the example at
# this index presumably changes from run to run.
train_data[210]

("I was so entertained throughout this insightful documentary, and I waited a good while for this to come through the pipes (my local video chain), and it was worth the wait. I like a good documentary / special interest piece, but this was definitely a heartfelt, honest, and nostalgic, if you will, look back on adolescent life. The imagination of a child is fascinating, and that's where a great story begins. Rent it or buy it if you like a good, humorous, and all around entertaining documentary. Mr. Stein and company have definitely come a long way from neighborhood Video CamCorder productions of bank hold-ups, and gay-rings that turn people gay from one glance. They all seem rather successful in they're respectful fields, and it was good to know that they are all still good friends. The DVD has a few extra trailers for other good documentaries, and it features a number of Darren's most notable productions, including, Crazy News.",
 {'cats': {'POSITIVE': True}})

In [15]:
# Show the pipeline as (name, component) pairs; the output below lists only 'textcat'.
nlp.pipeline

[('textcat', <spacy.pipeline.TextCategorizer at 0x1f096a15eb8>)]

In [16]:
# Same pipeline, component names only.
nlp.pipe_names

['textcat']