In [1]:
import spacy
nlp = spacy.blank("en")
doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day.")

for token in doc:
    print(token)

Captain
america
ate
100
$
of
samosa
.
Then
he
said
I
can
do
this
all
day
.


In [2]:
nlp.pipe_names

[]

In [3]:
nlp = spacy.load("en_core_web_sm")
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [4]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7ed35cfc1360>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7ed35cfc28c0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7ed35d1fd4d0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7ed35d018280>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7ed35e1f9dc0>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7ed35d1fd690>)]

In [5]:
doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day.")

for token in doc:
  print(token, " | ", spacy.explain(token.pos_)," | ", token.lemma_)

Captain  |  proper noun  |  Captain
america  |  proper noun  |  america
ate  |  verb  |  eat
100  |  numeral  |  100
$  |  numeral  |  $
of  |  adposition  |  of
samosa  |  proper noun  |  samosa
.  |  punctuation  |  .
Then  |  adverb  |  then
he  |  pronoun  |  he
said  |  verb  |  say
I  |  pronoun  |  I
can  |  auxiliary  |  can
do  |  verb  |  do
this  |  pronoun  |  this
all  |  determiner  |  all
day  |  noun  |  day
.  |  punctuation  |  .


In [6]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")
for ent in doc.ents:
  print(ent.text, ent.label_)

Tesla Inc ORG
$45 billion MONEY


In [7]:
from spacy import displacy

displacy.render(doc, style = "ent")

In [10]:
!python -m spacy download fr_core_news_sm


Collecting fr-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m76.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [11]:
nlp = spacy.load("fr_core_news_sm")

doc = nlp("Tesla Inc va racheter Twitter pour $45 milliards de dollars")

for ent in doc.ents:
  print(ent.text," | ", ent.label_," | ", spacy.explain(ent.label_))

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
Twitter  |  MISC  |  Miscellaneous entities, e.g. events, nationalities, products or works of art


In [13]:
for d in doc:
  print(d," | ", token.pos_, " | ", token.lemma_)

Tesla  |  PUNCT  |  .
Inc  |  PUNCT  |  .
va  |  PUNCT  |  .
racheter  |  PUNCT  |  .
Twitter  |  PUNCT  |  .
pour  |  PUNCT  |  .
$  |  PUNCT  |  .
45  |  PUNCT  |  .
milliards  |  PUNCT  |  .
de  |  PUNCT  |  .
dollars  |  PUNCT  |  .


In [15]:
sourc_nlp = spacy.load("en_core_web_sm")
nlp = spacy.blank("en")
nlp.add_pipe("ner", source = sourc_nlp)
nlp.pipe_names

['ner']

In [16]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")
for ent in doc.ents:
  print(ent.text, ent.label_)

Tesla Inc ORG
$45 billion MONEY


In [17]:
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/,
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''


In [19]:
doc = nlp(text)
data_websites = [token.text for token in doc if token.like_url]
data_websites

['http://www.data.gov/',
 'http://www.science',
 'http://data.gov.uk/.',
 'http://www3.norc.org/gss+website/',
 'http://www.europeansocialsurvey.org/.']

In [20]:
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"
doc = nlp(transactions)
for token in doc:
  if token.like_num and doc[token.i+1].is_currency:
    print(token.text, doc[token.i+1].text)

two $
500 €
