In [2]:
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import sys
from pathlib import Path

import pandas as pd

import spacy
from spacy import displacy
from textacy.extract import ngrams, entities

In [3]:
DATA_DIR = Path('data')

In [4]:
# after spacy V3, not required "simlink". directly using is required.
nlp = spacy.load('en_core_web_sm')
type(nlp)

spacy.lang.en.English

In [5]:
spacy.info('en_core_web_sm')

{'lang': 'en',
 'name': 'core_web_sm',
 'version': '3.4.1',
 'description': 'English pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer.',
 'author': 'Explosion',
 'email': 'contact@explosion.ai',
 'url': 'https://explosion.ai',
 'license': 'MIT',
 'spacy_version': '>=3.4.0,<3.5.0',
 'spacy_git_version': '2b5f955c2',
 'vectors': {'width': 0, 'vectors': 0, 'keys': 0, 'name': None},
 'labels': {'tok2vec': [],
  'tagger': ['$',
   "''",
   ',',
   '-LRB-',
   '-RRB-',
   '.',
   ':',
   'ADD',
   'AFX',
   'CC',
   'CD',
   'DT',
   'EX',
   'FW',
   'HYPH',
   'IN',
   'JJ',
   'JJR',
   'JJS',
   'LS',
   'MD',
   'NFP',
   'NN',
   'NNP',
   'NNPS',
   'NNS',
   'PDT',
   'POS',
   'PRP',
   'PRP$',
   'RB',
   'RBR',
   'RBS',
   'RP',
   'SYM',
   'TO',
   'UH',
   'VB',
   'VBD',
   'VBG',
   'VBN',
   'VBP',
   'VBZ',
   'WDT',
   'WP',
   'WP$',
   'WRB',
   'XX',
   '_SP',
   '``'],
  'parser': ['ROOT',
   'acl',
   'acomp',


In [6]:
def get_attributes(f):
    print([a for a in dir(f) if not a.startswith('_')], end=' ')

get_attributes(nlp)

['Defaults', 'add_pipe', 'analyze_pipes', 'batch_size', 'begin_training', 'component', 'component_names', 'components', 'config', 'create_optimizer', 'create_pipe', 'create_pipe_from_source', 'default_config', 'default_error_handler', 'disable_pipe', 'disable_pipes', 'disabled', 'enable_pipe', 'evaluate', 'factories', 'factory', 'factory_names', 'from_bytes', 'from_config', 'from_disk', 'get_factory_meta', 'get_factory_name', 'get_pipe', 'get_pipe_config', 'get_pipe_meta', 'has_factory', 'has_pipe', 'initialize', 'lang', 'make_doc', 'max_length', 'meta', 'path', 'pipe', 'pipe_factories', 'pipe_labels', 'pipe_names', 'pipeline', 'rehearse', 'remove_pipe', 'rename_pipe', 'replace_listeners', 'replace_pipe', 'resume_training', 'select_pipes', 'set_error_handler', 'set_factory_meta', 'to_bytes', 'to_disk', 'tokenizer', 'update', 'use_params', 'vocab'] 

In [17]:
sample_text = 'Welcome to a new reality where nothing is real, writes WSJ columnist Joanna Stern. Adobe, Google and others have recently launched AI editing tools that can create high-quality artificial photos almost instantly—though some still have tell-tale mistakes like deformed hands. Adobe’s technology adds a “content credential” to indicate manipulation, but our ability to spot real photos might depend on the cooperation of the entire internet.'
doc = nlp(sample_text)

get_attributes(doc)

['cats', 'char_span', 'copy', 'count_by', 'doc', 'ents', 'extend_tensor', 'from_array', 'from_bytes', 'from_dict', 'from_disk', 'from_docs', 'from_json', 'get_extension', 'get_lca_matrix', 'has_annotation', 'has_extension', 'has_unknown_spaces', 'has_vector', 'is_nered', 'is_parsed', 'is_sentenced', 'is_tagged', 'lang', 'lang_', 'mem', 'noun_chunks', 'noun_chunks_iterator', 'remove_extension', 'retokenize', 'sentiment', 'sents', 'set_ents', 'set_extension', 'similarity', 'spans', 'tensor', 'text', 'text_with_ws', 'to_array', 'to_bytes', 'to_dict', 'to_disk', 'to_json', 'to_utf8_array', 'user_data', 'user_hooks', 'user_span_hooks', 'user_token_hooks', 'vector', 'vector_norm', 'vocab'] 

In [18]:
doc.is_parsed

True

In [19]:
doc.is_sentenced, doc.is_tagged, doc.text

(True,
 True,
 'Welcome to a new reality where nothing is real, writes WSJ columnist Joanna Stern. Adobe, Google and others have recently launched AI editing tools that can create high-quality artificial photos almost instantly—though some still have tell-tale mistakes like deformed hands. Adobe’s technology adds a “content credential” to indicate manipulation, but our ability to spot real photos might depend on the cooperation of the entire internet.')

In [20]:
get_attributes(doc.vocab)

['add_flag', 'cfg', 'deduplicate_vectors', 'from_bytes', 'from_disk', 'get_noun_chunks', 'get_vector', 'has_vector', 'lang', 'length', 'lex_attr_getters', 'lookups', 'morphology', 'prune_vectors', 'reset_vectors', 'set_vector', 'strings', 'to_bytes', 'to_disk', 'vectors', 'vectors_length', 'writing_system'] 

In [21]:
doc.vocab.length

827

In [22]:
pd.Series([token.text for token in doc])

0      Welcome
1           to
2            a
3          new
4      reality
        ...   
75          of
76         the
77      entire
78    internet
79           .
Length: 80, dtype: object

In [23]:
pd.DataFrame([[t.text, t.lemma_, t.pos_, t.tag_, t.dep_, t.shape_, t.is_alpha, t.is_stop] for t in doc],
             columns=['text', 'lemma', 'pos', 'tag', 'dep', 'shape', 'is_alpha', 'is_stop'])

Unnamed: 0,text,lemma,pos,tag,dep,shape,is_alpha,is_stop
0,Welcome,welcome,VERB,VBP,ccomp,Xxxxx,True,False
1,to,to,ADP,IN,prep,xx,True,True
2,a,a,DET,DT,det,x,True,True
3,new,new,ADJ,JJ,amod,xxx,True,False
4,reality,reality,NOUN,NN,pobj,xxxx,True,False
...,...,...,...,...,...,...,...,...
75,of,of,ADP,IN,prep,xx,True,True
76,the,the,DET,DT,det,xxx,True,True
77,entire,entire,ADJ,JJ,amod,xxxx,True,False
78,internet,internet,NOUN,NN,pobj,xxxx,True,False


In [24]:
# Visualize POS Dependencies
options = {'compact': True, 'bg': 'white', 'color': 'black', 'font': 'Source Sans Pro', 'notebook': True}
displacy.render(doc, style='dep', options=options)

In [26]:
# Visualize Named Entities
displacy.render(doc, style='ent', jupyter=True)

In [16]:
# pathlib.glob: get all existing files (of any kind, including directories) matching the given relative pattern
files = (DATA_DIR / 'bbc').glob('**/*.txt')
bbc_articles = []
for i, file in enumerate(sorted(list(files))):
    with file.open(encoding='latin1') as f:
        lines = f.readlines()
        body = ' '.join([l.strip() for l in lines[1:]]).strip()
        bbc_articles.append(body)
len(bbc_articles)

2225

In [27]:
bbc_articles[0]

'Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.  The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.  Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL\'s underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL\'s existing customers for high-

In [28]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [29]:
doc = nlp(bbc_articles[0])
type(doc)

spacy.tokens.doc.Doc

In [30]:
sentences = [s for s in doc.sents]
sentences[:3]

[Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.  ,
 The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales.,
 TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn.]

In [31]:
pd.DataFrame([[t.text, t.pos_, spacy.explain(t.pos_)] for t in sentences[0]], columns=['Token', 'POS Tag', 'Meaning']).head(10)

Unnamed: 0,Token,POS Tag,Meaning
0,Quarterly,ADJ,adjective
1,profits,NOUN,noun
2,at,ADP,adposition
3,US,PROPN,proper noun
4,media,NOUN,noun
5,giant,NOUN,noun
6,TimeWarner,PROPN,proper noun
7,jumped,VERB,verb
8,76,NUM,numeral
9,%,NOUN,noun


In [32]:
options = {'compact': True, 'bg': '#09a3d5', 'color': 'white', 'font': 'Source Sans Pro'}
displacy.render(sentences[0].as_doc(), style='dep', jupyter=True, options=options)

In [33]:
for t in sentences[0]:
    if t.ent_type_:
        print('{} | {} | {}'.format(t.text, t.ent_type_, spacy.explain(t.ent_type_)))

Quarterly | DATE | Absolute or relative dates or periods
US | GPE | Countries, cities, states
TimeWarner | ORG | Companies, agencies, institutions, etc.
76 | PERCENT | Percentage, including "%"
% | PERCENT | Percentage, including "%"
1.13bn | MONEY | Monetary values, including unit
the | DATE | Absolute or relative dates or periods
three | DATE | Absolute or relative dates or periods
months | DATE | Absolute or relative dates or periods
to | DATE | Absolute or relative dates or periods
December | DATE | Absolute or relative dates or periods
639 | MONEY | Monetary values, including unit
year | DATE | Absolute or relative dates or periods
- | DATE | Absolute or relative dates or periods
earlier | DATE | Absolute or relative dates or periods


In [34]:
displacy.render(sentences[0].as_doc(), style='ent', jupyter=True)

In [35]:
entities = [e.text for e in entities(doc)]
pd.Series(entities).value_counts().head()

TimeWarner        7
AOL               5
fourth quarter    3
2003              2
full-year         2
dtype: int64

In [49]:
entities

['Quarterly',
 'US',
 'TimeWarner',
 '76%',
 '1.13bn',
 'three months to December',
 '639',
 'year-earlier',
 'Google',
 'TimeWarner',
 'fourth quarter',
 '2%',
 '11.1bn',
 '10.9bn',
 'one',
 'Warner Bros',
 'AOL',
 'Time Warner',
 'Friday',
 '8%',
 'Google',
 'AOL',
 '464,000',
 'fourth quarter',
 'preceding three quarters',
 'AOL',
 '8%',
 'TimeWarner',
 'AOL',
 'TimeWarner',
 '2000',
 '2003',
 'US Securities Exchange Commission',
 'SEC',
 "Time Warner's",
 'fourth quarter',
 '27%',
 '284',
 'Alexander',
 'Catwoman',
 'year-earlier',
 'third',
 'Rings',
 'full-year',
 'TimeWarner',
 '3.36bn',
 '27%',
 '2003',
 '6.4%',
 '42.09bn',
 'full-year',
 'Richard Parsons',
 '2005',
 'TimeWarner',
 'around 5%',
 'TimeWarner',
 'AOL',
 'US',
 '$300m',
 'SEC',
 '500',
 'German',
 'Bertelsmann',
 'AOL Europe',
 'AOL Europe']

In [36]:
# N-Grams with textacy
pd.Series([n.text for n in ngrams(doc, n=2, min_freq=2)]).value_counts()

fourth quarter     3
Time Warner        2
quarter profits    2
company said       2
AOL Europe         2
dtype: int64

In [38]:
# The spaCy streaming Pipeline API
iter_texts = (bbc_articles[i] for i in range(len(bbc_articles)))
for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50, n_process=8)):
    if i % 100 == 0:
        print(i, end = ' ')
    assert doc.is_parsed

0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 

In [44]:
# Multi-language Features
en = 'en_core_web_sm'
es = 'es_core_news_sm'

model = {}
for language in [en, es]:
    model[language] = spacy.load(language)

# Read bilingual TED2013 samples
text = {}
path = Path('data', 'TED')
for language in ['en', 'es']:
    file_name = path /  f'TED2013_sample.{language}'
    text[language] = file_name.read_text()

# Sentence Boundaries English vs Spanish
parsed, sentences = {}, {}
for language in ['en', 'es']:
    mdl = en if language is 'en' else es
    # for language MDL using a parallel corpus
    parsed[language] = model[mdl](text[language])
    sentences[language] = list(parsed[language].sents)
    print('Sentences:', language, len(sentences[language]))

Sentences: en 19
Sentences: es 22


In [45]:
for i, (en, es) in enumerate(zip(sentences['en'], sentences['es']), 1):
    print('\n', i)
    print('English:\t', en)
    print('Spanish:\t', es)
    if i > 5:
        break


 1
English:	 There's a tight and surprising link between the ocean's health and ours, says marine biologist Stephen Palumbi.
Spanish:	 Existe una estrecha y sorprendente relaci처n entre nuestra salud y la salud del oc챕ano, dice el biologo marino Stephen Palumbi.

 2
English:	 He shows how toxins at the bottom of the ocean food chain find their way into our bodies, with a shocking story of toxic contamination from a Japanese fish market.
Spanish:	 Nos muestra, atrav챕s de una impactante historia acerca de la contaminaci처n t처xica en el mercado pesquero japon챕s, como las toxinas de la cadena alimenticia del fondo oce찼nico llegan a nuestro cuerpo.

 3
English:	 His work points a way forward for saving the oceans' health -- and humanity's. fish,health,mission blue,oceans,science 899 Stephen Palumbi:
Spanish:	 fish,health,mission blue,oceans,science 899 Stephen Palumbi: Siguiendo el camino del mercurio.

 4
English:	 Following the mercury trail It can be a very complicated thing, the ocean.
S

In [50]:
# POS Tagging English vs Spanish
pos = {}
for language in ['en', 'es']:
    pos[language] = pd.DataFrame([[t.text, t.pos_, spacy.explain(t.pos_)] for t in sentences[language][0]],
                                 columns=['Token', 'POS Tag', 'Meaning'])
bilingual_parsed = pd.concat([pos['en'], pos['es']], axis=0)
bilingual_parsed.head(15)

Unnamed: 0,Token,POS Tag,Meaning
0,There,PRON,pronoun
1,'s,VERB,verb
2,a,DET,determiner
3,tight,ADJ,adjective
4,and,CCONJ,coordinating conjunction
5,surprising,ADJ,adjective
6,link,NOUN,noun
7,between,ADP,adposition
8,the,DET,determiner
9,ocean,NOUN,noun


In [47]:
displacy.render(sentences['es'][0].as_doc(), style='dep', jupyter=True, options=options)