In [284]:
!pip3 install -U spacy
# from spacy.lang.en import English # Do I need this?

import spacy
from spacy import displacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_md')

# # run in terminal if needed
# /Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m
You should consider upgrading via the '/usr/local/opt/python@3.9/bin/python3.9 -m pip install --upgrade pip' command.[0m


In [279]:
# load text
doc = nlp("On 11 August 2021, Thomas uses the Apple Macbook for practicing Python while traveling in Greece!! Testing 1, two, 3")

# load a text file
file_doc = nlp(open('test.txt').read())

# Is more efficient? timeit says microscopically
                    # file_name = 'test.txt'
                    # file_text = open(file_name).read()
                    # file_doc = nlp(file_text)
                    # open(file_name).close()
doc

On 11 August 2021, Thomas uses the Apple Macbook for practicing Python while traveling in Greece!! Testing 1, two, 3

.

# Tokens
doing things with them

In [283]:
# text,  lemmatization, number-ish?, index,  part of speech, dependency structure 
[(token.text, token.lemma_, token.like_num, token.i, token.pos_, spacy.explain(token.pos_), token.dep_, token.is_stop) for token in doc]

# can add the following: token.is_alpha, token.is_punct 

[('On', 'on', False, 0, 'ADP', 'adposition', 'prep', True),
 ('11', '11', True, 1, 'NUM', 'numeral', 'nummod', False),
 ('August', 'August', False, 2, 'PROPN', 'proper noun', 'pobj', False),
 ('2021', '2021', True, 3, 'NUM', 'numeral', 'nummod', False),
 (',', ',', False, 4, 'PUNCT', 'punctuation', 'punct', False),
 ('Thomas', 'Thomas', False, 5, 'PROPN', 'proper noun', 'nsubj', False),
 ('uses', 'use', False, 6, 'VERB', 'verb', 'ROOT', False),
 ('the', 'the', False, 7, 'DET', 'determiner', 'det', True),
 ('Apple', 'Apple', False, 8, 'PROPN', 'proper noun', 'compound', False),
 ('Macbook', 'Macbook', False, 9, 'PROPN', 'proper noun', 'dobj', False),
 ('for', 'for', False, 10, 'ADP', 'adposition', 'prep', True),
 ('practicing', 'practice', False, 11, 'VERB', 'verb', 'pcomp', False),
 ('Python', 'Python', False, 12, 'PROPN', 'proper noun', 'dobj', False),
 ('while',
  'while',
  False,
  13,
  'SCONJ',
  'subordinating conjunction',
  'mark',
  True),
 ('traveling', 'travel', False, 14, 

In [232]:
# What is it? 
print(type(doc))        #it's a spacy doc
print(type(doc[2]))     #it's a spacy token

# grabbing one based on index
print(doc[2])

# shape of the word
print(doc[2].shape_)


<class 'spacy.tokens.doc.Doc'>
<class 'spacy.tokens.token.Token'>
August
Xxxxx


In [188]:
# What the hell is this stuff?
print(spacy.explain("CARDINAL"))
print(spacy.explain("GPE"))
print(spacy.explain("DET"))

Numerals that do not fall under another type
Countries, cities, states
determiner


.

# Entities 
doing things with them
(have the md library loaded up)

In [189]:
#show the ents
print(doc.ents)


# make sure to import displacy
print()
displacy.render(doc, style="ent")

# shows that each ent is also a span
print()
print(type(doc.ents[0]))

(11 August 2021, Thomas, the Apple Macbook, Greece, 1, two, 3)




<class 'spacy.tokens.span.Span'>


.

# Visualizations

In [193]:
# # standard displacy rendering
# displacy.render(doc)

# render by sentences
displacy.render(doc.sents)

# cooler one
displacy.render(doc, style="ent")

.

# Stop words

In [256]:
# to see all non stop words in a doc
noStopWordsHere = []
hereBeStopWords = []
for token in doc:
    if not token.is_stop:
        noStopWordsHere.append(token.text)
    else:
        hereBeStopWords.append(token.text)
        
        
print(len(set(noStopWordsHere))) #how many non stop words in deduped list?
print(len(set(hereBeStopWords))) #vs how many stop words in deduped list?
print(set(hereBeStopWords))      #print out the stop words in deduped list.

17
6
{'in', 'the', 'while', 'two', 'for', 'On'}


.

## Other things  🤷‍♂️


In [160]:
# get only the nouns
list([i for i in doc.noun_chunks])

[11 August, Thomas, the Apple Macbook, Python, Greece, Testing]

In [162]:
# get all the sentences
list([i for i in doc.sents])

[On 11 August 2021, Thomas uses the Apple Macbook for practicing Python while traveling in Greece!!,
 Testing 1, two, 3]

In [237]:
# Strings to hashes
word_hash = nlp.vocab.strings['Important']
print(word_hash)

# Hashes to strings
word_string = nlp.vocab.strings[word_hash]
print(word_string)

5854064017242738633
Use


In [170]:
# list out the vocab? idk...
# list([i.text for i in doc.vocab])

In [None]:
# # Word vectors and similarity based on model you have loaded up (en_core_web_sm)
# nlp('Nice').has_vector  
# nlp('Nice').vector
# nlp('Thomas').vector_norm

In [180]:
nlp('dog').similarity(nlp('cat'))   # don't use the 'sm' model which doesn't have word vectors loaded

animals = [
    'snake',
    'tabby',
    'lion',
    'cats',
    'dogs',
    'cat',
    'dog',
    'panther',
    'fish',
    'spoon',
    'bowl',
    'chair',
    'lamp',
    'iguana',
    'rodent',
    'feline',
    'meow',
    'furball',
    'bird',
    'catepillar'
]

# compare likeness to 'cat'
[(nlp(i).similarity(nlp('cat')), i) for i in animals]

[(0.48167102784100796, 'snake'),
 (0.8215554289062845, 'tabby'),
 (0.5265436766852823, 'lion'),
 (0.8409757687002631, 'cats'),
 (0.6921648076691282, 'dogs'),
 (1.0, 'cat'),
 (0.8016854705531046, 'dog'),
 (0.5413389826292829, 'panther'),
 (0.4180653522513407, 'fish'),
 (0.24077498521219193, 'spoon'),
 (0.2946319117471937, 'bowl'),
 (0.27362862685425554, 'chair'),
 (0.23974348597327558, 'lamp'),
 (0.43194828699956045, 'iguana'),
 (0.5321326245349185, 'rodent'),
 (1.0000000913918174, 'feline'),
 (0.5460768016223961, 'meow'),
 (0.5639032807711905, 'furball'),
 (0.5236872879447847, 'bird'),
 (-0.06685005407684443, 'catepillar')]

In [172]:
x = doc.to_dict()
x.keys()
# x['text']

dict_keys(['text', 'array_head', 'array_body', 'sentiment', 'tensor', 'cats', 'spans', 'strings', 'has_unknown_spaces'])

In [212]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [228]:
doc[0].shape_

'Xx'

.

# Matcher crap

In [206]:
# Example
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Add match ID "HelloWorld" with no callback and one pattern
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", [pattern])

doc = nlp("Hello, world! Hello, world!")
matches = matcher(doc)
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    print(match_id, string_id, start, end, span.text)

15578876784678163569 HelloWorld 0 3 Hello, world
15578876784678163569 HelloWorld 4 7 Hello, world


### My attempts at matcher:

In [236]:
pattern = [{"LEMMA": 'use'}]
matcher.add("Use", [pattern])
matches = matcher(doc)
matches

[(9570226570207762279, 6, 7)]

In [266]:
doc = nlp("On 11 August 2021, Thomas uses the Apple Macbook for practicing Python while traveling in Greece!! \
           Testing 1, two, 3 😅")

for match_id, start, stop in matches:
#     string_id = nlp.vocab.strings[match_id]
    span = doc[start:stop]
    print(match_id, span, span.text)

9570226570207762279 uses uses


In [238]:
nlp.vocab.strings[9570226570207762279] #takes the hash and gives the word

'Use'

In [252]:
%%timeit
for token in doc:
    if token.is_stop == False:
        token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop

18.3 µs ± 145 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [260]:
[token for token in doc if token.is_stop == True]

[On, the, for, while, in, two]

In [261]:
[token for token in doc if token.is_stop == False]

[11,
 August,
 2021,
 ,,
 Thomas,
 uses,
 Apple,
 Macbook,
 practicing,
 Python,
 traveling,
 Greece,
 !,
 !,
            ,
 Testing,
 1,
 ,,
 ,,
 3]

In [274]:
[token for token in doc.noun_chunks]

[11 August,
 Thomas,
 the Apple Macbook,
 Python,
 Greece,
            Testing 1, two, 3 😅]