In [1]:
#import the library
import spacy


In [3]:
# Load the small English pipeline (en_core_web_sm) and keep it as `nlp`.
nlp = spacy.load('en_core_web_sm')

In [4]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x166964172e0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x16696417e80>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x166960d7890>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x1669643abc0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x166964be500>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x166960d7970>)]

In [5]:
# Two-sentence sample text used by the tokenization and tagging cells.
txt = (
    "I am a student at the University of Maryland, College Park. "
    "I love eating hamburger."
)

In [6]:
txt

'I am a student at the University of Maryland, College Park. I love eating hamburger.'

In [7]:
doc = nlp(txt)

In [8]:
doc

I am a student at the University of Maryland, College Park. I love eating hamburger.

In [9]:
# A str is measured in characters, a Doc in tokens, so the two lengths differ.
print(len(txt), len(doc), sep="\n")

84
18


In [10]:
# Iterating a Doc yields its tokens; punctuation marks come out as separate tokens.
for token in doc:
    print(token.text)

I
am
a
student
at
the
University
of
Maryland
,
College
Park
.
I
love
eating
hamburger
.


# Sentences


In [10]:
# doc.sents yields one span per detected sentence; print the text of each.
for sentence in doc.sents:
    print(sentence.text)

I am a student at the University of Maryland, College Park.
I love eating hamburger.


In [11]:
# doc.sents is wrapped in list() so the first sentence can be picked by index.
sentence1 = list(doc.sents)[0]
type(sentence1)  # a sentence is a spaCy Span, not a plain str

spacy.tokens.span.Span

In [12]:
[sentence.text for sentence in doc.sents]

['I am a student at the University of Maryland, College Park.',
 'I love eating hamburger.']

# Token Attributes


In [38]:
sentence2 = list(doc.sents)[1]

In [39]:
sentence1.text

'I am a student at the University of Maryland, College Park.'

In [44]:
token2 = sentence2[2]
token2	

eating

In [45]:
token2.lemma_

'eat'

In [46]:
token2.morph

Aspect=Prog|Tense=Pres|VerbForm=Part

In [47]:
token2.pos_

'VERB'

# Parts of Speech

In [48]:
# Per token: surface text, coarse part-of-speech tag, and lemma.
for token in doc:
    print( token.text, token.pos_, token.lemma_)

I PRON I
am AUX be
a DET a
student NOUN student
at ADP at
the DET the
University PROPN University
of ADP of
Maryland PROPN Maryland
, PUNCT ,
College PROPN College
Park PROPN Park
. PUNCT .
I PRON I
love VERB love
eating VERB eat
hamburger NOUN hamburger
. PUNCT .


In [50]:
# displacy is spaCy's visualizer: style='dep' draws the dependency parse of
# sentence2, and jupyter=True renders it inline in the notebook.
from spacy import displacy
displacy.render(sentence2, style='dep', jupyter=True)

In [51]:
# token.tag_ is the fine-grained tag; spacy.explain turns it into a readable description.
for token in doc:
    print(spacy.explain(token.tag_))

pronoun, personal
verb, non-3rd person singular present
determiner
noun, singular or mass
conjunction, subordinating or preposition
determiner
noun, proper singular
conjunction, subordinating or preposition
noun, proper singular
punctuation mark, comma
noun, proper singular
noun, proper singular
punctuation mark, sentence closer
pronoun, personal
verb, non-3rd person singular present
verb, gerund or present participle
noun, singular or mass
punctuation mark, sentence closer


In [52]:
# Named entity recognition (NER)

In [54]:
# Sample text mentioning places, a duration, a company and a distance for the NER demo.
text2 = (
    "I had a fun time trekking across Europe. "
    "It took me 60 hours to get from Spain to Russia by Volkswagen! "
    "It was 200000 kilometers"
)
doc2 = nlp(text2)

In [55]:
# doc2.ents holds the entity spans the model found; label_ is the entity type as a string.
for ent in doc2.ents:
    print(ent.text, ent.label_)

Europe LOC
60 hours TIME
Spain GPE
Russia GPE
Volkswagen ORG
200000 kilometers QUANTITY


In [56]:
displacy.render(doc2, style='ent', jupyter=True)

In [62]:
# Second NER example: a sentence naming two football clubs.
text3 = "Liverpool FC beat Manchester United football club five goals to nil."
doc3 = nlp(text3)

In [63]:
displacy.render(doc3, style='ent', jupyter=True)

# Preprocessing 

In [None]:
# Basic idea behind stemming: related word forms are cut down to one shared stem.
# The string below is only an illustration; nothing is computed from it.
"""
compute --> comput
computer --> comput
computing --> comput"""


In [65]:
# The pipeline reports lemmas (token.lemma_), not stems: "computing" maps to
# "compute" while "computer" keeps its own lemma.
# The double space in the input comes out as a separate whitespace token.
cc = nlp("compute computer  computing")
for token in cc:
    print(token.text, token.lemma_)

compute compute
computer computer
   
computing compute


In [76]:
# Three short sentences used by the stop-word and lemma filtering cells.
text4 = "Emre is eating a burek. Giussepe loves eating. Walid loves couscous."
doc4 = nlp(text4)

In [77]:
[token for token in doc4 if not token.is_stop]

[Emre, eating, burek, ., Giussepe, loves, eating, ., Walid, loves, couscous, .]

In [78]:
[token.lemma_ for token in doc4 if not token.is_stop]

['Emre',
 'eat',
 'burek',
 '.',
 'Giussepe',
 'love',
 'eat',
 '.',
 'Walid',
 'love',
 'couscous',
 '.']

In [79]:
# Keep a handle on spaCy's built-in English stop words for inspection.
# NOTE(review): sp_stop is not used in the cells visible here — confirm it is needed.
sp_stop = spacy.lang.en.stop_words.STOP_WORDS

In [80]:
from collections import Counter

In [81]:
# Tally how often each token text occurs in doc4 (stop words and punctuation included).
counter = Counter(token.text for token in doc4)

In [82]:
counter

Counter({'Emre': 1,
         'is': 1,
         'eating': 2,
         'a': 1,
         'burek': 1,
         '.': 3,
         'Giussepe': 1,
         'loves': 2,
         'Walid': 1,
         'couscous': 1})

In [84]:
nlp.vocab['loves'].is_stop

False

In [97]:
# Flag extra words as stop words. The flag is set on the shared vocab entry, so
# tokens of already-parsed docs (e.g. doc4) report is_stop=True as well.
my_stopwords = ["loves"]
for words in my_stopwords:
    nlp.vocab[words].is_stop = True

In [98]:
# Print every token of doc4 that is not flagged as a stop word.
for token in doc4:
    if not token.is_stop:
        print(token.text)

Emre
eating
burek
.
Giussepe
eating
.
Walid
couscous
.


In [93]:
# Lemmatize doc4, dropping only the tokens whose text is in the custom stop list.
# The filter compares token.text, not the lemma: "loves" is listed, but its
# lemma is "love". Built-in stop words such as "is" and "a" are kept.
prepocessed = [token.lemma_ for token in doc4 if token.text not in my_stopwords]
prepocessed

['Emre',
 'be',
 'eat',
 'a',
 'burek',
 '.',
 'Giussepe',
 'eat',
 '.',
 'Walid',
 'couscous',
 '.']

In [103]:
custom_stops = ["Hello", "world"]
text = "Hello, hello  world I am Groot."

# Stop-word flags belong to the exact string looked up in the vocab, so "Hello"
# and "hello" are separate entries. The text is lowercased before it is parsed,
# so flag the lowercase form as well as the spelling given in the list.
for words in custom_stops:
    for form in (words, words.lower()):
        nlp.vocab[form].is_stop = True

In [105]:
# Parse the lowercased text and print only the tokens not flagged as stop words.
doc6 = nlp(text.lower())
for word in doc6:
    if not word.is_stop:
        print(word.text)

,
 
groot
.


In [106]:
# Build a bare Tokenizer that shares the pipeline's vocab.
# NOTE(review): `tokenizer` is not used in the cells visible here — confirm it is needed.
from spacy.tokenizer import Tokenizer
tokenizer = Tokenizer(nlp.vocab)

In [111]:
# Print the tokens of doc6 that are not punctuation (whitespace tokens still appear).
for token in doc6:
    if not token.is_punct:
        print(token.text)

hello
hello
 
world
i
am
groot
