# Importing SpaCy in English

In [1]:
import spacy
nlp = spacy.load("en_core_web_lg")


# trying it out on a small text

In [2]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)
for token in doc:
    print(token, end=" | ")


My | best | friend | Ryan | Peters | likes | fancy | adventure | games | . | 

# the attributes that SpaCy adds

In [3]:
import pandas as pd

def display_nlp(doc, include_punct=False):
    """Generate data frame for visualization of spaCy tokens."""
    rows = []
    for i, t in enumerate(doc):
        if not t.is_punct or include_punct:
            row = {'token': i,  'text': t.text, 'lemma_': t.lemma_, 
                   'is_stop': t.is_stop, 'is_alpha': t.is_alpha,
                   'pos_': t.pos_, 'dep_': t.dep_, 
                   'ent_type_': t.ent_type_, 'ent_iob_': t.ent_iob_}
            rows.append(row)
    
    df = pd.DataFrame(rows).set_index('token')
    df.index.name = None
    return df
display_nlp(doc)


Unnamed: 0,text,lemma_,is_stop,is_alpha,pos_,dep_,ent_type_,ent_iob_
0,My,my,True,True,PRON,poss,,O
1,best,good,False,True,ADJ,amod,,O
2,friend,friend,False,True,NOUN,nsubj,,O
3,Ryan,Ryan,False,True,PROPN,compound,PERSON,B
4,Peters,Peters,False,True,PROPN,appos,PERSON,I
5,likes,like,False,True,VERB,ROOT,,O
6,fancy,fancy,False,True,ADJ,amod,,O
7,adventure,adventure,False,True,NOUN,compound,,O
8,games,game,False,True,NOUN,dobj,,O


# Removing Stop words using Spacy

In [4]:
text = "Dear Ryan, we need to sit down and talk. Regards, Pete"
doc = nlp(text)

non_stop = [t for t in doc if not t.is_stop and not t.is_punct]
print(non_stop)


[Dear, Ryan, need, sit, talk, Regards, Pete]


# Find all nouns using Spacy

In [5]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

nouns = [t for t in doc if t.pos_ in ['NOUN', 'PROPN']]
print(nouns)


[friend, Ryan, Peters, adventure, games]


# Named Entity Recognition

In [6]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")


(Ryan Peters, PERSON) 

In [7]:
text = "James O'Neill, chairman of World Cargo Inc, lives in San Francisco." 
doc = nlp(text)

for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")


(James O'Neill, PERSON) (World Cargo Inc, ORG) (San Francisco, GPE) 

# Visualize NERS

In [8]:
from spacy import displacy

displacy.render(doc, style='ent', jupyter=True)


# We will pull an article from the web and use it as our data, a reuters new article

In [9]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
ny_bb = url_to_string('https://medium.com/predict/tesla-is-no-longer-a-growth-company-heres-why-2d6c0a064547')
article = nlp(ny_bb)
len(article.ents)


12

In [10]:
ny_bb

'Tesla Is No Longer A Growth Company, Here’s Why | by Will Lockett | Predict | Mar, 2024 | MediumOpen in appSign upSign inWriteSign upSign inPhoto by David von Diemar on UnsplashMember-only storyTesla Is No Longer A Growth Company, Here’s WhyTesla isn’t the company it once was.Will Lockett·FollowPublished inPredict·6 min read·5 days ago--19ShareFor the past decade, Tesla has been the growth company. It has revolutionised several industries and, in the process, garnered a truly gargantuan market share and driven its stock price to the Moon. However, this disruptive and revolutionary approach can’t produce growth indefinitely, and Tesla is coming to the end of its…----19FollowWritten by Will Lockett192K Followers·Editor for PredictIndependent journalist covering global politics, climate change and technology. Get articles early at www.planetearthandbeyond.coFollowHelpStatusAboutCareersBlogPrivacyTermsText to speechTeams '

# the NERS

In [11]:
displacy.render(article, style='ent', jupyter=True)


# Most popular NER type

In [12]:
from collections import Counter

# Assuming `article` is your SpaCy document
labels = [x.label_ for x in article.ents]
counter = Counter(labels)
print(counter)


Counter({'ORG': 6, 'PERSON': 4, 'DATE': 2})


In [13]:
items = [x.text for x in article.ents]
Counter(items).most_common(5)


[('Tesla', 3),
 ('Will Lockett', 1),
 ('Mar, 2024 | MediumOpen', 1),
 ('David von Diemar', 1),
 ('UnsplashMember', 1)]

# Most popular NER

In [14]:
items = [x.text for x in article.ents]
Counter(items).most_common(5)


[('Tesla', 3),
 ('Will Lockett', 1),
 ('Mar, 2024 | MediumOpen', 1),
 ('David von Diemar', 1),
 ('UnsplashMember', 1)]

# Taking One sentence to analyze

In [15]:
sentences = [x for x in article.sents]
print(sentences[0])


Tesla Is No Longer A Growth Company, Here’s Why | by Will Lockett | Predict | Mar, 2024 | MediumOpen in appSign upSign inWriteSign upSign inPhoto by David von Diemar on UnsplashMember-only storyTesla Is No Longer A Growth Company, Here’s WhyTesla isn’t the company it once was.


# NER tags

In [16]:
displacy.render(nlp(str(sentences[0])), jupyter=True, style='ent')


# Types of words in the sentence

In [17]:
[(x.orth_,x.pos_, x.lemma_) for x in [y 
                                      for y
                                      in nlp(str(sentences[0])) 
                                      if not y.is_stop and y.pos_ != 'PUNCT']]


[('Tesla', 'PROPN', 'Tesla'),
 ('Longer', 'ADJ', 'long'),
 ('Growth', 'PROPN', 'Growth'),
 ('Company', 'PROPN', 'Company'),
 ('|', 'VERB', '|'),
 ('Lockett', 'PROPN', 'Lockett'),
 ('|', 'AUX', '|'),
 ('Predict', 'VERB', 'predict'),
 ('|', 'VERB', '|'),
 ('Mar', 'PROPN', 'Mar'),
 ('2024', 'NUM', '2024'),
 ('|', 'SYM', '|'),
 ('MediumOpen', 'PROPN', 'MediumOpen'),
 ('appSign', 'NOUN', 'appsign'),
 ('upSign', 'ADJ', 'upsign'),
 ('inWriteSign', 'PROPN', 'inWriteSign'),
 ('upSign', 'NOUN', 'upsign'),
 ('inPhoto', 'INTJ', 'inphoto'),
 ('David', 'PROPN', 'David'),
 ('von', 'PROPN', 'von'),
 ('Diemar', 'PROPN', 'Diemar'),
 ('UnsplashMember', 'PROPN', 'UnsplashMember'),
 ('storyTesla', 'NOUN', 'storytesla'),
 ('Longer', 'ADJ', 'long'),
 ('Growth', 'PROPN', 'Growth'),
 ('Company', 'PROPN', 'Company'),
 ('WhyTesla', 'PROPN', 'WhyTesla'),
 ('company', 'NOUN', 'company')]

# Sentence dependency tree

In [18]:
displacy.render(nlp(str(sentences[0])), style='dep', jupyter = True, options = {'distance': 120})
