# Importing SpaCy in English

In [1]:
import spacy
nlp = spacy.load("en_core_web_lg")


# trying it out on a small text

In [2]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)
for token in doc:
    print(token, end=" | ")


My | best | friend | Ryan | Peters | likes | fancy | adventure | games | . | 

# the attributes that SpaCy adds

In [3]:
import pandas as pd

def display_nlp(doc, include_punct=False):
    """Generate data frame for visualization of spaCy tokens."""
    rows = []
    for i, t in enumerate(doc):
        if not t.is_punct or include_punct:
            row = {'token': i,  'text': t.text, 'lemma_': t.lemma_, 
                   'is_stop': t.is_stop, 'is_alpha': t.is_alpha,
                   'pos_': t.pos_, 'dep_': t.dep_, 
                   'ent_type_': t.ent_type_, 'ent_iob_': t.ent_iob_}
            rows.append(row)
    
    df = pd.DataFrame(rows).set_index('token')
    df.index.name = None
    return df
display_nlp(doc)


Unnamed: 0,text,lemma_,is_stop,is_alpha,pos_,dep_,ent_type_,ent_iob_
0,My,my,True,True,PRON,poss,,O
1,best,good,False,True,ADJ,amod,,O
2,friend,friend,False,True,NOUN,nsubj,,O
3,Ryan,Ryan,False,True,PROPN,compound,PERSON,B
4,Peters,Peters,False,True,PROPN,appos,PERSON,I
5,likes,like,False,True,VERB,ROOT,,O
6,fancy,fancy,False,True,ADJ,amod,,O
7,adventure,adventure,False,True,NOUN,compound,,O
8,games,game,False,True,NOUN,dobj,,O


# Removing Stop words using Spacy

In [4]:
text = "Dear Ryan, we need to sit down and talk. Regards, Pete"
doc = nlp(text)

non_stop = [t for t in doc if not t.is_stop and not t.is_punct]
print(non_stop)


[Dear, Ryan, need, sit, talk, Regards, Pete]


# Find all nouns using Spacy

In [5]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

nouns = [t for t in doc if t.pos_ in ['NOUN', 'PROPN']]
print(nouns)


[friend, Ryan, Peters, adventure, games]


# Named Entity Recognition

In [6]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")


(Ryan Peters, PERSON) 

In [7]:
text = "James O'Neill, chairman of World Cargo Inc, lives in San Francisco." 
doc = nlp(text)

for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")


(James O'Neill, PERSON) (World Cargo Inc, ORG) (San Francisco, GPE) 

# Visualize NERS

In [8]:
from spacy import displacy

displacy.render(doc, style='ent', jupyter=True)


# We will pull an article from the web and use it as our data, a reuters new article

In [32]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
ny_bb = url_to_string('https://medium.com/p/c2100386790b')
article = nlp(ny_bb)
len(article.ents)


105

In [33]:
ny_bb

'Saying “Hamas Just Needs To Surrender” Is Saying “We’ll Kill Kids Until We Get What We Want” | by Caitlin Johnstone | Mar, 2024 | MediumOpen in appSign upSign inWriteSign upSign inSaying “Hamas Just Needs To Surrender” Is Saying “We’ll Kill Kids Until We Get What We Want”Caitlin Johnstone·Follow5 min read·Mar 9, 2024--12ListenShareListen to a reading of this article (reading by Tim Foley):Of the many awful warmonger comments President Biden made in his State of the Union address Thursday night, arguably the worst was when he reiterated the US empire’s position that it is fine and good for the IDF to keep murdering Gazan civilians until Hamas bows to all of Israel’s demands.Biden did this by lamenting the “heartbreaking” death and starvation of civilians in Gaza while in the same breath stating that Hamas could end all of this violence by laying down arms and surrendering those responsible for the October 7 attack.“Israel has a right to go after Hamas,” Biden said. “Hamas ended this co

# the NERS

In [34]:
displacy.render(article, style='ent', jupyter=True)


# Most popular NER type

In [35]:
from collections import Counter

# Assuming `article` is your SpaCy document
labels = [x.label_ for x in article.ents]
counter = Counter(labels)
print(counter)


Counter({'GPE': 28, 'PERSON': 25, 'ORG': 20, 'DATE': 12, 'NORP': 9, 'CARDINAL': 7, 'WORK_OF_ART': 3, 'TIME': 1})


In [36]:
items = [x.text for x in article.ents]
Counter(items).most_common(5)


[('Hamas', 13), ('Israel', 10), ('Gaza', 9), ('Biden', 8), ('US', 5)]

# Most popular NER

In [37]:
items = [x.text for x in article.ents]
Counter(items).most_common(5)


[('Hamas', 13), ('Israel', 10), ('Gaza', 9), ('Biden', 8), ('US', 5)]

# Taking One sentence to analyze

In [38]:
sentences = [x for x in article.sents]
print(sentences[0])


Saying “Hamas Just Needs To Surrender” Is Saying “We’ll Kill Kids Until We Get What We Want” | by Caitlin Johnstone | Mar, 2024 | MediumOpen in appSign upSign inWriteSign upSign inSaying “Hamas Just Needs To Surrender” Is Saying “We’ll Kill Kids Until We Get What We


# NER tags

In [39]:
displacy.render(nlp(str(sentences[0])), jupyter=True, style='ent')


# Types of words in the sentence

In [40]:
[(x.orth_,x.pos_, x.lemma_) for x in [y 
                                      for y
                                      in nlp(str(sentences[0])) 
                                      if not y.is_stop and y.pos_ != 'PUNCT']]


[('Saying', 'VERB', 'say'),
 ('Hamas', 'PROPN', 'Hamas'),
 ('Needs', 'VERB', 'need'),
 ('Surrender', 'VERB', 'surrender'),
 ('Saying', 'VERB', 'say'),
 ('Kill', 'VERB', 'kill'),
 ('Kids', 'NOUN', 'kid'),
 ('Want', 'VERB', 'want'),
 ('|', 'VERB', '|'),
 ('Caitlin', 'PROPN', 'Caitlin'),
 ('Johnstone', 'PROPN', 'Johnstone'),
 ('Mar', 'PROPN', 'Mar'),
 ('2024', 'NUM', '2024'),
 ('|', 'SYM', '|'),
 ('MediumOpen', 'PROPN', 'MediumOpen'),
 ('appSign', 'NOUN', 'appsign'),
 ('upSign', 'ADJ', 'upsign'),
 ('inWriteSign', 'NOUN', 'inwritesign'),
 ('upSign', 'ADJ', 'upsign'),
 ('inSaying', 'AUX', 'insaying'),
 ('Hamas', 'PROPN', 'Hamas'),
 ('Needs', 'VERB', 'need'),
 ('Surrender', 'VERB', 'surrender'),
 ('Saying', 'VERB', 'say'),
 ('Kill', 'VERB', 'kill'),
 ('Kids', 'NOUN', 'kid')]

# Sentence dependency tree

In [41]:
displacy.render(nlp(str(sentences[0])), style='dep', jupyter = True, options = {'distance': 120})
