# Importing spaCy and loading the English model

In [1]:
import spacy
# Load the large English pipeline once; every later cell reuses `nlp`.
# NOTE(review): the "en_core_web_lg" model package must already be installed
# in the kernel's environment, otherwise this call fails.
nlp = spacy.load("en_core_web_lg")


# trying it out on a small text

In [2]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)
# Show the tokenisation: every token followed by a " | " separator.
print("".join(f"{tok} | " for tok in doc), end="")


My | best | friend | Ryan | Peters | likes | fancy | adventure | games | . | 

# The token attributes that spaCy adds

In [3]:
import pandas as pd

def display_nlp(doc, include_punct=False):
    """Build a DataFrame with one row per token for inspecting spaCy output.

    Parameters
    ----------
    doc : iterable of tokens
        Typically a spaCy ``Doc``. Each item must expose ``text``, ``lemma_``,
        ``is_stop``, ``is_alpha``, ``is_punct``, ``pos_``, ``dep_``,
        ``ent_type_`` and ``ent_iob_``.
    include_punct : bool, default False
        When False, tokens whose ``is_punct`` is true are left out.

    Returns
    -------
    pandas.DataFrame
        Indexed by the token's position in ``doc`` (index name cleared), with
        columns ``text``, ``lemma_``, ``is_stop``, ``is_alpha``, ``pos_``,
        ``dep_``, ``ent_type_`` and ``ent_iob_``. The frame is empty, but still
        has these columns, when no token is kept.
    """
    columns = ['token', 'text', 'lemma_', 'is_stop', 'is_alpha',
               'pos_', 'dep_', 'ent_type_', 'ent_iob_']
    rows = [
        {'token': i, 'text': t.text, 'lemma_': t.lemma_,
         'is_stop': t.is_stop, 'is_alpha': t.is_alpha,
         'pos_': t.pos_, 'dep_': t.dep_,
         'ent_type_': t.ent_type_, 'ent_iob_': t.ent_iob_}
        for i, t in enumerate(doc)
        if include_punct or not t.is_punct
    ]

    # Naming the columns explicitly keeps `set_index('token')` working when
    # `rows` is empty (empty doc, or a doc made only of punctuation); without
    # it the frame has no 'token' column and set_index raises KeyError.
    df = pd.DataFrame(rows, columns=columns).set_index('token')
    df.index.name = None
    return df
display_nlp(doc)


Unnamed: 0,text,lemma_,is_stop,is_alpha,pos_,dep_,ent_type_,ent_iob_
0,My,my,True,True,PRON,poss,,O
1,best,good,False,True,ADJ,amod,,O
2,friend,friend,False,True,NOUN,nsubj,,O
3,Ryan,Ryan,False,True,PROPN,compound,PERSON,B
4,Peters,Peters,False,True,PROPN,appos,PERSON,I
5,likes,like,False,True,VERB,ROOT,,O
6,fancy,fancy,False,True,ADJ,amod,,O
7,adventure,adventure,False,True,NOUN,compound,,O
8,games,game,False,True,NOUN,dobj,,O


# Removing stop words using spaCy

In [4]:
text = "Dear Ryan, we need to sit down and talk. Regards, Pete"
doc = nlp(text)

# Keep only the tokens that are neither stop words nor punctuation.
non_stop = [tok for tok in doc if not (tok.is_stop or tok.is_punct)]
print(non_stop)


[Dear, Ryan, need, sit, talk, Regards, Pete]


# Find all nouns using Spacy

In [5]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

# Common nouns and proper nouns, selected by coarse part-of-speech tag.
nouns = [tok for tok in doc if tok.pos_ in {'NOUN', 'PROPN'}]
print(nouns)


[friend, Ryan, Peters, adventure, games]


# Named Entity Recognition

In [6]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

# One "(entity text, label)" pair per recognised entity, space-terminated.
print("".join(f"({span.text}, {span.label_}) " for span in doc.ents), end="")


(Ryan Peters, PERSON) 

In [7]:
text = "James O'Neill, chairman of World Cargo Inc, lives in San Francisco." 
doc = nlp(text)

# One "(entity text, label)" pair per recognised entity, space-terminated.
print("".join(f"({span.text}, {span.label_}) " for span in doc.ents), end="")


(James O'Neill, PERSON) (World Cargo Inc, ORG) (San Francisco, GPE) 

# Visualizing the named entities

In [8]:
from spacy import displacy

# Render the entity view of `doc`, i.e. the document parsed in the previous
# cell. `jupyter=True` asks displaCy to display the markup in the notebook.
displacy.render(doc, style='ent', jupyter=True)


# We will pull an article from the web (a Medium post) and use it as our data

In [9]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=30):
    """Download a web page and return its text content as one string.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, default 30
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` may block indefinitely.

    Returns
    -------
    str
        The page text with ``<script>``, ``<style>`` and ``<aside>`` elements
        removed, and every run of newlines/tabs replaced by a single space.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.RequestException
        For connection problems, including timeouts.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly rather than run the NLP pipeline on an error page.
    res.raise_for_status()

    soup = BeautifulSoup(res.text, 'html5lib')
    # Remove these elements so their contents do not end up in the text.
    for element in soup(["script", "style", 'aside']):
        element.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
# Plain article URL only. The link originally pasted here ended in a
# `#id_token=...` fragment, which is a personal sign-in token and must not be
# stored in a notebook. URL fragments are not sent to the server, so the page
# fetched is the same.
ny_bb = url_to_string('https://medium.com/@sahana.singh/if-you-care-about-justice-and-truth-then-why-do-you-call-my-celebration-of-ram-temple-as-hindu-ee2e8a137d59')
# Run the full spaCy pipeline over the article text.
article = nlp(ny_bb)
# Number of named entities found in the article.
len(article.ents)


37

In [10]:
ny_bb

'If you care about justice and truth then why do you call my celebration of Ram Temple as “Hindu Triumphalism”? | by Sahana Singh | Jan, 2024 | MediumOpen in appSign upSign inWriteSign upSign inMember-only storyIf you care about justice and truth then why do you call my celebration of Ram Temple as “Hindu Triumphalism”?Sahana Singh·Follow13 min read·Jan 29, 2024--7ShareThe newly inaugurated Ram Mandir at Ayodhya, India. Photo: Press Information Bureau, Government of IndiaJanuary 22, 2024 is a date that will be etched in the minds of Hindus all over the world. It is the date when a temple of immeasurable significance, which was demolished by Mughal invader Babur in 1528 was rebuilt and inaugurated with joyous celebrations. It was a step towards correcting a portion of the injustice that has been festering for centuries.But the massive ignorance and even deliberate obfuscation of truth in the western world about why the temple was rebuilt is rather shocking.Imagine the scene in the 16th 

# The named entities found in the article

In [11]:
# Render the entity view of the whole parsed article.
displacy.render(article, style='ent', jupyter=True)


# Most popular NER type

In [12]:
from collections import Counter

# `article` is the spaCy Doc built from the downloaded page.
# Tally how often each entity label occurs among its entities.
labels = [span.label_ for span in article.ents]
counter = Counter()
counter.update(labels)
print(counter)


Counter({'PERSON': 14, 'NORP': 8, 'DATE': 5, 'ORG': 4, 'CARDINAL': 3, 'GPE': 3})


In [13]:
# Entity strings as they appear in the article, then the five most frequent.
items = [span.text for span in article.ents]
Counter(items).most_common(5)


[('Hindu', 3),
 ('Hindus', 3),
 ('Ram Temple', 2),
 ('India', 2),
 ('Sahana Singh', 1)]

# Most frequent named entities

In [14]:
# Entity strings as they appear in the article, then the five most frequent.
items = [span.text for span in article.ents]
Counter(items).most_common(5)


[('Hindu', 3),
 ('Hindus', 3),
 ('Ram Temple', 2),
 ('India', 2),
 ('Sahana Singh', 1)]

# Taking One sentence to analyze

In [15]:
# Materialise the sentence spans so they can be indexed, then show the first.
sentences = list(article.sents)
print(sentences[0])


If you care about justice and truth then why do you call my celebration of Ram Temple as “Hindu Triumphalism”?


# NER tags

In [16]:
# Re-parse the first sentence on its own (as a fresh Doc built from its text)
# and render its entity view.
displacy.render(nlp(str(sentences[0])), jupyter=True, style='ent')


# Types of words in the sentence

In [17]:
# (surface form, coarse POS tag, lemma) for each token of the first sentence
# that is neither a stop word nor tagged as punctuation.
[
    (tok.orth_, tok.pos_, tok.lemma_)
    for tok in nlp(str(sentences[0]))
    if not (tok.is_stop or tok.pos_ == 'PUNCT')
]


[('care', 'VERB', 'care'),
 ('justice', 'NOUN', 'justice'),
 ('truth', 'NOUN', 'truth'),
 ('celebration', 'NOUN', 'celebration'),
 ('Ram', 'PROPN', 'Ram'),
 ('Temple', 'PROPN', 'Temple'),
 ('Hindu', 'PROPN', 'Hindu'),
 ('Triumphalism', 'PROPN', 'Triumphalism')]

# Sentence dependency tree

In [18]:
# Re-parse the first sentence and render its dependency view.
# NOTE(review): 'distance' is passed through to displaCy; presumably it sets
# the spacing between words in the drawing. Confirm against the displaCy docs.
displacy.render(nlp(str(sentences[0])), style='dep', jupyter = True, options = {'distance': 120})
