# Importing spaCy

In [9]:
# Import spaCy, the NLP library used by the cells in this notebook
import spacy
# Load the English pipeline "en_core_web_lg"; calling `nlp(text)` returns a processed Doc
nlp = spacy.load("en_core_web_lg")


# Tokenization

In [10]:
# Headline to be tokenized
text = "New kickoff format, ban on hip-drop tackle among 10 rule proposals to be heard at Annual League Meeting"
# Run the spaCy pipeline over the headline
doc = nlp(text)
# Emit every token followed by "|" on a single line so token boundaries are visible
print("".join(str(token) + "|" for token in doc), end="")

New|kickoff|format|,|ban|on|hip|-|drop|tackle|among|10|rule|proposals|to|be|heard|at|Annual|League|Meeting|

# Visualization of Tokens

In [11]:
#Importing pandas
import pandas as pd

def display_nlp(doc, include_punct=False):
    """Build a DataFrame with one row per spaCy token for inspection.

    Args:
        doc: Iterable of spaCy tokens (typically a ``Doc``).
        include_punct: When False (the default), tokens flagged ``is_punct``
            are left out of the table.

    Returns:
        A DataFrame whose (unnamed) index is each token's position in ``doc``
        and whose columns are ``text``, ``lemma_``, ``is_stop``, ``is_alpha``,
        ``pos_``, ``dep_``, ``ent_type_`` and ``ent_iob_``. When no token
        qualifies (empty input, or punctuation only), an empty frame with the
        same columns is returned instead of raising ``KeyError``.
    """
    columns = ['token', 'text', 'lemma_', 'is_stop', 'is_alpha',
               'pos_', 'dep_', 'ent_type_', 'ent_iob_']
    rows = [
        {'token': i, 'text': t.text, 'lemma_': t.lemma_,
         'is_stop': t.is_stop, 'is_alpha': t.is_alpha,
         'pos_': t.pos_, 'dep_': t.dep_,
         'ent_type_': t.ent_type_, 'ent_iob_': t.ent_iob_}
        for i, t in enumerate(doc)
        if include_punct or not t.is_punct
    ]

    # Naming the columns explicitly guarantees the 'token' column exists even
    # when `rows` is empty, so set_index cannot fail on an empty document.
    df = pd.DataFrame(rows, columns=columns).set_index('token')
    df.index.name = None
    return df
# Tabulate the tokens of the current `doc`; punctuation is left out because include_punct defaults to False
display_nlp(doc)

  from pandas.core import (


Unnamed: 0,text,lemma_,is_stop,is_alpha,pos_,dep_,ent_type_,ent_iob_
0,New,new,False,True,ADJ,amod,,O
1,kickoff,kickoff,False,True,NOUN,compound,,O
2,format,format,False,True,NOUN,ROOT,,O
4,ban,ban,False,True,NOUN,appos,,O
5,on,on,True,True,ADP,prep,,O
6,hip,hip,False,True,NOUN,compound,,O
8,drop,drop,False,True,NOUN,compound,,O
9,tackle,tackle,False,True,NOUN,pobj,,O
10,among,among,True,True,ADP,prep,,O
11,10,10,False,False,NUM,nummod,CARDINAL,B


# Removing Stop Words and Punctuation

In [15]:
# Headline used to demonstrate stop-word and punctuation filtering
text = "Watch USC QB Caleb Williams' pro day live on NFL+ at 1:30 p.m. ET"
doc = nlp(text)

# Keep a token only when it is neither a stop word nor punctuation
non_stop = [token for token in doc if not (token.is_stop or token.is_punct)]
print(non_stop)

[Watch, USC, QB, Caleb, Williams, pro, day, live, NFL+, 1:30, p.m., ET]


# Extracting Nouns and Proper Nouns

In [36]:
# Headline used to demonstrate part-of-speech filtering
text = "Arrest warrant issued for Lions CB Cameron Sutton for alleged domestic battery"
doc = nlp(text)
# Keep only tokens whose part-of-speech tag is NOUN or PROPN (proper noun)
nouns = [token for token in doc if token.pos_ in {'NOUN', 'PROPN'}]
print(nouns)


[warrant, Lions, CB, Cameron, Sutton, battery]


# Named Entity Recognition

In [37]:
# Headline containing several named entities
text = "Mike Williams sees 'great fit' with Aaron Rodgers, Jets: 'A pretty good opportunity ahead of us'"
# Run the spaCy pipeline; the entities it finds are available as doc.ents
doc = nlp(text)

# Print each entity as "(text, label)" followed by a space, all on one line
print("".join(f"({ent.text}, {ent.label_}) " for ent in doc.ents), end="")

(Mike Williams, PERSON) (Aaron Rodgers, PERSON) (Jets, ORG) 

In [38]:
# Sentence mentioning a person, an organization and a place
text = "James O'Neill, chairman of World Cargo Inc, lives in San Francisco."
doc = nlp(text)
# Print each entity with its label (the entity's type), space-separated on one line
print("".join(f"({ent.text}, {ent.label_}) " for ent in doc.ents), end="")

(James O'Neill, PERSON) (World Cargo Inc, ORG) (San Francisco, GPE) 

# Visualization of Named Entities

In [39]:
# Import spaCy's `displacy` module, used to visualize named entities in text
from spacy import displacy
# style='ent' selects the named-entity view of `doc`; jupyter=True renders it in the notebook output
displacy.render(doc, style='ent', jupyter=True)

# Pulling a Web Article

In [40]:
# Pull a page from the web and use it as our data (here, the NFL news page):
from bs4 import BeautifulSoup
import requests
import re

def url_to_string(url, timeout=10):
    """Download a web page and return its text content as one string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The page text with ``<script>``, ``<style>`` and ``<aside>`` elements
        removed and every run of newlines/tabs collapsed to a single space.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on error responses rather than running NLP over an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')  # use the built-in HTML parser
    # Strip elements whose contents are not article text.
    for element in soup(["script", "style", 'aside']):
        element.extract()
    # Collapse each run of newlines/tabs into one space.
    return re.sub(r'[\n\t]+', ' ', soup.get_text())
# Fetch the NFL news page at the URL below and flatten it to plain text.
# NOTE(review): the name `ny_bb` does not describe NFL content — presumably a leftover; consider renaming.
ny_bb = url_to_string('https://www.nfl.com/news/')
# Run the spaCy pipeline over the page text
article = nlp(ny_bb)
# Number of named entities detected in the page
len(article.ents)


158

# Visualization of Named Entities in the Article

In [41]:
# Render the named entities of the downloaded page; style='ent' selects the entity view
displacy.render(article, style='ent', jupyter=True)

# Most Popular NER Types

In [42]:
# Counter tallies how many times each value occurs
from collections import Counter

# One label per detected entity in the article, then count each label type
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'ORG': 64,
         'GPE': 28,
         'NORP': 2,
         'LOC': 3,
         'CARDINAL': 12,
         'EVENT': 6,
         'DATE': 19,
         'MONEY': 2,
         'PERSON': 9,
         'WORK_OF_ART': 3,
         'ORDINAL': 1,
         'PRODUCT': 7,
         'FAC': 2})

In [43]:
# Collect the text of every entity in the article and show the five most frequent
items = [ent.text for ent in article.ents]
Counter(items).most_common(5)


[('NFL', 9), ('2024', 6), ('2024 season', 3), ('Mar 26,', 3), ('Houston', 2)]

# Extracting Sentences 

In [44]:
# Materialize spaCy's sentence segmentation of the article as a list
sentences = list(article.sents)
# Show the sentence at index 20 (i.e. the 21st sentence)
print(sentences[20])

Pro BowlMenu ShopMenu StandingsMenu StatsMenu


# Displaying NER Tags

In [46]:
# Render the named entities in the sentence at index 20 (not the first sentence);
# the sentence text is re-parsed with `nlp` so it is rendered as its own Doc
displacy.render(nlp(str(sentences[20])), jupyter=True, style='ent')

# Token Attributes in a Sentence

In [26]:
# Re-parse the first sentence and list (text, POS tag, lemma) for every token
# that is neither a stop word nor tagged as punctuation
[(token.orth_, token.pos_, token.lemma_)
 for token in nlp(str(sentences[0]))
 if not token.is_stop and token.pos_ != 'PUNCT']

[(' ', 'SPACE', ' '), ('NFL', 'PROPN', 'NFL'), ('News', 'PROPN', 'News')]

# Sentence Dependency tree

In [47]:
# Render the dependency parse of the sentence at index 20 (not the first sentence);
# the 'distance' option sets the spacing between words in the diagram
displacy.render(nlp(str(sentences[20])), style='dep', jupyter = True, options = {'distance': 120})