In [1]:
import pandas as pd
import spacy
import numpy as np

In [3]:
# Load the small English pipeline and parse a short sample paragraph.
nlp = spacy.load("en_core_web_sm")
# One sentence per literal; adjacent literals are joined into a single string.
paragraph = (
    "it was a quiet afternoon, the sun casting long shadows across the dusty floorboards. "
    "a gentle breeze drifted through the open window, rustling the yellowed pages of an old book left forgotten on the table. "
    "outside, a lone sparrow hopped along the fence, its tiny chirps the only sound to break the stillness. "
    "the world seemed to hold its breath, waiting for something to happen, though nothing ever really did. "
    "this peaceful, unchanging rhythm was a comfort, a constant in a world that often felt too loud and too fast. "
    "the hours slipped by unnoticed, and soon the soft glow of twilight began to paint the sky in shades of orange and purple."
)
documented_tokens = nlp(paragraph)


In [8]:
# Collect one record per spaCy token first and build the frame in a single
# call: concatenating onto a DataFrame inside the loop re-copies every
# earlier row on each iteration and, on recent pandas releases, warns about
# concatenating with an empty frame.
pos_records = [
    {"tokens": token.text, "pos_tag": token.pos_}
    for token in documented_tokens
]
# Passing `columns` keeps both columns present even when the doc has no tokens.
pos_df = pd.DataFrame(pos_records, columns=["tokens", "pos_tag"])
pos_df.head(10)

Unnamed: 0,tokens,pos_tag
0,it,PRON
1,was,AUX
2,a,DET
3,quiet,ADJ
4,afternoon,NOUN
5,",",PUNCT
6,the,DET
7,sun,NOUN
8,casting,VERB
9,long,ADJ


We want to look at the most common tokens and their associated part-of-speech tags (`pos_tag`).

In [27]:
# Count how often every (token, pos_tag) pair occurs, most frequent first.
ts = (
    pos_df
    .groupby(["tokens", "pos_tag"])
    .size()
    .reset_index(name="count")
    .sort_values(by="count", ascending=False)
)

How many distinct words fall under each part-of-speech tag?

In [29]:
# Each row of `ts` is one distinct (token, pos_tag) pair, so the group size
# per tag is the number of distinct words carrying that tag.
cs = (
    ts
    .groupby("pos_tag")
    .size()
    .reset_index(name="word_count")
    .sort_values(by="word_count", ascending=False)
)
cs.head(10)

Unnamed: 0,pos_tag,word_count
6,NOUN,27
12,VERB,16
0,ADJ,13
1,ADP,8
2,ADV,7
8,PRON,5
5,DET,4
10,PUNCT,2
3,AUX,1
4,CCONJ,1


List all tokens in the data set that are tagged as `NOUN`.

In [46]:
# Keep only the rows tagged NOUN (the result is a DataFrame, despite the name).
Noun_series = pos_df.loc[pos_df["pos_tag"].eq("NOUN")]
print(Noun_series)

          tokens pos_tag
4      afternoon    NOUN
7            sun    NOUN
10       shadows    NOUN
13         dusty    NOUN
14   floorboards    NOUN
18        breeze    NOUN
23        window    NOUN
28         pages    NOUN
32          book    NOUN
37         table    NOUN
43       sparrow    NOUN
47         fence    NOUN
51        chirps    NOUN
54         sound    NOUN
58     stillness    NOUN
61         world    NOUN
66        breath    NOUN
84        rhythm    NOUN
87       comfort    NOUN
90      constant    NOUN
93         world    NOUN
104        hours    NOUN
113         glow    NOUN
115     twilight    NOUN
120          sky    NOUN
122       shades    NOUN
124       orange    NOUN
126       purple    NOUN


In [47]:
from spacy import displacy
from spacy import tokenizer
import re

This is an attempt at Named Entity Recognition (NER).

In [53]:
# Sample news text for entity recognition. The line breaks (and the space
# before each one) are part of the text, so they are written out explicitly.
news = (
    "Apple announced on Monday that it will invest $2 billion in new data centers in Texas \n"
    "and North Carolina. CEO Tim Cook said the facilities are expected to open in 2026, creating more \n"
    "than 5,000 jobs. Meanwhile, President Joe Biden praised the decision during a press conference \n"
    "in Washington, calling it a major step for the U.S. economy and technology sector."
)
spacy_doc = nlp(news)

In [54]:
# List every named entity spaCy detected, followed by its label.
for ent in spacy_doc.ents:
    print(f"{ent.text} {ent.label_}")

Apple ORG
Monday DATE
$2 billion MONEY
Texas GPE
North Carolina GPE
Tim Cook PERSON
2026 DATE
5,000 CARDINAL
Joe Biden PERSON
Washington GPE
U.S. GPE


In [56]:
# Draw the entity annotations inline in the notebook.
displacy.render(
    spacy_doc,
    style="ent",
    jupyter=True,
)