In [30]:
# ========== Exercise ==========

# 1) You are parsing a news story from cnbc.com. News story is stores in [news_story.txt]
# which is available in this same folder on github. You need to,
#    i. Extract all NOUN tokens from this story. You will have to read the file in python first to collect all the text 
#       and then extract NOUNs in a python list
#    ii. Extract all numbers (NUM POS type) in a python list
#    iii. Print a count of all POS tags in this story

import spacy
nlp = spacy.load("en_core_web_sm")

In [31]:
# =============== DRAFT ================
with open("news_story.txt") as f:
    text = f.readlines()
text

['Inflation rose again in April, continuing a climb that has pushed consumers to the brink and is threatening the economic expansion, the Bureau of Labor Statistics reported Wednesday.\n',
 '\n',
 'The consumer price index, a broad-based measure of prices for goods and services, increased 8.3% from a year ago, higher than the Dow Jones estimate for an 8.1% gain. That represented a slight ease from Marchâ€™s peak but was still close to the highest level since the summer of 1982.\n',
 '\n',
 'Removing volatile food and energy prices, so-called core CPI still rose 6.2%, against expectations for a 6% gain, clouding hopes that inflation had peaked in March.\n',
 '\n',
 'The month-over-month gains also were higher than expectations â€” 0.3% on headline CPI versus the 0.2% estimate and a 0.6% increase for core, against the outlook for a 0.4% gain.\n',
 '\n',
 'The price gains also meant that workers continued to lose ground. Real wages adjusted for inflation decreased 0.1% on the month despit

In [32]:
text = ' '.join(text)

doc = nlp(text)


In [40]:
# Extract NOUN tokens
nouns_list = []

for token in doc:
    if token.pos_ in "NOUN":
        nouns_list.append(token)
nouns_list

[Inflation,
 climb,
 consumers,
 brink,
 expansion,
 consumer,
 price,
 index,
 measure,
 prices,
 goods,
 services,
 %,
 year,
 estimate,
 %,
 gain,
 ease,
 Marchâ€,
 ™,
 peak,
 level,
 summer,
 food,
 energy,
 prices,
 core,
 %,
 expectations,
 %,
 gain,
 hopes,
 inflation,
 month,
 month,
 gains,
 expectations,
 %,
 headline,
 %,
 estimate,
 %,
 increase,
 core,
 outlook,
 %,
 gain,
 price,
 gains,
 workers,
 ground,
 wages,
 inflation,
 %,
 month,
 increase,
 %,
 earnings,
 year,
 earnings,
 %,
 earnings,
 %,
 Inflation,
 threat,
 recovery,
 pandemic,
 economy,
 stage,
 year,
 growth,
 level,
 prices,
 pump,
 grocery,
 stores,
 problem,
 inflation,
 areas,
 housing,
 auto,
 sales,
 host,
 areas,
 officials,
 problem,
 interest,
 rate,
 hikes,
 year,
 pledges,
 inflation,
 %,
 goal,
 ™,
 data,
 job,
 Credits]

In [34]:
# Extract NUM tokens
numbers_list = []

for token in doc:
    if token.pos_ in "NUM":
        numbers_list.append(token)
numbers_list

[8.3,
 8.1,
 1982,
 6.2,
 6,
 â€,
 0.3,
 0.2,
 0.6,
 0.4,
 0.1,
 0.3,
 2.6,
 5.5,
 2021,
 1984,
 one,
 two,
 two,
 2]

In [36]:
# Print a count of all POS tags
count = doc.count_by(spacy.attrs.POS)

for k, v in count.items():
    print(doc.vocab[k].text, "|", v)

NOUN | 98
VERB | 27
ADV | 15
ADP | 39
PROPN | 17
PUNCT | 32
DET | 34
PRON | 4
AUX | 13
CCONJ | 10
ADJ | 23
SPACE | 7
NUM | 20
PART | 4
SCONJ | 8
X | 1


In [38]:
# ================ OFFICIAL =================
# Read a new story
# r (reading): default, so 'r' can be omitted (without 'r' no problem)
with open("news_story.txt", "r") as f:
    # read(): retrieves the entire content as a single string
    # readline(): reads one line at a time
    # readlines(): returns a list containing all lines in the file.
    news_text = f.read()
# Get the first 500 words in news_text (read file)
news_text[:500]

'Inflation rose again in April, continuing a climb that has pushed consumers to the brink and is threatening the economic expansion, the Bureau of Labor Statistics reported Wednesday.\n\nThe consumer price index, a broad-based measure of prices for goods and services, increased 8.3% from a year ago, higher than the Dow Jones estimate for an 8.1% gain. That represented a slight ease from Marchâ€™s peak but was still close to the highest level since the summer of 1982.\n\nRemoving volatile food and ene'

In [41]:
# Extract NOUN and NUM tokens
doc = nlp(news_text)

numeral_tokens = []
noun_tokens = []

for token in doc:
    if token.pos_ == "NOUN":
        noun_tokens.append(token)
    elif token.pos_ == 'NUM':
        numeral_tokens.append(token)

In [42]:
numeral_tokens[:10]

[8.3, 8.1, 1982, 6.2, 6, â€, 0.3, 0.2, 0.6, 0.4]

In [43]:
noun_tokens[:10]

[Inflation,
 climb,
 consumers,
 brink,
 expansion,
 consumer,
 price,
 index,
 measure,
 prices]

In [47]:
# Print a count of all POS tags
count = doc.count_by(spacy.attrs.POS)
count

{92: 98,
 100: 27,
 86: 15,
 85: 39,
 96: 17,
 97: 32,
 90: 34,
 95: 4,
 87: 13,
 89: 10,
 84: 23,
 103: 7,
 93: 20,
 94: 4,
 98: 8,
 101: 1}

In [46]:
for k,v in count.items():
    print(doc.vocab[k].text, "|",v)

NOUN | 98
VERB | 27
ADV | 15
ADP | 39
PROPN | 17
PUNCT | 32
DET | 34
PRON | 4
AUX | 13
CCONJ | 10
ADJ | 23
SPACE | 7
NUM | 20
PART | 4
SCONJ | 8
X | 1
