In [99]:
import pandas as pd
import spacy
from nltk.corpus import stopwords

In [100]:
# Load the BBC news CSV export and preview its first five rows.
BBC_NEWS_CSV = "./data_CSV/bbc_news.csv"
bbc_data = pd.read_csv(BBC_NEWS_CSV)
bbc_data.head(5)

Unnamed: 0.1,Unnamed: 0,index,title,pubDate,guid,link,description
0,0,6684,Can I refuse to work?,"Wed, 10 Aug 2022 15:46:18 GMT",https://www.bbc.co.uk/news/business-62147992,https://www.bbc.co.uk/news/business-62147992?a...,With much of the UK enduring another period of...
1,1,9267,'Liz Truss the Brief?' World reacts to UK poli...,"Mon, 17 Oct 2022 11:35:12 GMT",https://www.bbc.co.uk/news/world-63285480,https://www.bbc.co.uk/news/world-63285480?at_m...,The UK's political chaos has been watched arou...
2,2,7387,Rationing energy is nothing new for off-grid c...,"Wed, 31 Aug 2022 05:20:18 GMT",https://www.bbc.co.uk/news/uk-scotland-highlan...,https://www.bbc.co.uk/news/uk-scotland-highlan...,Scoraig in the north west Highlands has long h...
3,3,767,The hunt for superyachts of sanctioned Russian...,"Tue, 22 Mar 2022 14:37:01 GMT",https://www.bbc.co.uk/news/60739336,https://www.bbc.co.uk/news/60739336?at_medium=...,"Wealthy Russians sanctioned by the US, EU and ..."
4,4,3712,Platinum Jubilee: 70 years of the Queen in 70 ...,"Wed, 01 Jun 2022 23:17:33 GMT",https://www.bbc.co.uk/news/uk-61660128,https://www.bbc.co.uk/news/uk-61660128?at_medi...,A quick look back at the Queen's 70 years on t...


In [101]:
# Print the column names, non-null counts and dtypes of the loaded frame.
bbc_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   1000 non-null   int64 
 1   index        1000 non-null   int64 
 2   title        1000 non-null   object
 3   pubDate      1000 non-null   object
 4   guid         1000 non-null   object
 5   link         1000 non-null   object
 6   description  1000 non-null   object
dtypes: int64(2), object(5)
memory usage: 54.8+ KB


Cleaning the Data

In [102]:
# Only the article titles are analysed, so keep just the "title" column
# in its own single-column dataframe.
titles_data = bbc_data["title"].to_frame()
titles_data.head(5)

Unnamed: 0,title
0,Can I refuse to work?
1,'Liz Truss the Brief?' World reacts to UK poli...
2,Rationing energy is nothing new for off-grid c...
3,The hunt for superyachts of sanctioned Russian...
4,Platinum Jubilee: 70 years of the Queen in 70 ...


In [103]:
# Lowercase: add a lower-cased copy of every title as a new column.
titles_data["lowercase"] = titles_data["title"].str.lower()
titles_data.head(5)

Unnamed: 0,title,lowercase
0,Can I refuse to work?,can i refuse to work?
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...


In [104]:
# Stopwords: drop English stopwords from the lower-cased titles.
# Titles are split on whitespace only, so a word with punctuation attached
# (e.g. "work?") is compared as-is against the stopword list.
en_stopwords = stopwords.words("english")
# A list membership test scans every entry for every word; a set gives
# constant-time lookups. `en_stopwords` itself is left unchanged.
_en_stopword_set = set(en_stopwords)
titles_data["no_stopwords"] = titles_data["lowercase"].apply(
    lambda title: " ".join(word for word in title.split() if word not in _en_stopword_set)
)
titles_data.head()

Unnamed: 0,title,lowercase,no_stopwords
0,Can I refuse to work?,can i refuse to work?,refuse work?
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds


In [105]:
# Punctuation: remove every character that is neither a word character nor
# whitespace. Hyphens go too, so "off-grid" becomes "offgrid".
import re
_punctuation = re.compile(r"[^\w\s]")
titles_data["no_punctuation"] = titles_data["no_stopwords"].apply(
    lambda text: _punctuation.sub("", text)
)
titles_data.head()

Unnamed: 0,title,lowercase,no_stopwords,no_punctuation
0,Can I refuse to work?,can i refuse to work?,refuse work?,refuse work
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,liz truss brief world reacts uk political turmoil
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationing energy nothing new offgrid community
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,hunt superyachts sanctioned russian oligarchs
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinum jubilee 70 years queen 70 seconds


In [106]:
# Tokenize: split both the untouched titles and the cleaned titles into
# token lists with NLTK's word_tokenize.
from nltk.tokenize import word_tokenize
for source_col, target_col in (("title", "tokenize_raw"), ("no_punctuation", "tokenize_clean")):
    titles_data[target_col] = titles_data[source_col].apply(word_tokenize)
titles_data.head()


Unnamed: 0,title,lowercase,no_stopwords,no_punctuation,tokenize_raw,tokenize_clean
0,Can I refuse to work?,can i refuse to work?,refuse work?,refuse work,"[Can, I, refuse, to, work, ?]","[refuse, work]"
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,liz truss brief world reacts uk political turmoil,"['Liz, Truss, the, Brief, ?, ', World, reacts,...","[liz, truss, brief, world, reacts, uk, politic..."
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationing energy nothing new offgrid community,"[Rationing, energy, is, nothing, new, for, off...","[rationing, energy, nothing, new, offgrid, com..."
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,hunt superyachts sanctioned russian oligarchs,"[The, hunt, for, superyachts, of, sanctioned, ...","[hunt, superyachts, sanctioned, russian, oliga..."
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinum jubilee 70 years queen 70 seconds,"[Platinum, Jubilee, :, 70, years, of, the, Que...","[platinum, jubilee, 70, years, queen, 70, seco..."


In [107]:
# Lemmatize: map every cleaned token to its WordNet lemma.
# NOTE(review): no part-of-speech is passed to lemmatize(), so NLTK falls back
# to its default POS -- confirm that is acceptable for verbs.
from nltk.stem import WordNetLemmatizer
net = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return the lemma of each token in ``tokens``, preserving order."""
    return [net.lemmatize(token) for token in tokens]


titles_data["lemmatize"] = titles_data["tokenize_clean"].apply(_lemmatize_tokens)
titles_data.head()

Unnamed: 0,title,lowercase,no_stopwords,no_punctuation,tokenize_raw,tokenize_clean,lemmatize
0,Can I refuse to work?,can i refuse to work?,refuse work?,refuse work,"[Can, I, refuse, to, work, ?]","[refuse, work]","[refuse, work]"
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,liz truss brief world reacts uk political turmoil,"['Liz, Truss, the, Brief, ?, ', World, reacts,...","[liz, truss, brief, world, reacts, uk, politic...","[liz, truss, brief, world, reacts, uk, politic..."
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationing energy nothing new offgrid community,"[Rationing, energy, is, nothing, new, for, off...","[rationing, energy, nothing, new, offgrid, com...","[rationing, energy, nothing, new, offgrid, com..."
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,hunt superyachts sanctioned russian oligarchs,"[The, hunt, for, superyachts, of, sanctioned, ...","[hunt, superyachts, sanctioned, russian, oliga...","[hunt, superyachts, sanctioned, russian, oliga..."
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinum jubilee 70 years queen 70 seconds,"[Platinum, Jubilee, :, 70, years, of, the, Que...","[platinum, jubilee, 70, years, queen, 70, seco...","[platinum, jubilee, 70, year, queen, 70, second]"


In [108]:
# Flatten the per-title token lists into one flat list each.
# sum(lists, []) allocates a new, longer list at every step (quadratic in the
# total token count); a nested comprehension visits each token exactly once
# and yields the same tokens in the same order.
raw_list = [token for tokens in titles_data["tokenize_raw"] for token in tokens]
clean_list = [token for tokens in titles_data["tokenize_clean"] for token in tokens]

POS Tagging (Part-of-Speech Tagging)

In [109]:
# Run the small English spaCy pipeline over all raw tokens joined by spaces.
# NOTE(review): every title ends up in one single document, so the model sees
# no boundary between headlines -- confirm this is intended.
nlp = spacy.load("en_core_web_sm")
joined_raw_text = " ".join(raw_list)
spacy_doc = nlp(joined_raw_text)
# Empty frame with the token / POS-tag columns; populated in the next cell.
pos_df = pd.DataFrame(columns=["token", "pos_tag"])


In [110]:
# Build the token / POS-tag table from the spaCy document.
# 1) Each token contributes one record: its text (token.text, a plain string)
#    and its part-of-speech tag (token.pos_).
# 2) All records are collected in a list first and the dataframe is created
#    once. Calling pd.concat inside the loop re-copies the whole frame for
#    every token, which is quadratic in the number of tokens.
# 3) Passing `columns` keeps the expected columns even if the document is empty.
pos_records = [{"token": token.text, "pos_tag": token.pos_} for token in spacy_doc]
pos_df = pd.DataFrame(pos_records, columns=["token", "pos_tag"])

In [111]:
# POS-tag counter: count how often each (token, POS tag) pair occurs and
# list the most frequent pairs first.
pos_pair_sizes = pos_df.groupby(["token", "pos_tag"]).size()
pos_df_counts = pos_pair_sizes.reset_index(name="count").sort_values(by="count", ascending=False)
pos_df_counts.head(10)


Unnamed: 0,token,pos_tag,count
95,:,PUNCT,543
8,',PUNCT,300
2897,in,ADP,187
4082,to,PART,175
3268,of,ADP,172
22,-,PUNCT,166
4043,the,DET,163
1856,and,CCONJ,147
15,'s,PART,143
97,?,PUNCT,130


In [112]:
# Reference implementation, kept to check that the cells above produce the
# intended output. It is a bare triple-quoted string, so none of the code
# inside it runs; the cell only echoes the text back as its output.
"""tokens_raw_list = sum(titles_data['tokenize_raw'], []) #unpack our lists into a single list
nlp = spacy.load('en_core_web_sm')
spacy_doc = nlp(' '.join(tokens_raw_list))
# extract the tokens and pos tags into a dataframe
pos_df = pd.DataFrame(columns=['token', 'pos_tag'])

for token in spacy_doc:
    pos_df = pd.concat([pos_df,
                       pd.DataFrame.from_records([{'token': token.text,'pos_tag': token.pos_}])], ignore_index=True)
# token frequency count
pos_df_counts = pos_df.groupby(['token','pos_tag']).size().reset_index(name='counts').sort_values(by='counts', ascending=False)
pos_df_counts.head(10)"""

"tokens_raw_list = sum(titles_data['tokenize_raw'], []) #unpack our lists into a single list\nnlp = spacy.load('en_core_web_sm')\nspacy_doc = nlp(' '.join(tokens_raw_list))\n# extract the tokens and pos tags into a dataframe\npos_df = pd.DataFrame(columns=['token', 'pos_tag'])\n\nfor token in spacy_doc:\n    pos_df = pd.concat([pos_df,\n                       pd.DataFrame.from_records([{'token': token.text,'pos_tag': token.pos_}])], ignore_index=True)\n# token frequency count\npos_df_counts = pos_df.groupby(['token','pos_tag']).size().reset_index(name='counts').sort_values(by='counts', ascending=False)\npos_df_counts.head(10)"

Personal note: When extracting tokens and their tags, always use token.text instead of just token. token is a spaCy object, and if you try to count those objects directly, each will be treated as unique, leading to a count of 1 for every token. By using token.text, you ensure you're working with the string value of the token, which is what you want for counting or analysis.

In [113]:
# Ten most frequent tokens that were tagged as nouns.
is_noun = pos_df_counts["pos_tag"] == "NOUN"
pos_df_noun = pos_df_counts.loc[is_noun].head(10)
pos_df_noun

Unnamed: 0,token,pos_tag,count
4267,war,NOUN,35
3552,record,NOUN,15
3416,police,NOUN,14
4316,win,NOUN,14
4356,year,NOUN,14
3061,living,NOUN,13
4009,tax,NOUN,13
3368,people,NOUN,12
2326,day,NOUN,12
2031,boss,NOUN,11


In [114]:
# Ten most frequent tokens that were tagged as verbs.
is_verb = pos_df_counts["pos_tag"] == "VERB"
pos_df_verb = pos_df_counts.loc[is_verb].head(10)
pos_df_verb

Unnamed: 0,token,pos_tag,count
3687,says,VERB,30
9,',VERB,14
2670,found,VERB,13
4317,win,VERB,12
4324,wins,VERB,10
2713,get,VERB,9
2388,dies,VERB,9
3990,take,VERB,8
2982,killed,VERB,8
3745,set,VERB,8


NER: Named Entity Recognition

In [127]:
# Collect every named entity in the document (spacy_doc.ents) together with
# its entity label, one record per entity.
# pd.isna() is True for a missing value, so `not pd.isna(...)` keeps only the
# entities that actually carry a label.
# The records are gathered in a list and converted to a dataframe once;
# calling pd.concat inside the loop re-copies the whole frame per entity.
# Passing `columns` keeps the expected columns even when no entity is found.
nes_records = [
    {"token": entity.text, "nes_tag": entity.label_}
    for entity in spacy_doc.ents
    if not pd.isna(entity.label_)
]
nes_df = pd.DataFrame(nes_records, columns=["token", "nes_tag"])
nes_df.head(10)

Unnamed: 0,token,nes_tag
0,Liz Truss,PERSON
1,UK,GPE
2,Rationing,PRODUCT
3,superyachts,CARDINAL
4,Russian,NORP
5,70 years,DATE
6,70 seconds,TIME
7,Red Bull,ORG
8,Formula 1 's,PRODUCT
9,World Triathlon Championship Series,EVENT


In [129]:
# Count how often each (entity text, entity label) pair occurs, most
# frequent first, and show the top ten.
nes_pair_sizes = nes_df.groupby(["token", "nes_tag"]).size()
nes_df_count = nes_pair_sizes.reset_index(name="count").sort_values(by="count", ascending=False)
nes_df_count.head(10)

Unnamed: 0,token,nes_tag,count
965,Ukraine,GPE,47
955,UK,GPE,36
329,England,GPE,32
819,Russian,NORP,20
957,US,GPE,19
1031,World Cup 2022,EVENT,18
1058,first,ORDINAL,13
918,The Papers,WORK_OF_ART,13
378,France,GPE,12
226,China,GPE,11


In [132]:
# Keep only entities labelled PERSON and show the ten most frequent.
is_person = nes_df_count["nes_tag"] == "PERSON"
nes_df_person = nes_df_count.loc[is_person]
nes_df_person.head(10)

Unnamed: 0,token,nes_tag,count
257,Covid,PERSON,9
757,Putin,PERSON,8
760,Queen,PERSON,8
563,Liz Truss,PERSON,6
169,Boris Johnson,PERSON,6
788,Rishi Sunak,PERSON,5
515,Jurgen Klopp,PERSON,4
762,Quiz,PERSON,4
325,Emma Raducanu,PERSON,4
581,Macron,PERSON,4
