In [1]:
import pandas as pd
from nltk.corpus import stopwords
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import spacy



In [2]:
data = pd.read_csv('bbc_news.csv')
print(data.head())

   Unnamed: 0  index                                              title  \
0           0   6684                              Can I refuse to work?   
1           1   9267  'Liz Truss the Brief?' World reacts to UK poli...   
2           2   7387  Rationing energy is nothing new for off-grid c...   
3           3    767  The hunt for superyachts of sanctioned Russian...   
4           4   3712  Platinum Jubilee: 70 years of the Queen in 70 ...   

                         pubDate  \
0  Wed, 10 Aug 2022 15:46:18 GMT   
1  Mon, 17 Oct 2022 11:35:12 GMT   
2  Wed, 31 Aug 2022 05:20:18 GMT   
3  Tue, 22 Mar 2022 14:37:01 GMT   
4  Wed, 01 Jun 2022 23:17:33 GMT   

                                                guid  \
0       https://www.bbc.co.uk/news/business-62147992   
1          https://www.bbc.co.uk/news/world-63285480   
2  https://www.bbc.co.uk/news/uk-scotland-highlan...   
3                https://www.bbc.co.uk/news/60739336   
4             https://www.bbc.co.uk/news/uk-61660128

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   1000 non-null   int64 
 1   index        1000 non-null   int64 
 2   title        1000 non-null   object
 3   pubDate      1000 non-null   object
 4   guid         1000 non-null   object
 5   link         1000 non-null   object
 6   description  1000 non-null   object
dtypes: int64(2), object(5)
memory usage: 54.8+ KB


In [4]:
# Pick title column
titles = pd.DataFrame(data['title'])
titles.head(10)

Unnamed: 0,title
0,Can I refuse to work?
1,'Liz Truss the Brief?' World reacts to UK poli...
2,Rationing energy is nothing new for off-grid c...
3,The hunt for superyachts of sanctioned Russian...
4,Platinum Jubilee: 70 years of the Queen in 70 ...
5,Red Bull found guilty of breaking Formula 1's ...
6,World Triathlon Championship Series: Flora Duf...
7,Terry Hall: Coventry scooter ride-out pays tri...
8,Post Office and Fujitsu to face inquiry over H...
9,'Pavement parking frightens me'


Clean Data

In [5]:
# Change title to Lowercase
titles['lowercase'] = titles['title'].str.lower()
titles.head(10)

Unnamed: 0,title,lowercase
0,Can I refuse to work?,can i refuse to work?
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...
5,Red Bull found guilty of breaking Formula 1's ...,red bull found guilty of breaking formula 1's ...
6,World Triathlon Championship Series: Flora Duf...,world triathlon championship series: flora duf...
7,Terry Hall: Coventry scooter ride-out pays tri...,terry hall: coventry scooter ride-out pays tri...
8,Post Office and Fujitsu to face inquiry over H...,post office and fujitsu to face inquiry over h...
9,'Pavement parking frightens me','pavement parking frightens me'


In [6]:
# Remove all stopwords
en_stopwords = stopwords.words('english')
titles['no_stopwords'] = titles['lowercase'].apply(lambda x: ' '.join(
    [word for word in x.split() if word not in (en_stopwords)]))
titles.head(10)

Unnamed: 0,title,lowercase,no_stopwords
0,Can I refuse to work?,can i refuse to work?,refuse work?
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds
5,Red Bull found guilty of breaking Formula 1's ...,red bull found guilty of breaking formula 1's ...,red bull found guilty breaking formula 1's bud...
6,World Triathlon Championship Series: Flora Duf...,world triathlon championship series: flora duf...,world triathlon championship series: flora duf...
7,Terry Hall: Coventry scooter ride-out pays tri...,terry hall: coventry scooter ride-out pays tri...,terry hall: coventry scooter ride-out pays tri...
8,Post Office and Fujitsu to face inquiry over H...,post office and fujitsu to face inquiry over h...,post office fujitsu face inquiry horizon scandal
9,'Pavement parking frightens me','pavement parking frightens me','pavement parking frightens me'


In [7]:
# Remove Punctuations
titles['no_stopwords_no_punctuation'] = titles.apply(
    lambda x: re.sub(r"([^\w\s])", "", x['no_stopwords']), axis=1)

titles.head(10)

Unnamed: 0,title,lowercase,no_stopwords,no_stopwords_no_punctuation
0,Can I refuse to work?,can i refuse to work?,refuse work?,refuse work
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,liz truss brief world reacts uk political turmoil
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationing energy nothing new offgrid community
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,hunt superyachts sanctioned russian oligarchs
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinum jubilee 70 years queen 70 seconds
5,Red Bull found guilty of breaking Formula 1's ...,red bull found guilty of breaking formula 1's ...,red bull found guilty breaking formula 1's bud...,red bull found guilty breaking formula 1s budg...
6,World Triathlon Championship Series: Flora Duf...,world triathlon championship series: flora duf...,world triathlon championship series: flora duf...,world triathlon championship series flora duff...
7,Terry Hall: Coventry scooter ride-out pays tri...,terry hall: coventry scooter ride-out pays tri...,terry hall: coventry scooter ride-out pays tri...,terry hall coventry scooter rideout pays tribu...
8,Post Office and Fujitsu to face inquiry over H...,post office and fujitsu to face inquiry over h...,post office fujitsu face inquiry horizon scandal,post office fujitsu face inquiry horizon scandal
9,'Pavement parking frightens me','pavement parking frightens me','pavement parking frightens me',pavement parking frightens me


In [8]:
# Tokenize
titles['tokens_raw'] = titles.apply(lambda x: word_tokenize(x['title']), axis=1)
titles['tokens_clean'] = titles.apply(lambda x: word_tokenize(x['no_stopwords_no_punctuation']), axis=1)
titles.head(10)

Unnamed: 0,title,lowercase,no_stopwords,no_stopwords_no_punctuation,tokens_raw,tokens_clean
0,Can I refuse to work?,can i refuse to work?,refuse work?,refuse work,"[Can, I, refuse, to, work, ?]","[refuse, work]"
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,liz truss brief world reacts uk political turmoil,"['Liz, Truss, the, Brief, ?, ', World, reacts,...","[liz, truss, brief, world, reacts, uk, politic..."
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationing energy nothing new offgrid community,"[Rationing, energy, is, nothing, new, for, off...","[rationing, energy, nothing, new, offgrid, com..."
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,hunt superyachts sanctioned russian oligarchs,"[The, hunt, for, superyachts, of, sanctioned, ...","[hunt, superyachts, sanctioned, russian, oliga..."
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinum jubilee 70 years queen 70 seconds,"[Platinum, Jubilee, :, 70, years, of, the, Que...","[platinum, jubilee, 70, years, queen, 70, seco..."
5,Red Bull found guilty of breaking Formula 1's ...,red bull found guilty of breaking formula 1's ...,red bull found guilty breaking formula 1's bud...,red bull found guilty breaking formula 1s budg...,"[Red, Bull, found, guilty, of, breaking, Formu...","[red, bull, found, guilty, breaking, formula, ..."
6,World Triathlon Championship Series: Flora Duf...,world triathlon championship series: flora duf...,world triathlon championship series: flora duf...,world triathlon championship series flora duff...,"[World, Triathlon, Championship, Series, :, Fl...","[world, triathlon, championship, series, flora..."
7,Terry Hall: Coventry scooter ride-out pays tri...,terry hall: coventry scooter ride-out pays tri...,terry hall: coventry scooter ride-out pays tri...,terry hall coventry scooter rideout pays tribu...,"[Terry, Hall, :, Coventry, scooter, ride-out, ...","[terry, hall, coventry, scooter, rideout, pays..."
8,Post Office and Fujitsu to face inquiry over H...,post office and fujitsu to face inquiry over h...,post office fujitsu face inquiry horizon scandal,post office fujitsu face inquiry horizon scandal,"[Post, Office, and, Fujitsu, to, face, inquiry...","[post, office, fujitsu, face, inquiry, horizon..."
9,'Pavement parking frightens me','pavement parking frightens me','pavement parking frightens me',pavement parking frightens me,"['Pavement, parking, frightens, me, ']","[pavement, parking, frightens, me]"


In [9]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
titles['tokens_clean_lemmantized'] = titles['tokens_clean'].apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])
titles.head(10)

Unnamed: 0,title,lowercase,no_stopwords,no_stopwords_no_punctuation,tokens_raw,tokens_clean,tokens_clean_lemmantized
0,Can I refuse to work?,can i refuse to work?,refuse work?,refuse work,"[Can, I, refuse, to, work, ?]","[refuse, work]","[refuse, work]"
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,liz truss brief world reacts uk political turmoil,"['Liz, Truss, the, Brief, ?, ', World, reacts,...","[liz, truss, brief, world, reacts, uk, politic...","[liz, truss, brief, world, reacts, uk, politic..."
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationing energy nothing new offgrid community,"[Rationing, energy, is, nothing, new, for, off...","[rationing, energy, nothing, new, offgrid, com...","[rationing, energy, nothing, new, offgrid, com..."
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,hunt superyachts sanctioned russian oligarchs,"[The, hunt, for, superyachts, of, sanctioned, ...","[hunt, superyachts, sanctioned, russian, oliga...","[hunt, superyachts, sanctioned, russian, oliga..."
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinum jubilee 70 years queen 70 seconds,"[Platinum, Jubilee, :, 70, years, of, the, Que...","[platinum, jubilee, 70, years, queen, 70, seco...","[platinum, jubilee, 70, year, queen, 70, second]"
5,Red Bull found guilty of breaking Formula 1's ...,red bull found guilty of breaking formula 1's ...,red bull found guilty breaking formula 1's bud...,red bull found guilty breaking formula 1s budg...,"[Red, Bull, found, guilty, of, breaking, Formu...","[red, bull, found, guilty, breaking, formula, ...","[red, bull, found, guilty, breaking, formula, ..."
6,World Triathlon Championship Series: Flora Duf...,world triathlon championship series: flora duf...,world triathlon championship series: flora duf...,world triathlon championship series flora duff...,"[World, Triathlon, Championship, Series, :, Fl...","[world, triathlon, championship, series, flora...","[world, triathlon, championship, series, flora..."
7,Terry Hall: Coventry scooter ride-out pays tri...,terry hall: coventry scooter ride-out pays tri...,terry hall: coventry scooter ride-out pays tri...,terry hall coventry scooter rideout pays tribu...,"[Terry, Hall, :, Coventry, scooter, ride-out, ...","[terry, hall, coventry, scooter, rideout, pays...","[terry, hall, coventry, scooter, rideout, pay,..."
8,Post Office and Fujitsu to face inquiry over H...,post office and fujitsu to face inquiry over h...,post office fujitsu face inquiry horizon scandal,post office fujitsu face inquiry horizon scandal,"[Post, Office, and, Fujitsu, to, face, inquiry...","[post, office, fujitsu, face, inquiry, horizon...","[post, office, fujitsu, face, inquiry, horizon..."
9,'Pavement parking frightens me','pavement parking frightens me','pavement parking frightens me',pavement parking frightens me,"['Pavement, parking, frightens, me, ']","[pavement, parking, frightens, me]","[pavement, parking, frightens, me]"


In [10]:
# Put the tokens into a a single list
tokens_raw_list = sum(titles['tokens_raw'], [])
tokens_clean_list = sum(titles['tokens_clean_lemmantized'], [])

POS TAGGING

In [11]:
nlp = spacy.load('en_core_web_sm')

# Create a spacy doc from the clean text - better for POS tagging
spacy_doc = nlp(' '.join(tokens_clean_list))

In [12]:
# Extract the tokens and POS tags into a dataframe
pos_df = pd.DataFrame(columns=['Token', 'pos_tag'])

for token in spacy_doc:
    pos_df = pd.concat([pos_df, pd.DataFrame.from_records(
        [{'token': token.text, 'pos_tag': token.pos_}])], ignore_index=True)

In [13]:
# Token frequency count
pos_df_counts = pos_df.groupby(['token', 'pos_tag']).size().reset_index(name='counts').sort_values(by='counts', ascending=False)
print(pos_df_counts.head(10))

        token pos_tag  counts
30       2022     NUM      47
1162  england   PROPN      45
870       cup   PROPN      39
3056      say    VERB      37
3707       uk   PROPN      37
3840      war    NOUN      34
2386      new     ADJ      31
3948    world    NOUN      30
3949    world   PROPN      26
3710  ukraine   PROPN      23


In [14]:
# MOST COMMON NOUNS
nouns = pos_df_counts[pos_df_counts.pos_tag == 'NOUN'][0:10]
print(nouns)

         token pos_tag  counts
3840       war    NOUN      34
3948     world    NOUN      30
2136       man    NOUN      22
907        day    NOUN      21
3973      year    NOUN      20
1158    energy    NOUN      17
2847    record    NOUN      17
3935     woman    NOUN      16
1130  election    NOUN      16
3870      week    NOUN      16


In [15]:
# MOST COMMON VERBS
verbs = pos_df_counts[pos_df_counts.pos_tag == 'VERB'][0:10]
print(verbs)

        token pos_tag  counts
3056      say    VERB      37
3711  ukraine    VERB      22
358      beat    VERB      13
2133     make    VERB      13
3461     take    VERB      13
1651      hit    VERB      13
1459      get    VERB      13
1380    found    VERB      13
1473     give    VERB      11
3918      win    VERB      10


NAMED ENTITY RECOGNITION TAGGING

In [16]:
# Extract the tokens and entity tags into a dataframe
ner_df = pd.DataFrame(columns=['token', 'ner_tag'])

for token in spacy_doc.ents:
    if pd.isna(token.label_) is False:
        ner_df = pd.concat([ner_df, pd.DataFrame.from_records(
            [{'token': token.text, 'ner_tag': token.label_}])], ignore_index=True)
        
ner_df.head(20)

Unnamed: 0,token,ner_tag
0,russian,NORP
1,70 year,DATE
2,70 second,TIME
3,bull,ORG
4,1,CARDINAL
5,georgia taylorbrown womens,ORG
6,terry hall,PERSON
7,six,CARDINAL
8,99,CARDINAL
9,jubilee beacon,PERSON


In [17]:
# NER Token frequency count
ner_df_counts = ner_df.groupby(['token', 'ner_tag']).size().reset_index(name='counts').sort_values(by='counts', ascending=False)

ner_df_counts.head(12)

Unnamed: 0,token,ner_tag,counts
34,2022,CARDINAL,30
451,russian,NORP,25
223,first,ORDINAL,15
35,2022,DATE,11
450,russia,GPE,10
233,france,GPE,10
339,london,GPE,9
519,tory,NORP,9
531,uk,GPE,8
409,one,CARDINAL,8


In [18]:
# Most common people
people = ner_df_counts[ner_df_counts.ner_tag == 'PERSON'][0:10]
people

Unnamed: 0,token,ner_tag,counts
432,putin,PERSON,7
127,boris johnson,PERSON,5
106,antonio conte,PERSON,3
102,andy murray,PERSON,3
254,hodgkinson,PERSON,2
249,harry,PERSON,2
120,beth,PERSON,2
160,chris,PERSON,2
217,eu,PERSON,2
252,harry meghan,PERSON,2


In [19]:
# Most common places
places = ner_df_counts[ner_df_counts.ner_tag == 'GPE'][0:10]
places

Unnamed: 0,token,ner_tag,counts
450,russia,GPE,10
233,france,GPE,10
339,london,GPE,9
531,uk,GPE,8
211,england,GPE,7
156,china,GPE,7
479,south africa,GPE,7
335,liverpool,GPE,7
109,australia,GPE,7
261,india,GPE,6
