In [None]:
import pandas as pd
from nltk.corpus import stopwords
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import spacy

In [None]:
# Load the BBC news dataset from a CSV file located in the working directory.
data = pd.read_csv('bbc_news.csv')
# Plain-text preview of the first rows to sanity-check the load.
print(data.head())

In [None]:
# Print a structural summary of the loaded frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Pick title column
# Keep only the headline text as a one-column DataFrame; the cleaning cells
# below add their derived columns to this frame.
titles = data['title'].to_frame()
titles.head(10)

Clean Data

In [None]:
# Change title to Lowercase
# Adds a lower-cased copy of every title; the stopword filter reads this column.
titles = titles.assign(lowercase=titles['title'].str.lower())
titles.head(10)

In [None]:
# Remove all stopwords
# `en_stopwords` stays a list (as returned by NLTK); membership is tested
# against a set built from it so each word lookup is O(1) instead of a linear
# scan of the whole stopword list.
en_stopwords = stopwords.words('english')
stopword_lookup = set(en_stopwords)
titles['no_stopwords'] = titles['lowercase'].apply(
    lambda text: ' '.join(
        word for word in text.split() if word not in stopword_lookup))
titles.head(10)

In [None]:
# Remove Punctuations
# Delete every character that is neither a word character nor whitespace.
# The vectorised string accessor works on the one source column directly,
# instead of materialising a row object per title via DataFrame.apply(axis=1).
titles['no_stopwords_no_punctuation'] = titles['no_stopwords'].str.replace(
    r"([^\w\s])", "", regex=True)

titles.head(10)

In [None]:
# Tokenize
# tokens_raw is built from the untouched headline; tokens_clean from the
# lower-cased, stopword-free, punctuation-free text.
# Applying over the single source column avoids constructing a row object per
# title, which DataFrame.apply(axis=1) does just to read one field.
titles['tokens_raw'] = titles['title'].apply(word_tokenize)
titles['tokens_clean'] = titles['no_stopwords_no_punctuation'].apply(word_tokenize)
titles.head(10)

In [None]:
# Lemmatization
# Map every cleaned token through the WordNet lemmatizer (called with its
# default part-of-speech argument), keeping one token list per title.
lemmatizer = WordNetLemmatizer()
titles['tokens_clean_lemmantized'] = titles['tokens_clean'].apply(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens)))
titles.head(10)

In [None]:
# Put the tokens into a single list
# A nested comprehension flattens the per-title lists in linear time;
# sum(lists, []) re-copies the growing accumulator on every addition and is
# quadratic in the total number of tokens.
tokens_raw_list = [token for tokens in titles['tokens_raw'] for token in tokens]
tokens_clean_list = [
    token for tokens in titles['tokens_clean_lemmantized'] for token in tokens]

POS TAGGING

In [None]:
# Load the small English spaCy pipeline.
nlp = spacy.load('en_core_web_sm')

# Create a spacy doc from the clean text - better for POS tagging
# Every cleaned token from every title is joined into one space-separated
# string and processed as a single document.
# NOTE(review): spaCy refuses texts longer than nlp.max_length - confirm the
# joined text stays below that limit for this dataset.
clean_text = ' '.join(tokens_clean_list)
spacy_doc = nlp(clean_text)

In [None]:
# Extract the tokens and POS tags into a dataframe
# Build the frame once from (text, tag) records. Growing a DataFrame with
# pd.concat inside the loop is quadratic, and seeding it with a 'Token' column
# while appending 'token' records left a stray all-NaN 'Token' column behind.
pos_df = pd.DataFrame(
    [(token.text, token.pos_) for token in spacy_doc],
    columns=['token', 'pos_tag'],
)

In [None]:
# Token frequency count
# Number of occurrences of each (token, POS tag) pair, most frequent first.
pos_df_counts = (
    pos_df
    .groupby(['token', 'pos_tag'])
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)
print(pos_df_counts.head(10))

In [None]:
# MOST COMMON NOUNS
# First ten rows tagged NOUN, in the row order of pos_df_counts.
nouns = pos_df_counts.loc[pos_df_counts['pos_tag'] == 'NOUN'].head(10)
print(nouns)

In [None]:
# MOST COMMON VERBS
# First ten rows tagged VERB, in the row order of pos_df_counts.
verbs = pos_df_counts.loc[pos_df_counts['pos_tag'] == 'VERB'].head(10)
print(verbs)

NAMED ENTITY RECOGNITION TAGGING

In [None]:
# Extract the tokens and entity tags into a dataframe
# spacy_doc.ents yields entity spans (not single tokens); each contributes its
# text and label. The frame is built once from the records: the previous
# per-entity pd.concat was quadratic, and its pd.isna(label) guard never
# filtered anything because span labels are strings.
ner_df = pd.DataFrame(
    [(ent.text, ent.label_) for ent in spacy_doc.ents],
    columns=['token', 'ner_tag'],
)

ner_df.head(20)

In [None]:
# NER Token frequency count
# Number of occurrences of each (entity text, entity label) pair, most
# frequent first.
ner_df_counts = (
    ner_df
    .groupby(['token', 'ner_tag'])
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)

ner_df_counts.head(12)

In [None]:
# Most common people
# First ten rows labelled PERSON, in the row order of ner_df_counts.
people = ner_df_counts.loc[ner_df_counts['ner_tag'] == 'PERSON'].head(10)
people

In [None]:
# Most common places
# First ten rows labelled GPE, in the row order of ner_df_counts.
places = ner_df_counts.loc[ner_df_counts['ner_tag'] == 'GPE'].head(10)
places