In [59]:
import pandas as pd
import numpy as np

import re
import string

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import words as nltk_words
from nltk.corpus import stopwords
from nltk import pos_tag, word_tokenize

import spacy
import en_core_web_sm

nltk.download('stopwords', download_dir='../../Resources')
nltk.download('punkt', download_dir='../../Resources')
nltk.download('wordnet', download_dir='../../Resources')
nltk.download('omw-1.4', download_dir='../../Resources')
nltk.download('words', download_dir='../../Resources')
nltk.download('averaged_perceptron_tagger', download_dir='../../Resources')

[nltk_data] Downloading package stopwords to ../../Resources...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to ../../Resources...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to ../../Resources...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to ../../Resources...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package words to ../../Resources...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     ../../Resources...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
nltk.data.path.append('../../Resources')

In [3]:
nlp = en_core_web_sm.load()

# Hyposesis:
- remove all the punctuation and stopwords 
- remove all the punctuation, stopwords and lemmatize
-------------- 
_potential problem is that number of keywords is pretty high. Can tackle that in 2 ways:
split the data into 2 data points with random sampling, or just by half, or using manual labelling. Or just drop random sub-set of words, or not random_
- named entity extraction
--------------
_potential problem, same entity for many outputs, e.g. ('War in Ukraine: Taking cover in a town under attack',
 'Jeremy Bowen was on the frontline in Irpin, as residents came under Russian fire while trying to flee.')
 (the only entity at the beginning is Ukraine). When keep only >= 3 entities, dataset becomes pretty small_

In [36]:
data = pd.read_csv('Data/bbc_news.csv')
data.head(5)

Unnamed: 0,title,pubDate,guid,link,description
0,Ukraine: Angry Zelensky vows to punish Russian...,"Mon, 07 Mar 2022 08:01:56 GMT",https://www.bbc.co.uk/news/world-europe-60638042,https://www.bbc.co.uk/news/world-europe-606380...,The Ukrainian president says the country will ...
1,War in Ukraine: Taking cover in a town under a...,"Sun, 06 Mar 2022 22:49:58 GMT",https://www.bbc.co.uk/news/world-europe-60641873,https://www.bbc.co.uk/news/world-europe-606418...,"Jeremy Bowen was on the frontline in Irpin, as..."
2,Ukraine war 'catastrophic for global food',"Mon, 07 Mar 2022 00:14:42 GMT",https://www.bbc.co.uk/news/business-60623941,https://www.bbc.co.uk/news/business-60623941?a...,One of the world's biggest fertiliser firms sa...
3,Manchester Arena bombing: Saffie Roussos's par...,"Mon, 07 Mar 2022 00:05:40 GMT",https://www.bbc.co.uk/news/uk-60579079,https://www.bbc.co.uk/news/uk-60579079?at_medi...,The parents of the Manchester Arena bombing's ...
4,Ukraine conflict: Oil price soars to highest l...,"Mon, 07 Mar 2022 08:15:53 GMT",https://www.bbc.co.uk/news/business-60642786,https://www.bbc.co.uk/news/business-60642786?a...,Consumers are feeling the impact of higher ene...


In [37]:
data = data.drop_duplicates(subset=['title']).drop_duplicates(subset=['guid']).drop_duplicates(subset=['description']).reset_index(drop=True)
data.head(5)

Unnamed: 0,title,pubDate,guid,link,description
0,Ukraine: Angry Zelensky vows to punish Russian...,"Mon, 07 Mar 2022 08:01:56 GMT",https://www.bbc.co.uk/news/world-europe-60638042,https://www.bbc.co.uk/news/world-europe-606380...,The Ukrainian president says the country will ...
1,War in Ukraine: Taking cover in a town under a...,"Sun, 06 Mar 2022 22:49:58 GMT",https://www.bbc.co.uk/news/world-europe-60641873,https://www.bbc.co.uk/news/world-europe-606418...,"Jeremy Bowen was on the frontline in Irpin, as..."
2,Ukraine war 'catastrophic for global food',"Mon, 07 Mar 2022 00:14:42 GMT",https://www.bbc.co.uk/news/business-60623941,https://www.bbc.co.uk/news/business-60623941?a...,One of the world's biggest fertiliser firms sa...
3,Manchester Arena bombing: Saffie Roussos's par...,"Mon, 07 Mar 2022 00:05:40 GMT",https://www.bbc.co.uk/news/uk-60579079,https://www.bbc.co.uk/news/uk-60579079?at_medi...,The parents of the Manchester Arena bombing's ...
4,Ukraine conflict: Oil price soars to highest l...,"Mon, 07 Mar 2022 08:15:53 GMT",https://www.bbc.co.uk/news/business-60642786,https://www.bbc.co.uk/news/business-60642786?a...,Consumers are feeling the impact of higher ene...


In [38]:
STOPWORDS = stopwords.words('english')
pd.Series(STOPWORDS)

0             i
1            me
2            my
3        myself
4            we
         ...   
174     weren't
175         won
176       won't
177      wouldn
178    wouldn't
Length: 179, dtype: object

In [39]:
data.loc[100, 'title']



In [40]:
lemmatizer = WordNetLemmatizer()
sent = data.loc[100, 'title']
pos_tag(word_tokenize(sent))

[('Cost', 'NN'),
 ('of', 'IN'),
 ('living', 'NN'),
 (':', ':'),
 ('UK', 'NNP'),
 ('faces', 'VBZ'),
 ('biggest', 'JJS'),
 ('income', 'NN'),
 ('squeeze', 'NN'),
 ('in', 'IN'),
 ('nearly', 'RB'),
 ('50', 'CD'),
 ('years', 'NNS')]

In [41]:
def transform(sentence):
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word, tag in pos_tag(word_tokenize(sentence)):
        wntag = tag[0].lower()
        wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
        if not wntag:
            lemma = word
        else:
            lemma = lemmatizer.lemmatize(word, wntag)
        lemmas.append(lemma)
    return ' '.join(lemmas)

In [74]:
def clear(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub(f'[{re.escape(string.punctuation)}]', '', sentence)
    sentence = re.sub(r'\s+0+[0-9]*', '', sentence)
    return ' '.join(filter(lambda w: w not in STOPWORDS, nltk.tokenize.word_tokenize(sentence)))

In [75]:
def remove_artifacts(sentence):
    sentence = re.sub(r'\s*[0-9]+', '', sentence)
    return sentence

In [43]:
transform(clear(data.loc[100, 'title']))

'cost living warn uk face biggest income squeeze nearly 50 year'

In [44]:
doc = nlp(data.loc[100, 'title'])
[(X.text, X.label_) for X in doc.ents]

[('UK', 'GPE'), ('nearly 50 years', 'DATE')]

# Punctuation and stopwords

In [45]:
data['combined'] = data['title'] + ' ' + data['description']
data['combined']

0       Ukraine: Angry Zelensky vows to punish Russian...
1       War in Ukraine: Taking cover in a town under a...
2       Ukraine war 'catastrophic for global food' One...
3       Manchester Arena bombing: Saffie Roussos's par...
4       Ukraine conflict: Oil price soars to highest l...
                              ...                        
7359    Premiership: Harlequins 27-30 Saracens - Sarri...
7360    The queue - and the expert who helped plan it ...
7361    The cowboy and the Queen who bonded over horse...
7362    Queen amused by an American's breach of protoc...
7363    Queen Elizabeth II's special connection with W...
Name: combined, Length: 7364, dtype: object

In [46]:
data_ps = pd.DataFrame()
data_ps['combined'] = data['combined'].apply(clear)
data_ps

Unnamed: 0,combined
0,ukraine angry zelensky vows punish russian atr...
1,war ukraine taking cover town attack jeremy bo...
2,ukraine war catastrophic global food one world...
3,manchester arena bombing saffie roussoss paren...
4,ukraine conflict oil price soars highest level...
...,...
7359,premiership harlequins 2730 saracens sarries w...
7360,queue expert helped plan thousands people line...
7361,cowboy queen bonded horses sitting heads state...
7362,queen amused americans breach protocol monarch...


In [47]:
data_ps['combined'].apply(lambda x: len(x.split(' '))).describe()

count    7364.000000
mean       18.618957
std         4.170689
min         4.000000
25%        16.000000
50%        18.000000
75%        20.000000
max        38.000000
Name: combined, dtype: float64

In [48]:
data_ps.to_csv('./Data/updated/punctuation_stopwords.csv', index=False)

# Punctuation, stopwords and lemmatization

In [49]:
data_psl = pd.DataFrame()
data_psl['combined'] = data['combined'].apply(clear).apply(transform)
data_psl

Unnamed: 0,combined
0,ukraine angry zelensky vow punish russian atro...
1,war ukraine take cover town attack jeremy bowe...
2,ukraine war catastrophic global food one world...
3,manchester arena bomb saffie roussoss parent h...
4,ukraine conflict oil price soar highest level ...
...,...
7359,premiership harlequin 2730 saracen sarries win...
7360,queue expert help plan thousand people line mi...
7361,cowboy queen bond horse sit head state monday ...
7362,queen amuse american breach protocol monarch v...


In [50]:
data_psl.to_csv('./Data/updated/punctuation_stopwords_lemmatization.csv', index=False)

# Named Entity Extraction

In [76]:
data_nee = pd.DataFrame()
data_nee['combined'] = data['combined'].apply(lambda x: ' '.join([e.text for e in nlp(x).ents])).apply(remove_artifacts)
data_nee

Unnamed: 0,combined
0,Ukraine Angry Zelensky Russian Ukrainian
1,Ukraine Jeremy Bowen Irpin Russian
2,Ukraine
3,Manchester Arena Saffie Roussos's Manchester A...
4,Ukraine Consumers
...,...
7359,Saracens London Twickenham Stoop
7360,Thousands
7361,Monday Monty Roberts California
7362,American US Alice Frazier


In [77]:
data_nee = data_nee[data_nee['combined'].apply(lambda x: len(x.split(' '))) >= 2]
data_nee

Unnamed: 0,combined
0,Ukraine Angry Zelensky Russian Ukrainian
1,Ukraine Jeremy Bowen Irpin Russian
3,Manchester Arena Saffie Roussos's Manchester A...
4,Ukraine Consumers
5,Ukraine PM Boris Johnson Canadian Dutch
...,...
7358,Tottenham Son Heung-min six-minute-year-old
7359,Saracens London Twickenham Stoop
7361,Monday Monty Roberts California
7362,American US Alice Frazier


In [78]:
cv = CountVectorizer(analyzer='word')
cv_data = cv.fit_transform(data_nee['combined'])
dtm_data = pd.DataFrame(cv_data.toarray(), columns=cv.get_feature_names_out())
dtm_data.index = data_nee.index
dtm_data.head(5)

Unnamed: 0,aamir,aanoch,aardman,aaron,abba,abbey,abbie,abbington,abbott,abby,...,zookeepers,zoom,zsl,zuckerberg,zulu,zurich,zuu,zverev,zwelithini,úsuga
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [79]:
data_nee.to_csv('./data/updated/named_entity_extraction_2.csv', index=False)