In [17]:
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from pathlib import Path

import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# spacy, textblob and nltk for language processing
from textblob import TextBlob, Word
import nltk
from nltk.stem.snowball import SnowballStemmer

# sklearn for feature extraction & modeling
from sklearn.feature_extraction.text import CountVectorizer

# download NLTK resources
nltk.download('punkt')
sns.set_style('white')

np.random.seed(42)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\19498\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [18]:
path = Path('data', 'bbc')
files = sorted(list(path.glob('**/*.txt')))
doc_list = []
for i, file in enumerate(files):
    topic = file.parts[-2]
    article = file.read_text(encoding='latin1').split('\n')
    heading = article[0].strip()
    body = ' '.join([l.strip() for l in article[1:]]).strip()
    doc_list.append([topic, heading, body])
docs = pd.DataFrame(doc_list, columns=['topic', 'heading', 'body'])
docs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   topic    2225 non-null   object
 1   heading  2225 non-null   object
 2   body     2225 non-null   object
dtypes: object(3)
memory usage: 52.3+ KB


In [19]:
# Select random article
article = docs.sample(1).squeeze()
parsed_body = TextBlob(article.body)

print(f'Topic:\t{article.topic.capitalize()}\n\n{article.heading}\n')
print(article.body.strip())

Topic:	Business

UK house prices dip in November

UK house prices dipped slightly in November, the Office of the Deputy Prime Minister (ODPM) has said.  The average house price fell marginally to Â£180,226, from Â£180,444 in October. Recent evidence has suggested that the UK housing market is slowing after interest rate increases, and economists forecast a drop in prices during 2005. But while the monthly figures may hint at a cooling of the market, annual house price inflation is still strong, up 13.8% in the year to November. Economists, however, forecast that ODPM figures are likely to show a weakening in annual house price growth in coming months. "Overall, the housing market activity is slowing down and that is backed up by the mortgage lending and the mortgage approvals data," said Mark Miller, at HBOS Treasury Services. "The ODPM data is a fairly lagging indicator."  The figures come after the Bank of England said the number of mortgages approved in the UK has fallen to the lowe

In [20]:
parsed_body.sentences

[Sentence("UK house prices dipped slightly in November, the Office of the Deputy Prime Minister (ODPM) has said."),
 Sentence("The average house price fell marginally to Â£180,226, from Â£180,444 in October."),
 Sentence("Recent evidence has suggested that the UK housing market is slowing after interest rate increases, and economists forecast a drop in prices during 2005."),
 Sentence("But while the monthly figures may hint at a cooling of the market, annual house price inflation is still strong, up 13.8% in the year to November."),
 Sentence("Economists, however, forecast that ODPM figures are likely to show a weakening in annual house price growth in coming months."),
 Sentence(""Overall, the housing market activity is slowing down and that is backed up by the mortgage lending and the mortgage approvals data," said Mark Miller, at HBOS Treasury Services."),
 Sentence(""The ODPM data is a fairly lagging indicator.""),
 Sentence("The figures come after the Bank of England said the numb

In [21]:
parsed_body.sentiment_assessments

Sentiment(polarity=0.10447845804988663, subjectivity=0.44258786848072557, assessments=[(['slightly'], -0.16666666666666666, 0.16666666666666666, None), (['average'], -0.15, 0.39999999999999997, None), (['recent'], 0.0, 0.25, None), (['strong'], 0.4333333333333333, 0.7333333333333333, None), (['likely'], 0.0, 1.0, None), (['overall'], 0.0, 0.0, None), (['down'], -0.15555555555555559, 0.2888888888888889, None), (['fairly'], 0.7, 0.9, None), (['nearly'], 0.1, 0.4, None), (['last'], 0.0, 0.06666666666666667, None), (['first'], 0.25, 0.3333333333333333, None), (['rose'], 0.6, 0.95, None), (['whole'], 0.2, 0.4, None), (['only'], 0.0, 1.0, None), (['second'], 0.0, 0.0, None), (['half'], -0.16666666666666666, 0.16666666666666666, None), (['overall'], 0.0, 0.0, None), (['large'], 0.21428571428571427, 0.42857142857142855, None), (['recent'], 0.0, 0.25, None), (['rose'], 0.6, 0.95, None), (['same'], 0.0, 0.125, None), (['average'], -0.15, 0.39999999999999997, None), (['more'], 0.5, 0.5, None), ([

In [22]:
# Stemming
stemmer = SnowballStemmer('english')

# stem each word
[(word, stemmer.stem(word)) for i, word in enumerate(parsed_body.words) if word.lower() != stemmer.stem(parsed_body.words[i])]

[('house', 'hous'),
 ('prices', 'price'),
 ('dipped', 'dip'),
 ('slightly', 'slight'),
 ('November', 'novemb'),
 ('Office', 'offic'),
 ('Deputy', 'deputi'),
 ('Minister', 'minist'),
 ('average', 'averag'),
 ('house', 'hous'),
 ('marginally', 'margin'),
 ('October', 'octob'),
 ('evidence', 'evid'),
 ('suggested', 'suggest'),
 ('housing', 'hous'),
 ('slowing', 'slow'),
 ('increases', 'increas'),
 ('economists', 'economist'),
 ('prices', 'price'),
 ('during', 'dure'),
 ('monthly', 'month'),
 ('figures', 'figur'),
 ('cooling', 'cool'),
 ('house', 'hous'),
 ('inflation', 'inflat'),
 ('November', 'novemb'),
 ('Economists', 'economist'),
 ('however', 'howev'),
 ('figures', 'figur'),
 ('likely', 'like'),
 ('weakening', 'weaken'),
 ('house', 'hous'),
 ('coming', 'come'),
 ('months', 'month'),
 ('Overall', 'overal'),
 ('housing', 'hous'),
 ('activity', 'activ'),
 ('slowing', 'slow'),
 ('backed', 'back'),
 ('mortgage', 'mortgag'),
 ('lending', 'lend'),
 ('mortgage', 'mortgag'),
 ('approvals', 'ap

In [25]:
# Lemmatization
import nltk
# nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\19498\AppData\Roaming\nltk_data...


True

In [27]:
[(word, word.lemmatize()) for i, word in enumerate(parsed_body.words) if word != parsed_body.words[i].lemmatize()]

[('prices', 'price'),
 ('has', 'ha'),
 ('has', 'ha'),
 ('increases', 'increase'),
 ('economists', 'economist'),
 ('prices', 'price'),
 ('figures', 'figure'),
 ('figures', 'figure'),
 ('months', 'month'),
 ('approvals', 'approval'),
 ('figures', 'figure'),
 ('mortgages', 'mortgage'),
 ('has', 'ha'),
 ('prices', 'price'),
 ('prices', 'price'),
 ('prices', 'price'),
 ('as', 'a'),
 ('gains', 'gain'),
 ('years', 'year'),
 ('prices', 'price'),
 ('houses', 'house'),
 ('flats', 'flat'),
 ('prices', 'price'),
 ('was', 'wa'),
 ('areas', 'area'),
 ('was', 'wa')]

In [28]:
[(word, word.lemmatize(pos='v')) for i, word in enumerate(parsed_body.words)
 if word != parsed_body.words[i].lemmatize(pos='v')]

[('prices', 'price'),
 ('dipped', 'dip'),
 ('has', 'have'),
 ('said', 'say'),
 ('has', 'have'),
 ('suggested', 'suggest'),
 ('housing', 'house'),
 ('is', 'be'),
 ('slowing', 'slow'),
 ('increases', 'increase'),
 ('prices', 'price'),
 ('figures', 'figure'),
 ('cooling', 'cool'),
 ('is', 'be'),
 ('figures', 'figure'),
 ('are', 'be'),
 ('weakening', 'weaken'),
 ('coming', 'come'),
 ('housing', 'house'),
 ('is', 'be'),
 ('slowing', 'slow'),
 ('is', 'be'),
 ('backed', 'back'),
 ('lending', 'lend'),
 ('said', 'say'),
 ('is', 'be'),
 ('lagging', 'lag'),
 ('figures', 'figure'),
 ('said', 'say'),
 ('mortgages', 'mortgage'),
 ('approved', 'approve'),
 ('has', 'have'),
 ('fallen', 'fall'),
 ('said', 'say'),
 ('prices', 'price'),
 ('increased', 'increase'),
 ('said', 'say'),
 ('prices', 'price'),
 ('rose', 'rise'),
 ('is', 'be'),
 ('predicting', 'predict'),
 ('prices', 'price'),
 ('stabilises', 'stabilise'),
 ('gains', 'gain'),
 ('attributed', 'attribute'),
 ('prices', 'price'),
 ('detached', 'det

In [31]:
# Sentiment & Polarity
parsed_body.sentiment_assessments

Sentiment(polarity=0.10447845804988663, subjectivity=0.44258786848072557, assessments=[(['slightly'], -0.16666666666666666, 0.16666666666666666, None), (['average'], -0.15, 0.39999999999999997, None), (['recent'], 0.0, 0.25, None), (['strong'], 0.4333333333333333, 0.7333333333333333, None), (['likely'], 0.0, 1.0, None), (['overall'], 0.0, 0.0, None), (['down'], -0.15555555555555559, 0.2888888888888889, None), (['fairly'], 0.7, 0.9, None), (['nearly'], 0.1, 0.4, None), (['last'], 0.0, 0.06666666666666667, None), (['first'], 0.25, 0.3333333333333333, None), (['rose'], 0.6, 0.95, None), (['whole'], 0.2, 0.4, None), (['only'], 0.0, 1.0, None), (['second'], 0.0, 0.0, None), (['half'], -0.16666666666666666, 0.16666666666666666, None), (['overall'], 0.0, 0.0, None), (['large'], 0.21428571428571427, 0.42857142857142855, None), (['recent'], 0.0, 0.25, None), (['rose'], 0.6, 0.95, None), (['same'], 0.0, 0.125, None), (['average'], -0.15, 0.39999999999999997, None), (['more'], 0.5, 0.5, None), ([