In [1]:
% matplotlib inline
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# spacy, textblob and nltk for language processing
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer

# sklearn for feature extraction & modeling
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB         # Naive Bayes
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.externals import joblib

In [3]:
# Show floats with a thousands separator and two decimals in pandas output.
# The fully qualified option name is used because abbreviated names are
# resolved by pattern matching and can become ambiguous in other pandas versions.
pd.set_option('display.float_format', '{:,.2f}'.format)

## Load BBC Data

In [4]:
def read_article(file):
    """Parse one article file into a ``[topic, heading, body]`` record.

    Parameters
    ----------
    file : pathlib.Path
        Path to a text file whose parent directory is named after its topic.

    Returns
    -------
    list of str
        ``[topic, heading, body]`` where ``topic`` is the parent directory
        name, ``heading`` is the stripped first line, and ``body`` is all
        remaining lines, each stripped and joined with single spaces.
    """
    lines = file.read_text(encoding='latin1').split('\n')
    heading = lines[0].strip()
    body = ' '.join(line.strip() for line in lines[1:]).strip()
    return [file.parts[-2], heading, body]


path = Path('..', 'data', 'bbc')
# Sort the matches so the row order does not depend on the order in which the
# filesystem happens to enumerate the files.
files = sorted(path.glob('**/*.txt'))
doc_list = [read_article(file) for file in files]

In [5]:
# Assemble the collected [topic, heading, body] records into a DataFrame and
# print a summary of its columns.
column_names = ['topic', 'heading', 'body']
docs = pd.DataFrame(data=doc_list, columns=column_names)
docs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 3 columns):
topic      2225 non-null object
heading    2225 non-null object
body       2225 non-null object
dtypes: object(3)
memory usage: 52.2+ KB


## Introduction to TextBlob

You should already have downloaded TextBlob, a Python library used to explore common NLP tasks.

### Select random article

In [15]:
# Draw a single article as a Series. The fixed seed keeps the selection, and
# therefore every cell that uses `article`, reproducible across kernel
# restarts; change the seed to explore a different article.
article = docs.sample(n=1, random_state=42).squeeze()

In [16]:
# Show the sampled article: capitalized topic, then the heading, then the body.
header = f'Topic:\t{article.topic.capitalize()}\n\n{article.heading}\n'
print(header)
body_text = article.body.strip()
print(body_text)

Topic:	Entertainment

Lee to create new film superhero

Comic book veteran Stan Lee is to team up with producer Robert Evans to create a movie featuring a new superhero.  Foreverman will focus on a character who has to face problems in everyday life as well as using his special powers to save the world. Paramount Pictures, the studio behind the film, have revealed few details about the project but say it has the potential to spawn a series of films. Lee is best known for his work on Spider-Man and The Incredible Hulk.  He is collaborating on the script with screenwriter Peter Briggs, who penned the recent comic book adaptation Hellboy. "We believe it to be truly a whole new franchise," said Gill Champion, president and chief executive of Lee's POW! Entertainment. "In this world where people are looking for something different, Stan's idea was to create a concept not seen before to become an evergreen franchise for Paramount." Many of Lee's other creations, including X-Men and Daredevil

In [19]:
# Wrap the sampled article's body text in a TextBlob; the cells below read
# its tokens and sentences from this object.
parsed_body = TextBlob(article.body)

### Tokenization

In [20]:
# Word-level tokens of the article body, displayed as a WordList.
parsed_body.words

WordList(['Comic', 'book', 'veteran', 'Stan', 'Lee', 'is', 'to', 'team', 'up', 'with', 'producer', 'Robert', 'Evans', 'to', 'create', 'a', 'movie', 'featuring', 'a', 'new', 'superhero', 'Foreverman', 'will', 'focus', 'on', 'a', 'character', 'who', 'has', 'to', 'face', 'problems', 'in', 'everyday', 'life', 'as', 'well', 'as', 'using', 'his', 'special', 'powers', 'to', 'save', 'the', 'world', 'Paramount', 'Pictures', 'the', 'studio', 'behind', 'the', 'film', 'have', 'revealed', 'few', 'details', 'about', 'the', 'project', 'but', 'say', 'it', 'has', 'the', 'potential', 'to', 'spawn', 'a', 'series', 'of', 'films', 'Lee', 'is', 'best', 'known', 'for', 'his', 'work', 'on', 'Spider-Man', 'and', 'The', 'Incredible', 'Hulk', 'He', 'is', 'collaborating', 'on', 'the', 'script', 'with', 'screenwriter', 'Peter', 'Briggs', 'who', 'penned', 'the', 'recent', 'comic', 'book', 'adaptation', 'Hellboy', 'We', 'believe', 'it', 'to', 'be', 'truly', 'a', 'whole', 'new', 'franchise', 'said', 'Gill', 'Champion

### Sentence boundary detection

In [21]:
# Sentence-level segmentation of the article body (a list of Sentence objects).
# NOTE(review): punctuation inside a name (e.g. a name ending in '!') can be
# taken for a sentence boundary, so inspect the splits before relying on them.
parsed_body.sentences

[Sentence("Comic book veteran Stan Lee is to team up with producer Robert Evans to create a movie featuring a new superhero."),
 Sentence("Foreverman will focus on a character who has to face problems in everyday life as well as using his special powers to save the world."),
 Sentence("Paramount Pictures, the studio behind the film, have revealed few details about the project but say it has the potential to spawn a series of films."),
 Sentence("Lee is best known for his work on Spider-Man and The Incredible Hulk."),
 Sentence("He is collaborating on the script with screenwriter Peter Briggs, who penned the recent comic book adaptation Hellboy."),
 Sentence(""We believe it to be truly a whole new franchise," said Gill Champion, president and chief executive of Lee's POW!"),
 Sentence("Entertainment."),
 Sentence(""In this world where people are looking for something different, Stan's idea was to create a concept not seen before to become an evergreen franchise for Paramount.""),
 Sente

### Stemming

In [36]:
# Initialize the English Snowball stemmer.
stemmer = SnowballStemmer('english')

# Pair each token with its stem (computed once per token) and keep only the
# tokens whose lowercased form differs from that stem.
[(word, stem)
 for word, stem in ((token, stemmer.stem(token)) for token in parsed_body.words)
 if word.lower() != stem]

[('producer', 'produc'),
 ('Evans', 'evan'),
 ('create', 'creat'),
 ('movie', 'movi'),
 ('featuring', 'featur'),
 ('character', 'charact'),
 ('problems', 'problem'),
 ('using', 'use'),
 ('powers', 'power'),
 ('Pictures', 'pictur'),
 ('revealed', 'reveal'),
 ('details', 'detail'),
 ('potential', 'potenti'),
 ('series', 'seri'),
 ('films', 'film'),
 ('Incredible', 'incred'),
 ('collaborating', 'collabor'),
 ('screenwriter', 'screenwrit'),
 ('Briggs', 'brigg'),
 ('penned', 'pen'),
 ('adaptation', 'adapt'),
 ('believe', 'believ'),
 ('truly', 'truli'),
 ('franchise', 'franchis'),
 ('president', 'presid'),
 ('executive', 'execut'),
 ('Entertainment', 'entertain'),
 ('people', 'peopl'),
 ('looking', 'look'),
 ('something', 'someth'),
 ('different', 'differ'),
 ('create', 'creat'),
 ('before', 'befor'),
 ('become', 'becom'),
 ('franchise', 'franchis'),
 ('Many', 'mani'),
 ('creations', 'creation'),
 ('including', 'includ'),
 ('turned', 'turn'),
 ('films', 'film'),
 ('years', 'year'),
 ('Howeve

### Lemmatization

In [32]:
# Pair each token with its default lemma (computed once per token) and keep
# only the tokens that the lemmatizer actually changes.
[(word, lemma)
 for word, lemma in ((token, token.lemmatize()) for token in parsed_body.words)
 if word != lemma]

[('has', 'ha'),
 ('problems', 'problem'),
 ('as', 'a'),
 ('as', 'a'),
 ('powers', 'power'),
 ('details', 'detail'),
 ('has', 'ha'),
 ('films', 'film'),
 ('was', 'wa'),
 ('creations', 'creation'),
 ('films', 'film'),
 ('years', 'year'),
 ('has', 'ha'),
 ('its', 'it'),
 ('cinemas', 'cinema')]

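Lemmatization relies on part-of-speech (POS) tagging. Called without a POS argument, as above, `lemmatize()` does not know that 'has' or 'was' are verbs, which is why they come out as 'ha' and 'wa'. `spaCy` performs POS tagging itself; here we instead make an assumption, e.g. that each token is a verb.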

In [34]:
# Pair each token with its lemma under the assumption that it is a verb
# (computed once per token) and keep only the tokens that change.
[(word, lemma)
 for word, lemma in ((token, token.lemmatize(pos='v')) for token in parsed_body.words)
 if word != lemma]

[('is', 'be'),
 ('featuring', 'feature'),
 ('has', 'have'),
 ('using', 'use'),
 ('powers', 'power'),
 ('revealed', 'reveal'),
 ('details', 'detail'),
 ('has', 'have'),
 ('films', 'film'),
 ('is', 'be'),
 ('known', 'know'),
 ('is', 'be'),
 ('collaborating', 'collaborate'),
 ('penned', 'pen'),
 ('said', 'say'),
 ('are', 'be'),
 ('looking', 'look'),
 ('was', 'be'),
 ('seen', 'see'),
 ('including', 'include'),
 ('been', 'be'),
 ('turned', 'turn'),
 ('films', 'film'),
 ('has', 'have'),
 ('been', 'be'),
 ('taking', 'take'),
 ('is', 'be'),
 ('scheduled', 'schedule'),
 ('released', 'release')]

### Combine TextBlob Lemmatization with `CountVectorizer`

In [39]:
def lemmatizer(text):
    """Lowercase ``text``, tokenize it with TextBlob, and return the lemma of each token as a list."""
    tokens = TextBlob(text.lower()).words
    lemmas = []
    for token in tokens:
        lemmas.append(token.lemmatize())
    return lemmas

In [40]:
# Count-based vectorizer that hands each document to the custom `lemmatizer`
# callable defined above for tokenization and lemmatization.
# NOTE(review): confirm whether decode_error has any effect when the analyzer
# is a callable and the input is already str.
vectorizer = CountVectorizer(analyzer=lemmatizer, decode_error='replace')