In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from pathlib import Path
from jupyterthemes import jtplot

For instructions on installing spaCy and downloading its language models, see: https://spacy.io/usage/models

In [None]:
% matplotlib inline
pd.options.display.float_format = '{:,.2f}'.format
jtplot.style(theme='onedork', context='talk', fscale=1.4, spines=False, gridlines='--', ticks=True, grid=False, figsize=(14, 8))

### Load BBC data

In [None]:
# Read every `.txt` file below `bbc/` into [topic, heading, body] records.
path = Path('bbc')
# Sort the paths so the document order (and every downstream row index) is
# reproducible; plain glob order depends on the filesystem.
files = sorted(path.glob('**/*.txt'))
doc_list = []
for file in files:
    # Only the raw lines are needed, so the file is closed before parsing.
    with file.open(encoding='latin1') as f:
        lines = f.readlines()
    if not lines:
        # An empty file has no heading line to extract; skip it.
        continue
    # The name of the containing folder is used as the topic label.
    topic = file.parent.name
    heading = lines[0].strip()
    body = ' '.join(line.strip() for line in lines[1:])
    doc_list.append([topic, heading, body])

### Convert to DataFrame

In [None]:
# Assemble the parsed records into a frame and attach the number of
# whitespace-separated tokens per article.
docs = (
    pd.DataFrame(data=doc_list, columns=['topic', 'heading', 'article'])
    .assign(**{'word count': lambda frame: frame['article'].str.split().str.len()})
)
docs.info()

### Inspect results

In [None]:
# Fixed seed so the displayed rows are the same on every run; the sample size
# is capped so a corpus with fewer than 10 documents does not raise.
docs.sample(n=min(10, len(docs)), random_state=42)

In [None]:
# total number of whitespace-separated tokens across all articles
docs['article'].str.split().str.len().sum()

### Data drawn from 5 different categories

In [None]:
# share of documents per topic (fractions rather than raw counts)
docs['topic'].value_counts(normalize=True)

### Prepare Document-Term Matrix

In [None]:
# The docstring lives on the class, so no instance is needed to print it.
print(CountVectorizer.__doc__)

### Baseline document-term matrix

In [None]:
# Baseline: vectorize with default settings to see the number of unique tokens
vectorizer = CountVectorizer()
doc_term_matrix = vectorizer.fit_transform(docs['article'])
doc_term_matrix

In [None]:
# (number of documents, number of unique tokens)
n_docs, n_tokens = doc_term_matrix.shape
(n_docs, n_tokens)

### Inspect tokens

In [None]:
# The fitted vectorizer keeps the vocabulary; these names are used below as
# the column labels of the document-term matrix.
# `get_feature_names()` was removed from recent scikit-learn releases in
# favour of `get_feature_names_out()`; fall back to the old name on older
# installations. `list(...)` keeps `words` a plain list in both cases.
words = list(
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
words[:10]

### Inspect doc-term matrix

In [None]:
# from scipy compressed sparse row matrix to a DataFrame with sparse columns
# (`pd.SparseDataFrame` was removed in pandas 1.0; the `.sparse` accessor
# constructor is its replacement)
doc_term_matrix_df = pd.DataFrame.sparse.from_spmatrix(doc_term_matrix, columns=words)
doc_term_matrix_df.head()

### Most frequent words

In [None]:
# column sums = total number of occurrences of each token in the corpus
word_freq = doc_term_matrix_df.sum(axis=0).astype(int)
# five most frequent tokens
word_freq.sort_values(ascending=False).iloc[:5]

### Get relative word frequency (share of documents containing each word)

In [None]:
# binary=True records presence/absence of a token per document instead of counts
vectorizer = CountVectorizer(binary=True)
doc_term_matrix = vectorizer.fit_transform(docs['article'])
doc_term_matrix.shape

In [None]:
# `get_feature_names()` was removed from recent scikit-learn releases in
# favour of `get_feature_names_out()`; fall back to the old name on older
# installations. `list(...)` keeps `words` a plain list in both cases.
words = list(
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)

# Summing a sparse matrix over axis 0 yields a 1 x n_tokens matrix; flatten it
# to a 1-d array so it can back a Series.
word_freq = doc_term_matrix.sum(axis=0)
word_freq_1d = np.squeeze(np.asarray(word_freq))
# Dividing by the number of documents gives the share of documents that
# contain each token (the matrix is binary at this point).
pd.Series(word_freq_1d, index=words).div(
    docs.shape[0]).sort_values(ascending=False).head(10)

### Visualize Doc-Term Matrix

In [None]:
# NOTE: the matrix is densified for plotting, which can take a lot of memory.
sns.heatmap(
    data=pd.DataFrame(data=doc_term_matrix.todense(), columns=words),
    cmap='Blues',
);

### Using thresholds to reduce the number of tokens 

In [None]:
# Drop English stop words, tokens found in more than 20% of the documents,
# and tokens found in fewer than 3 documents.
vectorizer = CountVectorizer(
    stop_words='english',
    min_df=3,
    max_df=.2,
)
doc_term_matrix = vectorizer.fit_transform(docs['article'])
doc_term_matrix.shape

### Use CountVectorizer with Lemmatization

#### Building a custom `tokenizer` for Lemmatization with `spacy`

In [None]:
# run the below if you have not yet installed Spacy's English language model (shell command)
# https://spacy.io/usage/models
!python -m spacy download en

In [None]:
nlp = spacy.load('en')
def tokenizer(doc):
    return [w.lemma_ for w in nlp(doc) 
                if not w.is_punct | w.is_space]

In [None]:
# Plug the spaCy-based lemmatizing tokenizer into the vectorizer; binary=True
# again records presence/absence per document rather than counts.
vectorizer = CountVectorizer(
    binary=True,
    tokenizer=tokenizer,
)
doc_term_matrix = vectorizer.fit_transform(docs['article'])
doc_term_matrix.shape

In [None]:
# `get_feature_names()` was removed from recent scikit-learn releases in
# favour of `get_feature_names_out()`; fall back to the old name on older
# installations. `list(...)` keeps the result a plain list in both cases.
lemmatized_words = list(
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
# Column sums of the binary matrix = number of documents containing each lemma.
word_freq = doc_term_matrix.sum(axis=0)
word_freq_1d = np.squeeze(np.asarray(word_freq))
# Normalize by the number of documents to get the share of documents per lemma.
word_freq_1d = pd.Series(word_freq_1d, index=lemmatized_words).div(docs.shape[0])
# Ascending sort, so the most widespread lemmas are at the bottom.
word_freq_1d.sort_values().tail(20)

### TfIdf Vectorizer 

In [None]:
# The docstring lives on the class, so no instance is needed to print it.
print(TfidfTransformer.__doc__)

In [None]:
# Same tokenization as the default CountVectorizer, but with tf-idf weights
# instead of raw counts.
tfidf = TfidfVectorizer()
doc_term_matrix = tfidf.fit_transform(docs['article'])
doc_term_matrix.shape

In [None]:
# `get_feature_names()` was removed from recent scikit-learn releases in
# favour of `get_feature_names_out()`; fall back to the old name on older
# installations. `list(...)` keeps `words` a plain list in both cases.
words = list(
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)
# Column sums = total tf-idf weight of each token across all documents.
word_freq = doc_term_matrix.sum(axis=0)
word_freq_1d = np.squeeze(np.asarray(word_freq))
word_freq_1d = pd.Series(word_freq_1d, index=words)
# Ascending sort, so the tokens with the largest total weight are at the bottom.
word_freq_1d.sort_values().tail(20)