In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from tqdm import tqdm

from collections import Counter
import re
import spacy
nlp = spacy.load('en_core_web_md')

In [2]:
# fix bug: https://github.com/explosion/spaCy/issues/922
nlp.vocab.add_flag(lambda s: s.lower() in spacy.en.word_sets.STOP_WORDS, spacy.attrs.IS_STOP);

# Load data

In [3]:
dataset = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

docs = dataset['data']
targets = dataset['target']
target_names = dataset['target_names']

# number of documents
len(docs)

18846

# Show some documents

In [4]:
print(docs[0])



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!




In [5]:
print(docs[1])

My brother is in the market for a high-performance video card that supports
VESA local bus with 1-2MB RAM.  Does anyone have suggestions/ideas on:

  - Diamond Stealth Pro Local Bus

  - Orchid Farenheit 1280

  - ATI Graphics Ultra Pro

  - Any other high-performance VLB card


Please post or email.  Thank you!

  - Matt



In [6]:
print(target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [7]:
print(targets)

[10  3 17 ...,  3  1  7]


# Clean documents

In [8]:
spaces = re.compile(r' +')
nonletters = re.compile(r'[^a-z A-Z]+')
shortwords = re.compile(r'\b\w\b')

def clean(doc):
    text = doc
    text = re.sub(nonletters, ' ', text)
    text = re.sub(shortwords, ' ', text)
    text = re.sub(spaces, ' ', text).strip()
    text = nlp(text, tag=True, parse=False, entity=False)
    return [t.lemma_ for t in text if not t.is_stop]

In [9]:
tokens = []
preprocessed_docs = []

for doc in tqdm(docs):
    words = clean(doc)
    preprocessed_docs += [words]
    tokens += words

term_counts = Counter(tokens)

# number of unique tokens
len(term_counts)

100%|██████████| 18846/18846 [00:34<00:00, 547.69it/s]


85608

# Remove rare words

In [10]:
non_rare = [x for x in term_counts if term_counts[x] >= 10]
len(non_rare)

13812

In [11]:
temp = []
for doc in tqdm(preprocessed_docs):
    temp.append([w for w in doc if w in non_rare])

100%|██████████| 18846/18846 [02:49<00:00, 111.23it/s]


In [12]:
# only keep documents with more than 10 words
to_keep = [i for i, doc in enumerate(temp) if len(doc) > 10]
len(to_keep)

16116

In [13]:
preprocessed_docs = [temp[i] for i in to_keep]
targets = targets[to_keep]

# Show some preprocessed documents

In [14]:
print(preprocessed_docs[0])

['sure', 'pens', 'fan', 'pretty', 'confused', 'lack', 'kind', 'post', 'recent', 'pens', 'massacre', 'devil', 'actually', 'bit', 'bit', 'go', 'end', 'non', 'relief', 'bit', 'praise', 'pens', 'man', 'kill', 'devil', 'bad', 'think', 'jagr', 'show', 'good', 'regular', 'season', 'stat', 'lot', 'fo', 'fun', 'watch', 'playoff', 'bowman', 'let', 'jagr', 'lot', 'fun', 'couple', 'game', 'pens', 'go', 'beat', 'jersey', 'disappointed', 'islanders', 'lose', 'final', 'regular', 'season', 'game', 'pens', 'rule']


In [15]:
print(preprocessed_docs[1])

['brother', 'market', 'high', 'performance', 'video', 'card', 'support', 'vesa', 'local', 'bus', 'mb', 'ram', 'suggestion', 'idea', 'diamond', 'stealth', 'pro', 'local', 'bus', 'orchid', 'ati', 'graphics', 'ultra', 'pro', 'high', 'performance', 'vlb', 'card', 'post', 'email', 'thank', 'matt']


# Save

In [16]:
docs_as_strings = [' '.join(doc) for doc in preprocessed_docs]

In [17]:
D = pd.DataFrame({'docs': docs_as_strings, 'targets': targets})

In [18]:
# convert targets to human readable format
target_dict = {i: n for i, n in enumerate(target_names)}
D['targets'] = D['targets'].apply(lambda x: target_dict[x])

In [19]:
D.to_hdf('data.hdf', 'data', mode='w')