In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from tqdm import tqdm

import matplotlib.pyplot as plt
%matplotlib inline

from collections import Counter
import re
import spacy
nlp = spacy.load('en')

# Load data

In [2]:
dataset = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

docs = dataset['data']
targets = dataset['target']
target_names = dataset['target_names']

# number of documents
len(docs)

18846

# Show some documents

In [4]:
print(target_names)print(docs[0])



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!




In [5]:
print(docs[1])

My brother is in the market for a high-performance video card that supports
VESA local bus with 1-2MB RAM.  Does anyone have suggestions/ideas on:

  - Diamond Stealth Pro Local Bus

  - Orchid Farenheit 1280

  - ATI Graphics Ultra Pro

  - Any other high-performance VLB card


Please post or email.  Thank you!

  - Matt



In [13]:
print(target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [6]:
print(targets)

[10  3 17 ...,  3  1  7]


# Clean documents

In [7]:
spaces = re.compile(r' +')
nonletters = re.compile(r'[^a-z A-Z]+')
shortwords = re.compile(r'\b\w\b')

def clean(doc):
    text = doc
    text = re.sub(nonletters, ' ', text)
    text = re.sub(shortwords, ' ', text)
    text = re.sub(spaces, ' ', text).strip()
    text = nlp(text, tag=True, parse=False, entity=False)
    return [t.lemma_ for t in text if not t.is_stop]

In [8]:
tokens = []
preprocessed_docs = []

for doc in tqdm(docs):
    words = clean(doc)
    preprocessed_docs += [words]
    tokens += words

term_counts = Counter(tokens)

# number of unique tokens
len(term_counts)

100%|██████████| 18846/18846 [00:34<00:00, 552.77it/s]


85734

# Remove rare words

In [10]:
non_rare = [x for x in term_counts if term_counts[x] >= 10]
len(non_rare)

13831

In [14]:
temp = []
for doc in tqdm(preprocessed_docs):
    temp.append([w for w in doc if w in non_rare])

100%|██████████| 18846/18846 [02:56<00:00, 106.66it/s]


In [15]:
# only keep documents with more than 10 words
to_keep = [i for i, doc in enumerate(temp) if len(doc) > 10]
len(to_keep)

16121

In [16]:
preprocessed_docs = [temp[i] for i in to_keep]
targets = targets[to_keep]

# Save

In [19]:
docs_as_strings = [' '.join(doc) for doc in preprocessed_docs]

In [20]:
D = pd.DataFrame({'docs': docs_as_strings, 'targets': targets})

In [21]:
# convert targets to human readable format
target_dict = {i: n for i, n in enumerate(target_names)}
D['targets'] = D['targets'].apply(lambda x: target_dict[x])

In [24]:
D.to_hdf('data.hdf', 'data', mode='w')