In [44]:
# --> reuters.readme()
#
# This is a publicly available version of the well-known Reuters-21578
# "ApteMod" corpus for text categorization. 
# ApteMod is a collection of 10,788 documents from the Reuters financial
# newswire service, partitioned into a training set with 7769 documents
# and a test set with 3019 documents. 
# 
# The distribution of categories in the ApteMod corpus is highly skewed,
# with 36.7% of the documents in the most common category, and only
# 0.0185% (2 documents) in each of the five least common categories.
# In fact, the original data source is even more skewed---in creating
# the corpus, any categories that did not contain at least one document
# in the training set and one document in the test set were removed from
# the corpus by its original creator.
# 
# In the ApteMod corpus, each document belongs to one or more
# categories.  There are 90 categories in the corpus.  The average
# number of categories per document is 1.235, and the average number of
# documents per category is about 148, or 1.37% of the corpus.

In [45]:
import random
import nltk

from nltk.corpus import reuters
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
import re
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook relies on: the Reuters corpus, the
# tokenizer data used by word_tokenize, and the English stop-word list.
nltk.download('reuters')
nltk.download('punkt')
# NOTE(review): recent NLTK releases (3.9+) make word_tokenize load the
# 'punkt_tab' resource instead of the pickled 'punkt' model; without it the
# tokenization cell fails with LookupError on a fresh environment.
nltk.download('punkt_tab')
# Kept last so the cell still ends with the same expression as before.
nltk.download('stopwords')

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\gogu\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gogu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gogu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [46]:
def count_data(data, training=0, test=0):
    """Print how many ids in ``data`` belong to the test and training sets.

    An id counts as a test item when the text before its first '/' is exactly
    'test'; every other id counts as a training item.

    Args:
        data: iterable of string ids such as 'test/14826' or 'training/1'.
        training: starting value added to the training count.
        test: starting value added to the test count.

    Returns:
        None. The totals are printed, not returned.
    """
    prefixes = [name.split('/')[0] for name in data]
    test_hits = sum(1 for prefix in prefixes if prefix == 'test')
    test += test_hits
    # Anything that is not a test id is tallied as training.
    training += len(prefixes) - test_hits
    print('Training set = {}, Test set = {}'.format(training, test))

In [47]:
# confirm dataset size: tally every corpus file id by its path prefix
# ('test' vs. anything else) and print both totals

count_data(reuters.fileids())

Training set = 7769, Test set = 3019


In [48]:
# confirm number of categories

categories = reuters.categories()
print('Categories = {}'.format(categories))
print('----------------------------------')
# random.choice only ever returns an existing element. The previous
# random.randint(0, len(categories)) is inclusive at both ends, so it could
# produce len(categories) and raise IndexError.
print('Random category = {}'.format(random.choice(categories)))
print('----------------------------------')
print('Number of categories = {}'.format(len(categories)))

Categories = ['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']
----------------------------------
Random category = dfl
--------------------

In [49]:
# select random category,
# show documents for that category,
# show words in the first document from the set of documents for that category

# random.choice draws uniformly from every category. The previous
# random.randint(1, len(categories)) could never pick the first category and
# could return len(categories), which raises IndexError.
rnd_category = random.choice(categories)
category_doc = reuters.fileids(rnd_category)

print('Category: {}'.format(rnd_category))
print('----------------------------------')
print('Documents containing category = {}: {}'.format(rnd_category, category_doc))
print('----------------------------------')
print('Words related to category {}: {}'.format(rnd_category, reuters.words(category_doc[0])))

Category: strategic-metal
----------------------------------
Documents containing category = strategic-metal: ['test/15420', 'test/15838', 'test/15872', 'test/17480', 'test/17486', 'test/17783', 'test/17805', 'test/18348', 'test/18466', 'test/18872', 'test/18944', 'training/10151', 'training/11999', 'training/12007', 'training/13052', 'training/13251', 'training/14719', 'training/2186', 'training/2936', 'training/2942', 'training/3010', 'training/309', 'training/346', 'training/3460', 'training/3497', 'training/5693', 'training/7775']
----------------------------------
Words related to category strategic-metal: ['JOHNSON', 'MATTHEY', "'", 'S', 'PLATINUM', 'GROUP', ...]


In [50]:
# split documents into training and test file ids by their id prefix

train_doc = list(filter(lambda file_id: file_id.startswith('train'), reuters.fileids()))
test_doc = list(filter(lambda file_id: file_id.startswith('test'), reuters.fileids()))
print(len(train_doc), len(test_doc))

7769 3019


In [51]:
# text pre-processing
# tokenization, stemming, stop-words clean up

stop_words = stopwords.words('english')

def token(doc, min_length=2):
    """Tokenize ``doc`` into lowercased, stop-word-free Porter stems.

    Steps, in order: split with ``word_tokenize``, lowercase, drop words found
    in the module-level ``stop_words``, stem with ``PorterStemmer``, then keep
    only stems that begin with an ASCII letter and have at least
    ``min_length`` characters.

    Args:
        doc: text to tokenize.
        min_length: minimum number of characters a stem must have to be kept
            (default 2, the value previously hard-coded).

    Returns:
        list of str: the filtered stems, in document order.
    """
    # Built once per call instead of once per token.
    stemmer = PorterStemmer()
    # Set lookup avoids scanning the stop-word list for every word; rebuilt
    # per call so later changes to ``stop_words`` are still honoured.
    stop_set = set(stop_words)
    # ``match`` anchors at the start only, so a stem passes if it *begins*
    # with letters (e.g. 'u.s.' is kept, '1,230' is dropped).
    starts_with_letters = re.compile('[a-zA-Z]+')

    lowered = (word.lower() for word in word_tokenize(doc))
    stems = (stemmer.stem(word) for word in lowered if word not in stop_set)
    return [stem for stem in stems
            if starts_with_letters.match(stem) and len(stem) >= min_length]

In [52]:
# show raw text

# Read the raw text for the selected category's file ids once, then reuse the
# same string for display here and for tokenization in the next cell.
text = reuters.raw(category_doc)
print(text)

JOHNSON MATTHEY'S PLATINUM GROUP PRICES
  Johnson Matthey today issued the
  following Platinum group base prices (unfabricated), all U.S.
  Dlrs per troy ounce.
   Previous prices in parentheses.
   PLATINUM  -   562   (567)
   PALLADIUM -   130   (130)
   IRIDIUM   -   400   (400)
   RHODIUM     1,230 (1,230)
   RUTHENIUM -    80    (80)
  

HOUSE PANEL WANTS PENTAGON MANAGE U.S. STOCKPILE
  The House Armed Services Committee
  has voted for a transfer in the management of stockpiled
  materials for national defense to the U.S. Defense Secretary.
      The committee also voted for legally-binding quantity and
  quality requirements on the materials, mostly metals.
      The measures are a part of the Defense Authorization Bill
  which will be voted on in the House next month.
      The purpose of the measures, passed by the committee
  yesterday, is to improve stockpile management and discourage
  sell-offs of materials that could jeopardize strategic needs, a
  staff member of the c

In [53]:
# run the pre-processing function on the raw text and print the resulting list of stems
print(token(text))

['johnson', 'matthey', 'platinum', 'group', 'price', 'johnson', 'matthey', 'today', 'issu', 'follow', 'platinum', 'group', 'base', 'price', 'unfabr', 'u.s.', 'dlr', 'per', 'troy', 'ounc', 'previou', 'price', 'parenthes', 'platinum', 'palladium', 'iridium', 'rhodium', 'ruthenium', 'hous', 'panel', 'want', 'pentagon', 'manag', 'u.', 'stockpil', 'hous', 'arm', 'servic', 'committe', 'vote', 'transfer', 'manag', 'stockpil', 'materi', 'nation', 'defens', 'u.s.', 'defens', 'secretari', 'committe', 'also', 'vote', 'legally-bind', 'quantiti', 'qualiti', 'requir', 'materi', 'mostli', 'metal', 'measur', 'part', 'defens', 'author', 'bill', 'vote', 'hous', 'next', 'month', 'purpos', 'measur', 'pass', 'committe', 'yesterday', 'improv', 'stockpil', 'manag', 'discourag', 'sell-off', 'materi', 'could', 'jeopard', 'strateg', 'need', 'staff', 'member', 'committe', 'told', 'reuter', 'stockpil', 'stand', 'gain', 'manag', 'properli', 'said', 'staffer', 'manag', 'stockpil', 'last', 'year', 'atroci', 'respons