## Lets play with Reuters collection in NLTK



In [2]:
from nltk.corpus import reuters

# List of document ids
documents = reuters.fileids()
print("Documents: {}".format(len(documents)))

# Train documents
train_docs_id = list(filter(lambda doc: doc.startswith("train"), documents))
print("Total train documents: {}".format(len(train_docs_id)))

# Test documents
test_docs_id = list(filter(lambda doc: doc.startswith("test"), documents))
print("Total test documents: {}".format(len(test_docs_id)))

Documents: 10788
Total train documents: 7769
Total test documents: 3019


In [6]:
# Let's get a document with multiple labels
doc = 'training/9865'
print(reuters.raw(doc))

FRENCH FREE MARKET CEREAL EXPORT BIDS DETAILED
  French operators have requested licences
  to export 675,500 tonnes of maize, 245,000 tonnes of barley,
  22,000 tonnes of soft bread wheat and 20,000 tonnes of feed
  wheat at today's European Community tender, traders said.
      Rebates requested ranged from 127.75 to 132.50 European
  Currency Units a tonne for maize, 136.00 to 141.00 Ecus a tonne
  for barley and 134.25 to 141.81 Ecus for bread wheat, while
  rebates requested for feed wheat were 137.65 Ecus, they said.
  




In [7]:
print(reuters.categories(doc))

['barley', 'corn', 'grain', 'wheat']


In [10]:
from operator import itemgetter
from pprint import pprint

# List categories
categories = reuters.categories()
print("Number of categories: ", len(categories))

Number of categories:  90


In [15]:
# Document per category
category_dist = [(category, len(reuters.fileids(category))) for category in categories]
category_dist = sorted(category_dist, key=itemgetter(1), reverse=True)

print("Most common categories: ")
pprint(category_dist[-5:])

Most common categories: 
[('castor-oil', 2),
 ('groundnut-oil', 2),
 ('lin-oil', 2),
 ('rye', 2),
 ('sun-meal', 2)]


In [None]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

stop_words = stopwords.words("english")

train_docs_id = list(filter(lambda doc: doc.startswith("train"), documents))
test_doc_id = list(filter(lambda doc: doc.startswith("test"), documents))

train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]

# Tokenize
vectorizer = TfidfVectorizer(stop_words = stop_words)

# Learn and transform train documents
vectorized_train_docs = vectorizer.fit_transform(train_docs)
vectorized_test_docs = vectorizer.transform(test_docs)

# Transform multi-labels labels
multilabelbin = MultiLabelBinarizer()
train_labels = multilabelbin.fit_transform([reuters.categories(doc_id)] for doc_id in train_docs_id)
test_labels = multilabelbin.transform([reuters.categories(doc_id)] for doc_id in test_docs_id)

# Classification
classifier = OneVsRestClassifier(LinearSVC(random_state=52)) #why this random stat