In [1]:
from __future__ import print_function

import time
import re
import os.path
import fnmatch
import sgmllib
import urllib
import tarfile

import numpy as np
import pylab as pl

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.stochastic_gradient import SGDClassifier
import itertools

from pyspark import SparkContext

In [2]:
def _not_in_sphinx():
    """Return True when ``__file__`` is defined in this module's globals.

    The download code uses this to decide whether to print carriage-return
    progress updates.
    NOTE(review): presumably ``__file__`` is missing when the code is executed
    by Sphinx (and possibly in an interactive kernel) -- confirm for the
    environment in use.
    """
    return '__file__' in globals()
class ReutersParser(sgmllib.SGMLParser):
    """Incremental SGML parser that collects Reuters documents as dicts.

    Every time a ``reuters`` element is closed, a dict with the keys
    ``'title'``, ``'body'`` and ``'topics'`` is appended to ``self.docs``.
    The ``start_<tag>`` / ``end_<tag>`` methods follow the handler naming
    convention of ``sgmllib.SGMLParser``.

    NOTE(review): ``in_topics`` is set and cleared but never read, and
    ``end_d`` appends the text of every ``d`` element to ``topics`` whatever
    its parent element is. Confirm whether values coming from other ``d``
    containers are meant to end up in ``'topics'``.
    """

    def __init__(self, verbose=0):
        """Initialise the base parser and clear the per-document state."""
        sgmllib.SGMLParser.__init__(self, verbose)
        self._reset()

    def _reset(self):
        """Reset the flags and text buffers for the next document."""
        # 0/1 flags recording which element the parser is currently inside.
        self.in_title = 0
        self.in_body = 0
        self.in_topics = 0
        self.in_topic_d = 0
        # Text accumulated for the document being parsed.
        self.title = ""
        self.body = ""
        self.topics = []
        self.topic_d = ""

    def parse(self, fd):
        """Feed ``fd`` chunk by chunk and yield each completed document dict.

        ``fd`` is any iterable of text chunks (e.g. an open file iterated
        line by line). Documents are yielded as soon as the chunk that
        completes them has been fed. ``close()`` is called after the last
        chunk; nothing is yielded after that call.
        """
        self.docs = []
        for chunk in fd:
            self.feed(chunk)
            for doc in self.docs:
                yield doc
            # Drop the documents that were just yielded.
            self.docs = []
        self.close()

    def handle_data(self, data):
        """Append character data to the buffer of the element being read.

        The body takes precedence over the title, which takes precedence
        over the current ``d`` element.
        """
        if self.in_body:
            self.body += data
        elif self.in_title:
            self.title += data
        elif self.in_topic_d:
            self.topic_d += data

    def start_reuters(self, attributes):
        """Opening ``reuters`` tag: nothing to do."""
        pass

    def end_reuters(self):
        """Closing ``reuters`` tag: store the document and reset the state."""
        # Collapse every run of whitespace in the body into a single space.
        self.body = re.sub(r'\s+', r' ', self.body)
        self.docs.append({'title': self.title,
                          'body': self.body,
                          'topics': self.topics})
        self._reset()

    def start_title(self, attributes):
        """Mark that the parser is inside a ``title`` element."""
        self.in_title = 1

    def end_title(self):
        """Mark that the ``title`` element has ended."""
        self.in_title = 0

    def start_body(self, attributes):
        """Mark that the parser is inside a ``body`` element."""
        self.in_body = 1

    def end_body(self):
        """Mark that the ``body`` element has ended."""
        self.in_body = 0

    def start_topics(self, attributes):
        """Mark that the parser is inside a ``topics`` element."""
        self.in_topics = 1

    def end_topics(self):
        """Mark that the ``topics`` element has ended."""
        self.in_topics = 0

    def start_d(self, attributes):
        """Mark that the parser is inside a ``d`` element."""
        self.in_topic_d = 1

    def end_d(self):
        """Store the text of the finished ``d`` element as a topic."""
        self.in_topic_d = 0
        self.topics.append(self.topic_d)
        self.topic_d = ""


In [3]:
class ReutersStreamReader(object):
    """Stream the documents of the Reuters archive stored under ``data_path``.

    If ``data_path`` does not exist when the reader is created, the archive
    is downloaded from ``DOWNLOAD_URL`` and unpacked into it. Documents are
    then produced lazily by :meth:`iterdocs`.
    """

    DOWNLOAD_URL = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
                    'reuters21578-mld/reuters21578.tar.gz')
    ARCHIVE_FILENAME = 'reuters21578.tar.gz'

    def __init__(self, data_path):
        """Remember ``data_path`` and fetch the dataset if the path is missing."""
        self.data_path = data_path
        if not os.path.exists(self.data_path):
            self.download_dataset()

    def download_dataset(self):
        """Download the dataset archive into ``data_path`` and unpack it."""
        print("downloading dataset (once and for all) into %s" %
              self.data_path)
        # Tolerate an existing directory so that a failed download can be
        # retried by calling this method again.
        if not os.path.isdir(self.data_path):
            os.mkdir(self.data_path)
        archive_path = os.path.join(self.data_path, self.ARCHIVE_FILENAME)

        def progress(blocknum, bs, size):
            # urlretrieve reports a non-positive size when the total is unknown.
            if size > 0:
                total_sz_mb = '%.2f MB' % (size / 1e6)
            else:
                total_sz_mb = 'unknown size'
            current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
            if _not_in_sphinx():
                print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb),
                      end='')

        urllib.urlretrieve(self.DOWNLOAD_URL,
                           filename=archive_path,
                           reporthook=progress)
        if _not_in_sphinx():
            print('\r', end='')
        print("untaring data ...")
        tfile = tarfile.open(archive_path, 'r:gz')
        try:
            # SECURITY NOTE(review): extractall() trusts the member paths of
            # the downloaded archive; only use it with a trusted source.
            tfile.extractall(self.data_path)
        finally:
            # Always release the archive handle, even if extraction fails.
            tfile.close()
        print("done !")

    def iterdocs(self):
        """Iterate doc by doc, yield a dict.

        Walks ``data_path`` recursively and parses every ``*.sgm`` file with
        a fresh ``ReutersParser``. Each file is closed as soon as its
        documents have been consumed.
        """
        for root, _dirnames, filenames in os.walk(self.data_path):
            for filename in fnmatch.filter(filenames, '*.sgm'):
                path = os.path.join(root, filename)
                parser = ReutersParser()
                with open(path) as fd:
                    for doc in parser.parse(fd):
                        yield doc

In [4]:
# Hashing vectorizer used to turn raw text into 2**18-dimensional features.
hasher = HashingVectorizer(decode_error='ignore', n_features=2 ** 18)

# TF-IDF vectorizer configuration (not used by the cells shown below).
tfidf = TfidfVectorizer(
    min_df=3,
    max_df=0.90,
    max_features=3000,
    use_idf=True,
    sublinear_tf=True,
    norm='l2',
)

# Linear model trained incrementally with partial_fit.
classifier = SGDClassifier()

# Lazy stream of document dicts read from the 'reuters' directory.
data_streamer = ReutersStreamReader('reuters').iterdocs()

# Binary labels: 1 = document carries the positive topic, 0 = it does not.
all_classes = np.array([0, 1])
positive_class = [
    'acq', 'corn', 'crude', 'earn', 'grain',
    'interest', 'money-fx', 'ship', 'trade', 'wheat',
]

In [5]:
# NOTE(review): `sc` is never created in the visible cells; presumably the
# kernel (e.g. a pyspark shell/notebook) provides a SparkContext -- confirm.
temp = sc.parallelize(data_streamer)

In [6]:
temp.take(3)

[{'body': '',
  'title': "TREASURY'S BAKER SAYS MACROECONOMIC INDICATORS NEED MORE PROMINENT ROLE\n",
  'topics': ['james-baker']},
 {'body': '',
  'title': 'HOSPITAL CORP SAYS IT RECEIVED 47 DLR A SHARE OFFER FROM INVESTOR GROUP\n',
  'topics': ['acq']},
 {'body': 'Qtly div five cts vs five cts prior Pay July 13 Record June 30 Reuter \x03',
  'title': 'BEVERLY ENTERPRISES <BEV> SETS REGULAR DIVIDEND',
  'topics': ['earn', 'usa']}]

In [7]:
# Keep only the documents without any topic: they form the prediction set.
test = temp.filter(lambda doc: len(doc['topics']) == 0)

In [8]:
# Represent each document by its title and body separated by one space.
test = test.map(lambda doc: "%s %s" % (doc['title'], doc['body']))

In [9]:
test.take(3)

['GENERAL NUTRITION FILES FOR SECONDARY OFFERING OF EIGHT MLN COMMON SHARES\n ',
 'TEXACO NOT REQUIRED TO POST BOND IN APPEAL OF PENNZOIL JUDGMENT, COURT SAYS\n ',
 "MOODY'S MAY DOWNGRADE IRVING BANK CORP, AFFECTS 950 MLN DLRS OF DEBT\n "]

In [10]:
# Count the elements through the RDD API instead of collecting every
# document locally just to measure the length of the resulting list.
test.count()

1862

In [11]:
# Collect the untagged texts into a local tuple; later cells index into it
# to print the documents a model flags as positive.
test_data = tuple(test.collect())

In [12]:
# Vectorise the collected texts. NOTE: this rebinds `test` from the RDD of
# strings to the object returned by hasher.transform, so the earlier cells
# that call RDD methods on `test` cannot be re-run after this one.
test = hasher.transform(test_data)

In [38]:
# Placeholder stats dict; learn() overwrites 'predict' with the classifier's
# predictions for `test`.
stats = {'predict': 0.0}

In [55]:
'''def aux(topics):
    y_train = []
    for i in positive_class:
        y_train.append(i in topics)
    return np.asarray(y_train)'''

'def aux(topics):\n    y_train = []\n    for i in positive_class:\n        y_train.append(i in topics)\n    return np.asarray(y_train)'

In [39]:
def get_minibatch(doc_iter, size, pos_class, transformer=hasher):
    """Extract one minibatch of labelled examples from ``doc_iter``.

    Up to ``size`` documents are pulled from the iterator; documents without
    any topic are dropped. If a whole slice consists only of topic-less
    documents, the next slice is pulled instead of returning an empty batch,
    because callers interpret an empty batch as "stream exhausted".

    Parameters
    ----------
    doc_iter : iterator of dicts with 'title', 'body' and 'topics' keys.
    size : number of documents pulled from ``doc_iter`` per slice.
    pos_class : topic treated as the positive label.
    transformer : object whose ``transform`` turns the texts into features.

    Returns
    -------
    (X, y) : the transformed texts and an int array with 1 where
        ``pos_class`` is among the document's topics and 0 otherwise.
        Two empty int arrays are returned once ``doc_iter`` is exhausted.
    """
    while True:
        chunk = list(itertools.islice(doc_iter, size))
        if not chunk:
            # Nothing left to read: signal the end of the stream.
            return np.asarray([], dtype=int), np.asarray([], dtype=int)
        data = [('{title}\n\n{body}'.format(**doc), pos_class in doc['topics'])
                for doc in chunk
                if doc['topics']]
        if data:
            break
        # Every document of this slice was untagged; try the next slice.

    X, y = zip(*data)
    return transformer.transform(X), np.asarray(y, dtype=int)

In [40]:
def iter_minibatchs(doc_iter, minibatch_size, pos_class):
    """Yield ``(X, y)`` minibatches until ``get_minibatch`` returns no rows.

    Each minibatch is built by ``get_minibatch`` from the next
    ``minibatch_size`` documents of ``doc_iter``, labelled against
    ``pos_class``.
    """
    while True:
        X, y = get_minibatch(doc_iter, minibatch_size, pos_class)
        if not X.shape[0]:
            # A batch without rows marks the end of the iteration.
            break
        yield X, y

In [41]:
# Build a fresh document stream; the generator created earlier was handed to
# sc.parallelize and is presumably exhausted by now.
data_streamer = ReutersStreamReader('reuters').iterdocs()
# Number of documents pulled from the stream for each minibatch.
minibatch_size = 100
# Minibatches labelled against the first topic of positive_class ('acq').
minibatch_iterators = iter_minibatchs(data_streamer, minibatch_size, positive_class[0])

In [42]:
def learn(classifier, stats, batch):
    """Fit ``classifier`` on one minibatch and refresh the predictions.

    Parameters
    ----------
    classifier : estimator exposing ``partial_fit`` and ``predict``.
    stats : dict; its ``'predict'`` entry is overwritten in place.
    batch : ``(X_train, y_train)`` pair for one minibatch.

    Returns
    -------
    (classifier, stats) : the same two objects that were passed in.

    The third parameter used to be written as a tuple pattern in the
    signature, which is Python-2-only syntax; it is now unpacked inside the
    body, so positional callers are unaffected.
    """
    X_train, y_train = batch
    classifier.partial_fit(X_train, y_train, classes=all_classes)
    # `test` and `all_classes` are module-level names defined in other cells.
    stats['predict'] = classifier.predict(test)
    return classifier, stats

In [43]:
import copy


def merge(left, right):
    """Combine two ``(classifier, stats)`` pairs into a single pair.

    Parameters
    ----------
    left, right : ``(classifier, stats)`` tuples. Both classifiers must
        expose ``coef_`` and ``intercept_``.

    Returns
    -------
    (new, stats) : ``new`` is a deep copy of the left classifier whose
        ``coef_`` and ``intercept_`` have the right classifier's values added
        to them; neither input classifier is modified. ``stats`` is the left
        stats dict, returned unchanged.

    NOTE(review): the right-hand stats are discarded and the returned stats
    are not recomputed with the merged model -- confirm that this is intended.
    """
    cf1, stats1 = left
    cf2, _stats2 = right
    new = copy.deepcopy(cf1)
    new.coef_ += cf2.coef_
    new.intercept_ += cf2.intercept_
    return new, stats1

In [44]:
# Distribute the minibatches: every RDD element is one (X, y) pair produced
# by iter_minibatchs.
rdd = sc.parallelize(minibatch_iterators)

In [45]:
rdd.take(2)

[(<100x262144 sparse matrix of type '<type 'numpy.float64'>'
  	with 8551 stored elements in Compressed Sparse Row format>,
  array([0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
         0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
         0, 0, 0, 0, 0, 0, 0, 0])),
 (<100x262144 sparse matrix of type '<type 'numpy.float64'>'
  	with 10876 stored elements in Compressed Sparse Row format>,
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
         0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]))]

In [46]:
# Train on each minibatch; every element becomes a (classifier, stats) pair.
# NOTE(review): presumably Spark ships copies of `classifier` and `stats` to
# the workers, so the driver-side objects are not updated -- confirm.
rdd1 = rdd.map(lambda batch: learn(classifier, stats, batch))

In [47]:
rdd1.first()

(SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
        eta0=0.0, fit_intercept=True, l1_ratio=0.15,
        learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
        penalty='l2', power_t=0.5, random_state=None, shuffle=True,
        verbose=0, warm_start=False),
 {'predict': array([0, 0, 0, ..., 0, 0, 1])})

In [48]:
# Fold all (classifier, stats) pairs into one by applying merge pairwise.
model, stats = rdd1.reduce(merge)

In [49]:
# Print every untagged document predicted as positive and count them in `c`.
c = 0
for i, text in enumerate(test_data):
    if stats['predict'][i] == 1:
        c += 1
        print(text)

ALLIED-SIGNAL SAID IT SOLD MPB CORP FOR 145 MLN DLRS, ASSUMPTION OF DEBT
 
MERRILL LYNCH SAYS IT FIRED NAHUM VASKEVITCH
 
 ALUSUISSE SAYS IT PLANS 50 PCT CAPITAL CUT
 
CHRYSLER SAYS IT SOLD 405 MLN DLRS OF CORPORATE BONDS TO INVESTMENT BANKERS
 
AMERICAN EXPRESS SAYS IT IS CONFIDENT OF ITS OPERATIONS
 
CHASE MANHATTAN SAID IT BOUGHT ATLANTIC FINANCIAL FEDERAL PORTFOLIO FOR 370 MLN DLRS
 
CHRYSLER CORP SAID IT IS ACCELERATING ITS STOCK REPURCHASE PROGRAM
 
ALLEGIS SAID IT IS CREATING A LIMITED PARTNERSHIP TO SELL SOME CANADIAN HOTELS
 
ALLEGIS SAID IT SEES PROCEEDS OF 350 MLN CANADIAN DLRS WHEN HOTELS ARE SOLD
 
HOSPITAL CORP OF AMERICA <HCA> GETS MERGER BID Hospital Corp of America said it received a letter today from an investor group offering to buy the company for 47 dlrs a share. Based on 82 mln outstanding shares, the offer is worth about 3.85 billion dlrs. But the company said that it would take more than five billion dlrs to consummate the merger and retire certain company debt.

In [55]:
# Train one binary model per topic; result[k] holds the stats (including the
# predictions for the untagged documents) obtained for positive_class[k].
result = []
for i in positive_class:
    # A fresh stream is needed for every topic because generators are one-shot.
    data_streamer = ReutersStreamReader('reuters').iterdocs()
    minibatch_size = 100
    minibatch_iterators = iter_minibatchs(data_streamer, minibatch_size, i)
    trained = sc.parallelize(minibatch_iterators).map(
        lambda batch: learn(classifier, stats, batch))
    model, stats = trained.reduce(merge)
    result.append(stats)

In [None]:
## Print every untagged document predicted as 'acq' (result[0])

In [56]:
# Documents predicted as 'acq' (result[0] corresponds to positive_class[0]).
for i, text in enumerate(test_data):
    if result[0]['predict'][i] == 1:
        print(text)

ALLIED-SIGNAL SAID IT SOLD MPB CORP FOR 145 MLN DLRS, ASSUMPTION OF DEBT
 
MERRILL LYNCH SAYS IT FIRED NAHUM VASKEVITCH
 
 ALUSUISSE SAYS IT PLANS 50 PCT CAPITAL CUT
 
CHRYSLER SAYS IT SOLD 405 MLN DLRS OF CORPORATE BONDS TO INVESTMENT BANKERS
 
AMERICAN EXPRESS SAYS IT IS CONFIDENT OF ITS OPERATIONS
 
CHASE MANHATTAN SAID IT BOUGHT ATLANTIC FINANCIAL FEDERAL PORTFOLIO FOR 370 MLN DLRS
 
CHRYSLER CORP SAID IT IS ACCELERATING ITS STOCK REPURCHASE PROGRAM
 
ALLEGIS SAID IT IS CREATING A LIMITED PARTNERSHIP TO SELL SOME CANADIAN HOTELS
 
ALLEGIS SAID IT SEES PROCEEDS OF 350 MLN CANADIAN DLRS WHEN HOTELS ARE SOLD
 
HOSPITAL CORP OF AMERICA <HCA> GETS MERGER BID Hospital Corp of America said it received a letter today from an investor group offering to buy the company for 47 dlrs a share. Based on 82 mln outstanding shares, the offer is worth about 3.85 billion dlrs. But the company said that it would take more than five billion dlrs to consummate the merger and retire certain company debt.

In [57]:
# Documents predicted as 'corn' (result[1] corresponds to positive_class[1]).
for i, text in enumerate(test_data):
    if result[1]['predict'][i] == 1:
        print(text)

KANSAS CITY SHIPMENTS AND RECEIPTS in bushels Truck Receipts Wheat 73,684 Corn 17,979 Grain Sorghum 4,391 Soybns 72,180 Truck Shipments Corn 8,000 Barge Shipments - None Reuter 
CBT CORN FUTURES HOLD GAINS AT MIDSESSION CBT corn futures held gains of two to 2-3/4 cents per bushel in moderately light midsession trade. Yesterday's sharp jump in the weekly corn export figure to about three times last year's weekly inspections, plus the outlook for a continued active export pace due to the decline in the U.S. dollar this year, supported nearbys, traders said. Diminished PIK-and-roll activity, compared with the active pace seen last week, added to the more positive tone. Most dealers expect the contract lows in corn futures to hold, with some looking for the improving chart picture to support a further advance. A local house, Cargill, the Illinois Co-op, Refco and ADM were the featured buyers of old crop May, with Drexel/Burnham a steady buyer in new crop December, pit brokers said. Reuter

In [58]:
# Documents predicted as 'crude' (result[2] corresponds to positive_class[2]).
for i, text in enumerate(test_data):
    if result[2]['predict'][i] == 1:
        print(text)

U.S. ENERGY FUTURES STEADY BUT QUIET U.S. energy futures traded consistently above yesterday's closing prices in thin activity this morning with underlying support from April 15-day forward North Sea Brent crude, traders said. April Brent traded as high as 19.40 dlrs a barrel today, or more than 1.00 dlr above May Brent prices, because of a supply squeeze, according to traders. "U.S. energy futures are probably influenced by April Brent more than anything else today," said John O'Connell, assistant vice president at Refco, Inc. May crude was up 18 cts to 18.85 dlrs a barrel. "There is no reason to sell energy futures at these prices because the fundamentals have not changed," said O'Connell, adding that the OPEC pricing/production accord continues to hold. Mixed trade participants dominated crude futures while local traders were featured in products, traders said. They said U.S. energy futures ran into resistance at today's highs. May heating oil was up 0.46 cent at 49.25 cts a gallon 

In [59]:
# Documents predicted as 'earn' (result[3] corresponds to positive_class[3]).
for i, text in enumerate(test_data):
    if result[3]['predict'][i] == 1:
        print(text)

Toshiba group net 34.18 billion yen (59.44 billion) year to March 31
 
WHITTAKER CORP 2nd QTR SHR PROFIT 37 CTS VS LOSS 35 CTS
 
SECURITY PACIFIC EXPECTS 175 MLN DLR LOSS IN QTR AS RESULT OF LOSS PROVISION
 
Fujitsu Ltd group net profit 21.61 billion yen vs 38.93 billion (year to Mar 31)
 
S/P AFFIRMS W.R. GRACE AND CO'S 575 MLN DLRS OF DEBT AFTER ITS 4TH-QTR LOSS
 
K MART SEES 1987 SALES OF 26 BILLION DLRS, UP FROM 23.8 BILLION IN 1986
 
CITYFED FINANCIAL CORP SAYS IT CUT QTRLY DIVIDEND TO ONE CENT FROM 10 CTS/SHR
 
PAINEWEBBER GROUP INC 3RD QTR SHARE 44 CTS VS 71 CTS
 
NEW YORK TIMES CO 3RD QTR SHR 40 CTS VS 33 CTS
 
BELLSOUTH CORP 3RD QTR SHR 87 CTS VS 84 CTS
 
FIRST BOSTON INC 3RD QTR SHR 1.15 DLRS VS 76 CTS
 
TRW INC 3RD QTR SHR 1.01 DLRS VS 66 CTS
 
CATERPILLAR INC SEES HIGHER FISCAL 1987 EARNINGS
 
CATERPILLAR INC 3RD QTR SHR 1.47 DLRS VS LOSS 26 CTS
 
FEDERAL-MOGUL CORP 3RD QTR SHR 66 CTS VS 48 CTS
 
DIME SAVINGS BANK OF NEW YORK 3RD QTR NET 27.8 MLN DLRS 28.6 MLN DLRS
 
AIR PR

In [60]:
# Documents predicted as 'grain' (result[4] corresponds to positive_class[4]).
for i, text in enumerate(test_data):
    if result[4]['predict'][i] == 1:
        print(text)

MIDWEST GRAIN FUTURES 11:00   EDT MINNEAPOLIS WHEAT MAY7 284 3/4 UP 1 1/4 JUL7 280 1/4 OFF 1/2 SEP7 277 UP 1 DEC7 -- -- MAR8 -- -- 
MIDWEST GRAIN FUTURES 11:01   EDT MINNEAPOLIS WHEAT MAY7 284 3/4 UP 1 1/4 JUL7 280 1/4 OFF 1/2 SEP7 277 UP 1 DEC7 -- -- MAR8 -- -- 
CBT WHEAT FUTURES OPEN FIRMER, SET NEW HIGHS CBT wheat futures opened firmer to again set new contract highs in new crop, then back off those highs to hold gains of 1-1/4 cents per bushel in May, with new crop July unchanged in light early dealings. Steady speculative buying after yesterday's strong close kept the chart picture very bullish and supported initial values, traders said. Rumors that exporters are planning to ship SRW wheat out of Toledo and/or Chicago, further tightening already low deliverable stocks, kept May firm relative to new crop months, they added. However, the rally failed to follow-through due to the lack of confirmed export sales of significant quantities of U.S. wheat so far this week, as some trader