In [1]:
# Parse the tab-separated news corpus into a list of rows (each row is a list
# of string fields).
# The encoding is given explicitly so the result does not depend on the
# platform's locale default -- assumes the corpus file is UTF-8; TODO confirm.
# Iterating over the file splits on real line endings only, whereas
# str.splitlines() would also break a row at characters such as U+2028 or a
# form feed if one appeared inside a field.
with open('./NewsAggregatorDataset/newsCorpora.csv', encoding='utf-8') as f:
    data = [line.rstrip('\n').split('\t') for line in f]

In [2]:
# Peek at the first two parsed rows to check the column layout.
data[:2]

[['1',
  'Fed official says weak data caused by weather, should not slow taper',
  'http://www.latimes.com/business/money/la-fi-mo-federal-reserve-plosser-stimulus-economy-20140310,0,1312750.story\\?track=rss',
  'Los Angeles Times',
  'b',
  'ddUyU0VZz0BRneMioxUPQVP6sIxvM',
  'www.latimes.com',
  '1394470370698'],
 ['2',
  "Fed's Charles Plosser sees high bar for change in pace of tapering",
  'http://www.livemint.com/Politics/H2EvwJSK2VE6OF7iK1g3PP/Feds-Charles-Plosser-sees-high-bar-for-change-in-pace-of-ta.html',
  'Livemint',
  'b',
  'ddUyU0VZz0BRneMioxUPQVP6sIxvM',
  'www.livemint.com',
  '1394470371207']]

50

In [3]:
import random
# Seed the global RNG so the random.shuffle() call further down produces the
# same ordering on every fresh run.
random.seed(0)

In [4]:
# Publishers whose articles are kept for the data set.
publisher = ["Reuters", "Huffington Post", "Businessweek", "Contactmusic.com", "Daily Mail"]

# Column 3 of each row is the publisher name; keep the rows from listed publishers.
article = list(filter(lambda row: row[3] in publisher, data))
article[10]

['64',
 "EBay CEO Donahoe's Pay Drops by 53% to $13.8 Million for 2013",
 'http://www.businessweek.com/news/2014-03-10/ebay-ceo-donahoe-s-pay-drops-by-53-percent-to-13-dot-8-million-for-2013',
 'Businessweek',
 'b',
 'dxyGGb4iN9Cs9aMZTKQpJeoiQfruM',
 'www.businessweek.com',
 '1394470923237']

In [5]:
# Shuffle in place using the global RNG. The resulting order depends on the seed
# set earlier and on this cell being run exactly once after seeding; running it
# again reshuffles the already shuffled list.
random.shuffle(article)
article[10]

['276048',
 'FOREX- Euro pinned near four-month lows as ECB looms',
 'http://in.reuters.com/article/2014/06/04/markets-forex-idINL3N0OL4OA20140604',
 'Reuters',
 'b',
 'd84jnxHjQNm9acMpm2areNmDtrR_M',
 'in.reuters.com',
 '1401950103350']

In [6]:
# Reduce each row to [category, title]: column 4 holds the category code and
# column 1 the headline. This overwrites `article`, so re-running the cell on
# the already reduced rows raises IndexError.
article = [[fields[4], fields[1]] for fields in article]

In [7]:
# Split the shuffled articles 80 / 10 / 10 into train, validation and test.
n = len(article)
train_end, valid_end = int(n * 0.8), int(n * 0.9)
train, valid, test = (
    article[:train_end],
    article[train_end:valid_end],
    article[valid_end:],
)
len(train), len(valid), len(test)

(10684, 1336, 1336)

In [8]:
def write_txt(filename, data):
    """Write rows to ``./<filename>.txt`` as tab-separated lines.

    Args:
        filename: Base name of the output file; ``.txt`` is appended and the
            file is created in the current working directory.
        data: Iterable of rows, each an iterable of strings. The fields of a
            row are joined with a tab and terminated with a newline.

    The file is written as UTF-8 regardless of the platform's default
    encoding, so non-ASCII text is stored the same way everywhere.
    """
    with open('./{}.txt'.format(filename), 'w', encoding='utf-8') as f:
        for a in data:
            # write() emits the line in one call; writelines() given a str
            # would iterate over it character by character.
            f.write("\t".join(a) + '\n')

# Persist each split as train.txt / valid.txt / test.txt in the working directory.
write_txt(filename="train", data=train)
write_txt(filename="valid", data=valid)
write_txt(filename="test", data=test)
    

In [9]:
from collections import Counter
from tabulate import tabulate

In [10]:
# Category codes paired with their readable names; the two lists stay aligned
# by position.
categories, category_names = (
    list(column)
    for column in zip(
        ('b', 'business'),
        ('t', 'science and technology'),
        ('e', 'entertainment'),
        ('m', 'health'),
    )
)

In [11]:
# Count how many examples of each category fall into each split and print the
# counts as a table (one row per split, one column per category code).
table = [
    [split_name, *(label_counts[cat] for cat in categories)]
    for split_name, label_counts in (
        (split_name, Counter(label for label, _ in rows))
        for split_name, rows in (('train', train), ('valid', valid), ('test', test))
    )
]
print(tabulate(table, headers=categories))

          b     t     e    m
-----  ----  ----  ----  ---
train  4557  1203  4180  744
valid   543   159   542   92
test    527   163   572   74


51

In [12]:
import re
import spacy
import nltk

In [13]:
# Load the spaCy English pipeline and build the Snowball stemmer that
# tokenize() uses.
# NOTE(review): 'en' is a shortcut model name; spaCy v3+ dropped shortcut names
# and expects a full package name such as 'en_core_web_sm' -- confirm which
# spaCy version this notebook targets before upgrading.
nlp = spacy.load('en')
stemmer = nltk.stem.snowball.SnowballStemmer(language='english')

In [14]:
def tokenize(x):
    """Turn a headline string into a list of normalized tokens.

    Runs of whitespace are collapsed to a single space, the text is tokenized
    with ``nlp.make_doc``, and each token's ``lemma_`` is lower-cased and then
    stemmed with the Snowball stemmer.

    Args:
        x: The text to tokenize.

    Returns:
        A list of stemmed token strings, in order of appearance.
    """
    collapsed = re.sub(r'\s+', ' ', x)
    # nlp(x) is slow because it runs components other than the tokenizer too,
    # so only make_doc() is called here.
    doc = nlp.make_doc(collapsed)
    return [stemmer.stem(token.lemma_.lower()) for token in doc]

In [15]:
# Tokenize the headline of every training and validation example, keeping the
# category label alongside its token list.
tokenized_train, tokenized_valid = (
    [[cat, tokenize(line)] for cat, line in split]
    for split in (train, valid)
)

In [16]:
# Tokenize every test example the same way as train and valid, so the test
# split is not silently truncated to its first few rows.
tokenized_test = [[cat, tokenize(line)] for cat, line in test]

In [17]:
# Token frequencies over the training headlines, listed from most to least common.
counter = Counter(
    token for _, tokens in tokenized_train for token in tokens
)
counter.most_common()

[('-', 3318),
 ("'", 3094),
 ('to', 2840),
 ('have', 2158),
 (',', 2125),
 ('...', 2036),
 ('in', 1916),
 ('a', 1876),
 ('the', 1558),
 ('of', 1466),
 ('for', 1365),
 ('on', 1359),
 ('updat', 1095),
 (':', 1076),
 ('and', 949),
 ('us', 911),
 ('be', 791),
 ('with', 750),
 ('at', 683),
 ('after', 591),
 ('new', 538),
 ('$', 502),
 ('"', 500),
 ('say', 446),
 ('stock', 444),
 ('(', 419),
 (')', 411),
 ('not', 385),
 ('up', 378),
 ('is', 376),
 ('from', 375),
 ('by', 345),
 ('?', 325),
 ('kardashian', 295),
 ('rise', 287),
 (';', 286),
 ('it', 271),
 ('china', 266),
 ('over', 265),
 ('share', 263),
 ('her', 262),
 ('high', 259),
 ('will', 258),
 ('kim', 251),
 ('euro', 248),
 ('show', 237),
 ('fall', 235),
 ('low', 232),
 ('1', 232),
 ('bank', 221),
 ('about', 218),
 ('2', 217),
 ('-pron-', 217),
 ('rate', 215),
 ('market', 212),
 ('year', 208),
 ('sale', 207),
 ('deal', 207),
 ('!', 205),
 ('can', 203),
 ('fed', 202),
 ('star', 200),
 ('dollar', 197),
 ('may', 196),
 ('day', 193),
 ('tha

In [18]:
# Vocabulary: tokens seen more than 2 and fewer than 300 times in training,
# ordered from most to least frequent.
vocab = [
    word
    for word, count in counter.most_common()
    if count > 2 and count < 300
]
vocab

['kardashian',
 'rise',
 ';',
 'it',
 'china',
 'over',
 'share',
 'her',
 'high',
 'will',
 'kim',
 'euro',
 'show',
 'fall',
 'low',
 '1',
 'bank',
 'about',
 '2',
 '-pron-',
 'rate',
 'market',
 'year',
 'sale',
 'deal',
 '!',
 'can',
 'fed',
 'star',
 'dollar',
 'may',
 'day',
 'that',
 'out',
 'ecb',
 'profit',
 'buy',
 'time',
 'drop',
 'get',
 'report',
 'wall',
 'see',
 'make',
 'global',
 'you',
 'take',
 'forex',
 'off',
 'billion',
 'gain',
 'but',
 'datum',
 'reveal',
 'near',
 'set',
 'she',
 'first',
 'bond',
 'much',
 'his',
 'week',
 'top',
 'miley',
 'cyrus',
 'ceo',
 'cut',
 'hit',
 'video',
 'million',
 'west',
 'price',
 'st',
 'open',
 'month',
 'record',
 'pay',
 'bln',
 'growth',
 'plan',
 'chris',
 'ukrain',
 'do',
 'down',
 'than',
 'end',
 'game',
 'more',
 'wed',
 'talk',
 'one',
 'home',
 'kany',
 'as',
 'rais',
 'befor',
 'he',
 'this',
 'googl',
 'oil',
 'all',
 'no',
 '.',
 'justin',
 'bieber',
 'inflat',
 'are',
 'who',
 'gold',
 'film',
 'into',
 'movi'

In [19]:
# Adjacent token pairs from the training headlines that occur more than four
# times, ordered from most to least frequent.
bi_grams = [
    pair
    for pair, count in Counter(
        pair
        for _, sent in tokenized_train
        for pair in zip(sent, sent[1:])
    ).most_common()
    if count > 4
]
bi_grams

[('us', 'stock'),
 ('kim', 'kardashian'),
 ('(', '1'),
 ('1', ')'),
 ('stock', '-'),
 ('forex', '-'),
 ('-pron-', 'have'),
 ('of', 'the'),
 ('miley', 'cyrus'),
 ('wall', 'st'),
 ('-', 'wall'),
 ('to', 'buy'),
 ("'", 'the'),
 ('justin', 'bieber'),
 ('kany', 'west'),
 ('have', "'"),
 ("'", ':'),
 ('new', 'york'),
 ('rpt', '-'),
 ('global', 'market'),
 ('in', 'the'),
 ('to', 'be'),
 ('market', '-'),
 ('-', 'dollar'),
 ('correct', '-'),
 ('ahead', 'of'),
 ('for', '$'),
 ('a', 'she'),
 ('game', 'of'),
 (',', 'but'),
 ('do', 'not'),
 ('of', 'throne'),
 ('on', 'the'),
 ('2', ')'),
 ('(', '2'),
 ('lindsay', 'lohan'),
 (':', "'"),
 ('in', 'new'),
 ('at', 'the'),
 ('in', '...'),
 ('selena', 'gomez'),
 ('a', '...'),
 ('the', '...'),
 ('set', 'to'),
 ('star', 'war'),
 ('at', "'"),
 ('precious', '-'),
 ('-', 'updat'),
 ('be', 'not'),
 ('-', 'euro'),
 ('s&p', '500'),
 ('plan', 'to'),
 ('a', "'"),
 (':', 'the'),
 ('in', 'a'),
 ('in', 'us'),
 ('will', 'not'),
 ('for', 'the'),
 ('kardashian', 'and'),
 

In [20]:
# Save the vocabulary, one token per line. UTF-8 is requested explicitly so
# non-ASCII tokens are written the same way on every platform instead of
# depending on (or failing under) the locale's default encoding.
with open('vocab_for_news.txt', 'w', encoding='utf-8') as f:
    for token in vocab:
        print(token, file = f)

In [21]:
# Save the frequent bigrams, one per line with the two tokens separated by a
# space. UTF-8 is requested explicitly so non-ASCII tokens are written the same
# way on every platform instead of depending on the locale's default encoding.
with open('bi_grams_for_news.txt', 'w', encoding='utf-8') as f:
    for tup in bi_grams:
        print(' '.join(tup), file = f)