In [1]:
from pyspark import SparkConf, SparkContext
# Reuse the already-running SparkContext when this cell is executed again:
# constructing a second SparkContext in the same kernel raises ValueError.
# Note that the conf is only applied when a new context has to be created.
sc = SparkContext.getOrCreate(conf=SparkConf().setAppName("MyApp").setMaster("local"))

import re

def parse_article(line):
    """Split one "<article_id>\\t<text>" record into a list of words.

    Args:
        line: one input record, as text or as UTF-8 encoded bytes.

    Returns:
        The words of the article text, with punctuation adjacent to the
        whitespace separators removed (punctuation inside a token, such as
        the hyphen in "anti-statism", is kept). Returns an empty list when
        the record has no tab-separated text part, cannot be decoded, or
        contains no word characters at all.
    """
    try:
        # Decode bytes explicitly instead of relying on the Python 2-only
        # `unicode` builtin and its implicit ASCII codec.
        if isinstance(line, bytes):
            line = line.decode('utf-8')
        _article_id, text = line.rstrip().split('\t', 1)
    except ValueError:
        # Raised when there is no tab to split on (unpacking fails) or when
        # the bytes are not valid UTF-8 (UnicodeDecodeError is a ValueError).
        return []

    # Drop leading/trailing non-word characters so the split below does not
    # produce empty tokens at either end.
    text = re.sub(r"^\W+|\W+$", "", text, flags=re.UNICODE)
    if not text:
        # re.split("...", "") would return [''], i.e. one bogus empty word.
        return []

    # Split on whitespace runs, swallowing any punctuation touching them.
    return re.split(r"\W*\s+\W*", text, flags=re.UNICODE)

# Read the articles file as an RDD of text lines (the second argument is the
# minimum number of partitions) and tokenize every line with parse_article.
wiki = (
    sc.textFile("/data/wiki/en_articles_part/articles-part", 16)
    .map(parse_article)
)

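Footnote: the second argument of `sc.textFile` is the minimum number of partitions; in the Java/Scala API the signature is `sc.textFile(String path, int minPartitions)`.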

In [12]:
# Fetch only the first parsed article and show its word list.
first_rows = wiki.take(1)
result = first_rows[0]
print(result)

[u'Anarchism', u'Anarchism', u'is', u'often', u'defined', u'as', u'a', u'political', u'philosophy', u'which', u'holds', u'the', u'state', u'to', u'be', u'undesirable', u'unnecessary', u'or', u'harmful', u'The', u'following', u'sources', u'cite', u'anarchism', u'as', u'a', u'political', u'philosophy', u'Slevin', u'Carl', u'Anarchism', u'The', u'Concise', u'Oxford', u'Dictionary', u'of', u'Politics', u'Ed', u'Iain', u'McLean', u'and', u'Alistair', u'McMillan', u'Oxford', u'University', u'Press', u'2003', u'However', u'others', u'argue', u'that', u'while', u'anti-statism', u'is', u'central', u'it', u'is', u'inadequate', u'to', u'define', u'anarchism', u'Therefore', u'they', u'argue', u'instead', u'that', u'anarchism', u'entails', u'opposing', u'authority', u'or', u'hierarchical', u'organization', u'in', u'the', u'conduct', u'of', u'human', u'relations', u'including', u'but', u'not', u'only', u'the', u'state', u'system', u'Proponents', u'of', u'anarchism', u'known', u'as', u'anarchists', u

Footnote:
1. `take(n)` returns a list of RDD elements; each element here is itself a list of words, so the returned value is a list of lists.
2. When using `take(2)`, the returned value is a list of 2 lists.

In [13]:
# Normalise case: replace each article's word list with a lower-cased copy
wiki = wiki.map(lambda article_words: [token.lower() for token in article_words])

In [14]:
def parse_word_pair_with_start_word(words, start_word='word'):
    """Collect every adjacent word pair whose first word equals `start_word`.

    Args:
        words: sequence of words in document order.
        start_word: the word a pair must begin with.

    Returns:
        A list of ("<start_word>_<next_word>", 1) tuples, one per occurrence
        of `start_word` that is followed by another word, in document order.
        An occurrence in the last position has no successor and yields
        nothing; an empty `words` yields an empty list.
    """
    # The template is a unicode literal on purpose: on Python 2 a byte-string
    # template would ASCII-encode unicode words and raise UnicodeEncodeError
    # as soon as the following word contains a non-ASCII character.
    return [
        (u'{}_{}'.format(first, second), 1)
        for first, second in zip(words, words[1:])
        if first == start_word
    ]

In [15]:
# Keep only the word pairs that start with 'narodnaya'; flatMap merges the
# per-article pair lists into a single flat RDD of (word_pair, 1) records.
wiki_word_pair = wiki.flatMap(
    lambda article_words: parse_word_pair_with_start_word(
        article_words, start_word='narodnaya'
    )
)

In [21]:
# Peek at the first three (word_pair, 1) records to sanity-check the result
wiki_word_pair.take(3)

[('narodnaya_volya', 1), ('narodnaya_volya', 1), ('narodnaya_volya', 1)]

In [22]:
# Sum the per-occurrence counts of every word pair (the pair string is the key)
wiki_word_pair_agg = wiki_word_pair.reduceByKey(lambda total, count: total + count)

# Order the aggregated records by their word-pair key
wiki_word_pair_agg_sort = wiki_word_pair_agg.sortByKey()

In [24]:
# Bring the aggregated records to the driver and print one
# "<word_pair><TAB><count>" line per word pair.
final_result = wiki_word_pair_agg_sort.collect()
for word_pair, count in final_result:
    # Call form of print: valid on both Python 2 and Python 3, and consistent
    # with the print(result) call used earlier in this notebook.
    print('{}\t{}'.format(word_pair, count))

narodnaya_gazeta	1
narodnaya_volya	9
