### Parse data and create raw RDD

In [1]:
from pyspark import SparkConf, SparkContext
# Reuse the already-running SparkContext when this cell is executed again;
# constructing a second context in the same kernel raises an error.
sc = SparkContext.getOrCreate(conf=SparkConf().setAppName("MyApp").setMaster("local"))

import re

def parse_article(line):
    """Tokenize one tab-separated ``article_id<TAB>text`` line into words.

    Leading and trailing non-word characters are stripped from the text, which
    is then split on whitespace runs together with any non-word characters
    directly adjacent to them.

    Args:
        line: One raw input line.

    Returns:
        A list of word strings (``['']`` when the text contains no word
        characters), or an empty list when the line has no tab separator.
    """
    # `unicode` exists only on Python 2; fall back to `str` so the function
    # also runs on Python 3.
    try:
        to_text = unicode
    except NameError:
        to_text = str

    try:
        # The coercion stays inside the try: on Python 2 a decode failure is a
        # ValueError subclass and is treated like any other malformed line.
        _article_id, text = to_text(line.rstrip()).split('\t', 1)
    except ValueError:
        # No tab separator (or undecodable input): skip the line.
        return []

    # Raw strings keep \W and \s as regex classes rather than (invalid)
    # string escape sequences.
    text = re.sub(r"^\W+|\W+$", "", text, flags=re.UNICODE)
    return re.split(r"\W*\s+\W*", text, flags=re.UNICODE)

# Load the article dump (16 is passed as textFile's partition-count argument)
# and turn every line into its list of word tokens.
wiki = (
    sc.textFile("/data/wiki/en_articles_part/articles-part", 16)
    .map(parse_article)
)


In [4]:
# Show the first five tokens of the first parsed record.
wiki.take(1)[0][:5]

[u'Anarchism', u'Anarchism', u'is', u'often', u'defined']

### Build word pairs

In [2]:
# Join every two adjacent words of a record, lower-cased, into one
# "first_second" token; records with fewer than two words yield nothing.
wiki_pairs = wiki.flatMap(
    lambda words: [
        left.lower() + '_' + right.lower()
        for left, right in zip(words, words[1:])
    ]
)

In [3]:
# Show five of the generated word-pair tokens.
wiki_pairs.take(5)

[u'anarchism_anarchism',
 u'anarchism_is',
 u'is_often',
 u'often_defined',
 u'defined_as']

### Count each pair

In [5]:
# Count occurrences of each pair: emit (pair, 1) and sum the ones per key.
wiki_pairs_counts = (
    wiki_pairs
    .map(lambda bigram: (bigram, 1))
    .reduceByKey(lambda left, right: left + right)
)

In [6]:
# Show ten (pair, count) tuples from the counted RDD.
wiki_pairs_counts.take(10)

[(u'standard_6-bit', 2),
 (u'fastest_mode', 1),
 (u'monks_assaulted', 1),
 (u'nabla_\\mu_is', 1),
 (u'flag_from', 2),
 (u'singer-songwriter_judas', 1),
 (u'camp_often', 2),
 (u'the_pandorica', 2),
 (u'around_mid-draw', 1),
 (u"strait's_asiatic", 1)]

### Find word pairs starting with required word 'narodnaya'

In [7]:
# Keep only pairs whose first word is exactly 'narodnaya'. Matching on the
# 'narodnaya_' prefix (word plus joiner) rejects longer first words that merely
# begin with the same letters.
results = (
    wiki_pairs_counts
    .filter(lambda pair_count: pair_count[0].startswith('narodnaya_'))
    .collect()
)

In [9]:
# Emit one "pair<TAB>count" line per match. The parenthesised single-argument
# form prints the same text under the Python 2 print statement and the
# Python 3 print function.
for bigram, occurrences in results:
    print('%s\t%s' % (bigram, occurrences))

narodnaya_gazeta	1
narodnaya_volya	9
