In [1]:
from pyspark import SparkConf, SparkContext
# Reuse the already-running SparkContext when this cell is executed again in
# the same kernel; constructing a second SparkContext directly raises an error.
sc = SparkContext.getOrCreate(conf=SparkConf().setAppName("MyApp").setMaster("local"))

import re

# Standard helper that parses one article line into a list of words

# Text type used to normalise the input line: `unicode` on Python 2,
# `str` on Python 3 (where the `unicode` builtin no longer exists).
try:
    _text_type = unicode
except NameError:
    _text_type = str


def parse_article(line):
    """Split one article line into its words.

    The line is expected to have the form ``article_id<TAB>text``. Non-word
    characters are stripped from both ends of the text, and the remainder is
    split on whitespace together with any non-word characters touching it.

    Args:
        line: one raw line of the articles file.

    Returns:
        A list of words in their original case; the article id is discarded.
        An empty list is returned when the line has no tab-separated text
        part. A text made only of non-word characters yields ``['']``.
    """
    try:
        # Trailing whitespace is removed first, so a line whose text part is
        # blank loses its tab and falls into the ValueError branch below.
        _, text = _text_type(line.rstrip()).split('\t', 1)
    except ValueError:
        return []
    # Raw strings keep `\W` / `\s` as regex escapes rather than (invalid)
    # Python string escapes.
    text = re.sub(r"^\W+|\W+$", "", text, flags=re.UNICODE)
    return re.split(r"\W*\s+\W*", text, flags=re.UNICODE)

# Instead of storing the data per article, build one RDD holding every word
# (lower-cased) as a separate record. Because the articles are flattened into
# a single stream, the last word of one article is positioned directly before
# the first word of the next one.
wiki_by_word = (
    sc.textFile("/data/wiki/en_articles_part/articles-part", 16)
    .flatMap(parse_article)
    .map(lambda word: word.lower())
    .cache()
)

# Attach a position to every word: records become (word, index).
wiki_with_index = wiki_by_word.zipWithIndex().cache()

# Keep only the occurrences of the target word and key them by position:
# records become (index, word). Pairs are indexed explicitly instead of being
# unpacked in the lambda signature, which is Python-2-only syntax.
filtered = (
    wiki_with_index
    .filter(lambda word_pos: word_pos[0] == u'narodnaya')
    .map(lambda word_pos: (word_pos[1], word_pos[0]))
    .cache()
)

# Positions of the words that immediately follow each target word.
ind = filtered.map(lambda pos_word: pos_word[0] + 1).collect()

# Same positions as a set, so the membership test in the filter below is a
# hash lookup instead of a scan of the whole list for every record.
ind_lookup = frozenset(ind)

# Fetch those following words, keyed by (index - 1) so that the key matches
# the position of the preceding target word and the two RDDs can be joined.
next_word = (
    wiki_with_index
    .filter(lambda word_pos: word_pos[1] in ind_lookup)
    .map(lambda word_pos: (word_pos[1] - 1, word_pos[0]))
    .cache()
)

# Join target words with their successors: (index, (target_word, next_word)).
joined = filtered.join(next_word).cache()

# Build the "first_second" bigram and pair it with 1 for counting.
joined_concat = joined.map(
    lambda pos_words: (pos_words[1][0] + '_' + pos_words[1][1], 1)
).cache()

# Count the occurrences of every bigram and sort them lexicographically.
result = joined_concat.reduceByKey(lambda a, b: a + b).sortByKey().cache()

In [2]:
# Bring the (bigram, count) pairs to the driver and print one per line.
# The count is taken from the collected list, which avoids running a second
# Spark action just to learn how many rows there are.
result_set = result.collect()
result_count = len(result_set)

for bigram, occurrences in result_set:
    # A single parenthesised argument prints identically on Python 2 and 3.
    print("%s\t%d" % (bigram, int(occurrences)))

narodnaya_gazeta	1
narodnaya_volya	9
