### Parse data and create RDD

In [1]:
from pyspark import SparkConf, SparkContext
# Use getOrCreate so that re-running this cell reuses the live context instead
# of failing: only one SparkContext may be active per kernel/JVM at a time.
sc = SparkContext.getOrCreate(conf=SparkConf().setAppName("MyApp").setMaster("local"))

import re

def parse_article(line):
    """Tokenize one ``article_id<TAB>text`` line into lowercase words.

    The article id is discarded. Leading and trailing non-word characters are
    stripped from the text, which is then split on runs of whitespace together
    with any non-word characters adjacent to that whitespace.

    Args:
        line: A text string, or a UTF-8 encoded byte string, holding one
            article record.

    Returns:
        A list of lowercase tokens with empty tokens removed. An empty list is
        returned when the line has no tab separator after right-stripping, or
        when a byte-string line is not valid UTF-8.
    """
    try:
        # `bytes` is `str` on Python 2, so this decodes raw byte lines on both
        # major versions without relying on the Python 2-only `unicode` builtin.
        # UnicodeDecodeError is a ValueError subclass and is handled below.
        if isinstance(line, bytes):
            line = line.decode("utf-8")
        _article_id, text = line.rstrip().split('\t', 1)
        # Raw-string patterns: "\W" and "\s" are not valid str escape sequences.
        text = re.sub(r"^\W+|\W+$", "", text, flags=re.UNICODE)
        words = re.split(r"\W*\s+\W*", text, flags=re.UNICODE)
        # Splitting an empty string yields [''], so drop empty tokens to keep
        # them out of the word and bigram counts.
        return [word.lower() for word in words if word]
    except ValueError:
        # No tab separator (unpacking failed) or undecodable bytes.
        return []

# Read the article dump, asking for at least 16 partitions, and tokenize each line.
wiki = (
    sc.textFile("/data/wiki/en_articles_part/articles-part", minPartitions=16)
    .map(parse_article)
)

In [3]:
# Sanity check: first five tokens of the first parsed article.
wiki.take(1)[0][:5]

[u'anarchism', u'anarchism', u'is', u'often', u'defined']

### Filter out stopwords

In [4]:
! head /datasets/stop_words_en.txt

a
about
above
across
after
afterwards
again
against
all
almost


In [5]:
# Load the stop-word dictionary onto the driver as a list, one entry per line.
stopWords_data = (
    sc.textFile('/datasets/stop_words_en.txt')
    .collect()
)

In [6]:
# Broadcast the stop words as a frozenset rather than a list: membership is
# tested once per token on the executors, and a hash lookup avoids scanning
# the whole stop-word list for every word.
stopWords_broadcast = sc.broadcast(frozenset(stopWords_data))

In [8]:
def _drop_stop_words(tokens):
    """Return the tokens of one article that are not in the broadcast stop words."""
    stop_words = stopWords_broadcast.value
    return [token for token in tokens if token not in stop_words]


wiki_filtered = wiki.map(_drop_stop_words)

In [12]:
# Sanity check: first five tokens of the first article after stop-word removal.
wiki_filtered.take(1)[0][:5]

[u'anarchism', u'anarchism', u'defined', u'political', u'philosophy']

### Compute Word Counts

In [22]:
# Flatten the per-article token lists into a single RDD of words.
wiki_words = wiki_filtered.flatMap(lambda tokens: tokens)

In [25]:
# Emit (word, 1) for every token, then sum the ones per distinct word.
word_counts = (
    wiki_words
    .map(lambda word: (word, 1))
    .reduceByKey(lambda total, increment: total + increment)
)

In [26]:
# Peek at three (word, count) pairs.
word_counts.take(3)

[(u'biennials', 10), (u'underlyingly', 1), (u'ancyra', 43)]

### Compute all bigrams - pairs of consecutive words

In [13]:
# Pair each token with its successor inside the same article and join the two
# with an underscore; articles with fewer than two tokens contribute nothing.
wiki_bigrams = wiki_filtered.flatMap(
    lambda tokens: [first + '_' + second
                    for first, second in zip(tokens, tokens[1:])]
)

In [14]:
# Peek at the first five bigram strings.
wiki_bigrams.take(5)

[u'anarchism_anarchism',
 u'anarchism_defined',
 u'defined_political',
 u'political_philosophy',
 u'philosophy_holds']

### Count bigrams and filter out those with fewer than 500 occurrences

In [15]:
# Emit (bigram, 1) for every occurrence, then sum the ones per distinct bigram.
wiki_bigrams_counts = (
    wiki_bigrams
    .map(lambda bigram: (bigram, 1))
    .reduceByKey(lambda total, increment: total + increment)
)

In [16]:
# Peek at ten (bigram, count) pairs before filtering.
wiki_bigrams_counts.take(10)

[(u'2,000_1.5', 1),
 (u'fastest_mode', 1),
 (u'cases_federal', 4),
 (u'creem_particular', 1),
 (u'subgroups_lie', 2),
 (u'defendiendo_chile', 1),
 (u'vol_62', 4),
 (u'initial_production', 7),
 (u'buffalo_niagaras', 1),
 (u'flames_bas-reliefs', 1)]

In [17]:
# Keep only the bigrams whose count is at least 500.
bigrams_filtered = wiki_bigrams_counts.filter(
    lambda bigram_count: bigram_count[1] >= 500
)

In [18]:
# Peek at ten of the bigrams that passed the count threshold.
bigrams_filtered.take(10)

[(u'york_city', 625),
 (u'notes_references', 638),
 (u'north_american', 510),
 (u'new_york', 4193),
 (u'roman_catholic', 630),
 (u'award_best', 570),
 (u'american_singer-songwriter', 549),
 (u'catholic_church', 582),
 (u'world_war', 2335),
 (u'prime_minister', 1051)]

In [21]:
# Collect every remaining bigram key to the driver and check that
# 'notes_references' is among them.
'notes_references' in bigrams_filtered.keys().collect()

True