In [27]:
#import required libraries
from pyspark import SparkContext

Create a Spark context, read the text file, and apply flatMap() to split each line into words, producing a single RDD that contains every word.

In [28]:
# Reuse the active SparkContext if there is one; otherwise create it.
sc = SparkContext.getOrCreate()
# str.split() with no argument splits on any run of whitespace and never
# yields empty strings, so repeated spaces, tabs, and blank lines in the
# input do not show up as bogus "" words in the count.
words = sc.textFile("wc_input.txt").flatMap(lambda line: line.split())
words.collect()

['PySpark',
 'is',
 'the',
 'python',
 'binding',
 'for',
 'the',
 'Spark',
 'Platform',
 'and',
 'API',
 'and',
 'not',
 'much',
 'different',
 'from',
 'the',
 'Java/Scala',
 'versions.',
 'A',
 'good',
 'starting',
 'point',
 'is',
 'the',
 'official',
 'page',
 'i.e',
 'Examples',
 '|',
 'Apache',
 'Spark.',
 'Python',
 'is',
 'dynamically',
 'typed,',
 'so',
 'RDDs',
 'can',
 'hold',
 'objects',
 'of',
 'multiple',
 'types.',
 'PySpark',
 'does',
 'not',
 'yet',
 'support',
 'a',
 'few',
 'API',
 'calls,',
 'such',
 'as',
 'lookup',
 'and',
 'non-text',
 'input',
 'files,',
 'though',
 'these',
 'will',
 'be',
 'added',
 'in',
 'future',
 'releases.']

Apply a map() transformation to the words to create a new RDD of (word, 1) pairs, so that each word starts with a count of 1.

In [29]:
# Pair every word with an initial count of 1, giving (word, 1) tuples.
wordCounts = words.map(
    lambda token: (token, 1)
)
wordCounts.collect()

[('PySpark', 1),
 ('is', 1),
 ('the', 1),
 ('python', 1),
 ('binding', 1),
 ('for', 1),
 ('the', 1),
 ('Spark', 1),
 ('Platform', 1),
 ('and', 1),
 ('API', 1),
 ('and', 1),
 ('not', 1),
 ('much', 1),
 ('different', 1),
 ('from', 1),
 ('the', 1),
 ('Java/Scala', 1),
 ('versions.', 1),
 ('A', 1),
 ('good', 1),
 ('starting', 1),
 ('point', 1),
 ('is', 1),
 ('the', 1),
 ('official', 1),
 ('page', 1),
 ('i.e', 1),
 ('Examples', 1),
 ('|', 1),
 ('Apache', 1),
 ('Spark.', 1),
 ('Python', 1),
 ('is', 1),
 ('dynamically', 1),
 ('typed,', 1),
 ('so', 1),
 ('RDDs', 1),
 ('can', 1),
 ('hold', 1),
 ('objects', 1),
 ('of', 1),
 ('multiple', 1),
 ('types.', 1),
 ('PySpark', 1),
 ('does', 1),
 ('not', 1),
 ('yet', 1),
 ('support', 1),
 ('a', 1),
 ('few', 1),
 ('API', 1),
 ('calls,', 1),
 ('such', 1),
 ('as', 1),
 ('lookup', 1),
 ('and', 1),
 ('non-text', 1),
 ('input', 1),
 ('files,', 1),
 ('though', 1),
 ('these', 1),
 ('will', 1),
 ('be', 1),
 ('added', 1),
 ('in', 1),
 ('future', 1),
 ('releases.', 1)]

reduceByKey() merges the values for each key using the given function. It performs the merging locally on each mapper before sending results to a reducer, similar to a “combiner” in MapReduce. The output is the total count for each unique word.

In [30]:
# Add up the counts of identical words: reduceByKey() combines all values
# that share a key using the supplied two-argument function.
wordCounts = wordCounts.reduceByKey(
    lambda left, right: left + right
)
wordCounts.collect()

[('PySpark', 2),
 ('is', 3),
 ('python', 1),
 ('binding', 1),
 ('Spark', 1),
 ('Platform', 1),
 ('different', 1),
 ('Java/Scala', 1),
 ('good', 1),
 ('starting', 1),
 ('point', 1),
 ('official', 1),
 ('page', 1),
 ('|', 1),
 ('Apache', 1),
 ('Spark.', 1),
 ('Python', 1),
 ('dynamically', 1),
 ('of', 1),
 ('multiple', 1),
 ('yet', 1),
 ('support', 1),
 ('as', 1),
 ('input', 1),
 ('files,', 1),
 ('though', 1),
 ('these', 1),
 ('added', 1),
 ('in', 1),
 ('the', 4),
 ('for', 1),
 ('and', 3),
 ('API', 2),
 ('not', 2),
 ('much', 1),
 ('from', 1),
 ('versions.', 1),
 ('A', 1),
 ('i.e', 1),
 ('Examples', 1),
 ('typed,', 1),
 ('so', 1),
 ('RDDs', 1),
 ('can', 1),
 ('hold', 1),
 ('objects', 1),
 ('types.', 1),
 ('does', 1),
 ('a', 1),
 ('few', 1),
 ('calls,', 1),
 ('such', 1),
 ('lookup', 1),
 ('non-text', 1),
 ('will', 1),
 ('be', 1),
 ('future', 1),
 ('releases.', 1)]