### Instantiate SparkContext

In [1]:
# Find path to PySpark.
import findspark
findspark.init()

# Import PySpark and obtain the SparkContext object.
# getOrCreate() reuses an already-running context, so re-running this cell
# in the same kernel does not fail with
# "Cannot run multiple SparkContexts at once".
import pyspark
sc = pyspark.SparkContext.getOrCreate()

### Reading in .txt to RDD

In [3]:
# Read the text file into an RDD; each element is one line of the file.
raw_hamlet = sc.textFile("hamlet.txt")
# Preview the first 5 elements.
raw_hamlet.take(5)

['hamlet@0\t\tHAMLET',
 'hamlet@8',
 'hamlet@9',
 'hamlet@10\t\tDRAMATIS PERSONAE',
 'hamlet@29']

### Splitting on Delimiter into Columns

In [4]:
# Split every line on tab characters so each element becomes a list of fields.
split_hamlet = raw_hamlet.map(lambda line: line.split('\t'))
# Preview the first 5 split rows.
split_hamlet.take(5)

[['hamlet@0', '', 'HAMLET'],
 ['hamlet@8'],
 ['hamlet@9'],
 ['hamlet@10', '', 'DRAMATIS PERSONAE'],
 ['hamlet@29']]

### flatMap()
* flatMap() doesn't require an output for every single row in the RDD.
* The flatMap() method is useful whenever we want to generate a custom sequence of values from an RDD.
* The function passed into flatMap() need not produce a value for every input; it may yield zero or more values per row.

In [5]:
def hamlet_speaks(line):
    """Yield an ``(id, message)`` pair when "HAMLET" is found in ``line``.

    Args:
        line: One split row; its first item is used as the row identifier.

    Yields:
        ``(line[0], "hamlet speaketh!")`` once if ``"HAMLET" in line`` is
        true; otherwise nothing is yielded.
    """
    line_id = line[0]

    # A generator may finish without yielding, which is how rows with no
    # match produce no output at all under flatMap().
    if "HAMLET" not in line:
        return

    yield line_id, "hamlet speaketh!"

# Pass the generator function directly; flatMap() flattens whatever each
# call yields into a single RDD.
hamlet_spoken = split_hamlet.flatMap(hamlet_speaks)
hamlet_spoken.take(10)

[('hamlet@0', 'hamlet speaketh!'),
 ('hamlet@75', 'hamlet speaketh!'),
 ('hamlet@1004', 'hamlet speaketh!'),
 ('hamlet@9144', 'hamlet speaketh!'),
 ('hamlet@12313', 'hamlet speaketh!'),
 ('hamlet@12434', 'hamlet speaketh!'),
 ('hamlet@12760', 'hamlet speaketh!'),
 ('hamlet@12858', 'hamlet speaketh!'),
 ('hamlet@14821', 'hamlet speaketh!'),
 ('hamlet@15261', 'hamlet speaketh!')]

### filter()
* The function passed into filter() __must__ always return a value: either `True` or `False`.

In [7]:
def filter_hamlet_speaks(line):
    """Return True when "HAMLET" is found in ``line``, otherwise False.

    Args:
        line: One split row to test for membership of "HAMLET".
    """
    # The membership test already evaluates to a bool, so return it as is.
    return "HAMLET" in line

# Keep only the rows for which the predicate returns True.
hamlet_spoken_lines = split_hamlet.filter(filter_hamlet_speaks)
hamlet_spoken_lines.take(5)

[['hamlet@0', '', 'HAMLET'],
 ['hamlet@75', 'HAMLET', 'son to the late, and nephew to the present king.'],
 ['hamlet@1004', '', 'HAMLET'],
 ['hamlet@9144', '', 'HAMLET'],
 ['hamlet@12313',
  'HAMLET',
  '[Aside]  A little more than kin, and less than kind.']]

### count()

In [9]:
# Count how many rows survived the filter.
spoken_count = hamlet_spoken_lines.count()
print(spoken_count)

381


### collect()
* Get the entire RDD as a Python list

In [10]:
# Pull every element of the filtered RDD into a local variable.
spoken_all = hamlet_spoken_lines.collect()

In [12]:
# Inspect the Python type of the collected result.
type(spoken_all)

list

In [16]:
spoken_all[29:39] # 30th through 39th elements (indices 29-38) of the collected list

[['hamlet@20104', 'HAMLET', 'Pale or red?'],
 ['hamlet@20150', 'HAMLET', "                  And fix'd his eyes upon you?"],
 ['hamlet@20231', 'HAMLET', '                  I would I had been there.'],
 ['hamlet@20323', 'HAMLET', "Very like, very like. Stay'd it long?"],
 ['hamlet@20502', 'HAMLET', '                  His beard was grizzled--no?'],
 ['hamlet@20623', 'HAMLET', '                  I will watch to-night;'],
 ['hamlet@20730', 'HAMLET', "If it assume my noble father's person,"],
 ['hamlet@21197', 'HAMLET', 'Your loves, as mine to you: farewell.'],
 ['hamlet@21486', '', 'HAMLET'],
 ['hamlet@27782', '', 'HAMLET']]