### Start SparkContext Instance

In [1]:
# Find path to PySpark.
import findspark
findspark.init()

# Import PySpark and initialize SparkContext object.
import pyspark
sc = pyspark.SparkContext()

### Import Raw Data

In [2]:
raw_hamlet = sc.textFile("hamlet.txt")
split_hamlet = raw_hamlet.map(lambda line: line.split('\t'))
split_hamlet.take(5)

[['hamlet@0', '', 'HAMLET'],
 ['hamlet@8'],
 ['hamlet@9'],
 ['hamlet@10', '', 'DRAMATIS PERSONAE'],
 ['hamlet@29']]

### Strip Prefixes

In [11]:
def format_id(x):
    id = x[0].split('@')[1]
    results = list()
    results.append(id)
    if len(x) > 1:
        for y in x[1:]:
            results.append(y)
    return results

hamlet_with_ids = split_hamlet.map(lambda line: format_id(line))
hamlet_with_ids.take(10)

[['0', '', 'HAMLET'],
 ['8'],
 ['9'],
 ['10', '', 'DRAMATIS PERSONAE'],
 ['29'],
 ['30'],
 ['31', 'CLAUDIUS', 'king of Denmark. (KING CLAUDIUS:)'],
 ['74'],
 ['75', 'HAMLET', 'son to the late, and nephew to the present king.'],
 ['131']]

### Remove Blanks and Pipes

In [19]:
def rm_blanks(x):
    if '|' in x:
        x.remove('|')
    if '' in x:
        x.remove('')
    if len(x)!=1:
        yield(x)
        
hamlet_text_only = hamlet_with_ids.flatMap(rm_blanks)
hamlet_text_only.take(10)

[['0', 'HAMLET'],
 ['10', 'DRAMATIS PERSONAE'],
 ['31', 'CLAUDIUS', 'king of Denmark. (KING CLAUDIUS:)'],
 ['75', 'HAMLET', 'son to the late, and nephew to the present king.'],
 ['132', 'POLONIUS', 'lord chamberlain. (LORD POLONIUS:)'],
 ['177', 'HORATIO', 'friend to Hamlet.'],
 ['204', 'LAERTES', 'son to Polonius.'],
 ['230', 'LUCIANUS', 'nephew to the king.'],
 ['261', 'VOLTIMAND'],
 ['276', 'CORNELIUS']]

In [16]:
# OR

In [21]:
hamlet_with_ids.take(5)
real_text = hamlet_with_ids.filter(lambda line: len(line) > 1)
hamlet_text_only = real_text.map(lambda line: [l for l in line if (l != '') and (l != '|')])
hamlet_text_only.take(100)

[['0', 'HAMLET'],
 ['10', 'DRAMATIS PERSONAE'],
 ['31', 'CLAUDIUS', 'king of Denmark. (KING CLAUDIUS:)'],
 ['75', 'HAMLET', 'son to the late, and nephew to the present king.'],
 ['132', 'POLONIUS', 'lord chamberlain. (LORD POLONIUS:)'],
 ['177', 'HORATIO', 'friend to Hamlet.'],
 ['204', 'LAERTES', 'son to Polonius.'],
 ['230', 'LUCIANUS', 'nephew to the king.'],
 ['261', 'VOLTIMAND'],
 ['273'],
 ['276', 'CORNELIUS'],
 ['288'],
 ['291', 'ROSENCRANTZ', '|  courtiers.'],
 ['317'],
 ['320', 'GUILDENSTERN'],
 ['335'],
 ['338', 'OSRIC'],
 ['348', 'A Gentleman, (Gentlemen:)'],
 ['376', 'A Priest. (First Priest:)'],
 ['405', 'MARCELLUS'],
 ['417', '|  officers.'],
 ['431', 'BERNARDO'],
 ['444', 'FRANCISCO', 'a soldier.'],
 ['466', 'REYNALDO', 'servant to Polonius.'],
 ['496', 'Players.'],
 ['506', '(First Player:)'],
 ['523', '(Player King:)'],
 ['539', '(Player Queen:)'],
 ['557', 'Two Clowns, grave-diggers.'],
 ['585', '(First Clown:)'],
 ['601', '(Second Clown:)'],
 ['619', 'FORTINBRAS', 'p