In [37]:
%pylab inline
import seaborn as sns

Populating the interactive namespace from numpy and matplotlib


# In this notebook, we'll explore a few methods in Spark Core
### Creating RDDs, collect(), map(), join(), reduceByKey(), aggregateByKey(), and filter()

## start with an example

### We'll start with an online-dating dataset, described here: https://sites.google.com/a/insightdatascience.com/spark-lab/s3-data/dating-profiles

### Here we create an RDD from a csv stored in s3, and use the collect() action, which returns an array


In [1]:
#a csv of id, users
rawUsersRDD = sc.textFile("s3n://insight-spark-after-dark/users-sm.csv")
rawUsersRDD.collect()

[u'10001,Tony',
 u'10002,Mike',
 u'10003,Pat',
 u'10004,Chris',
 u'10005,Paco',
 u'10006,Eddie',
 u'90001,Lisa',
 u'90002,Cindy',
 u'90003,Paula',
 u'90004,Leslie',
 u'90005,Allman',
 u'90006,Kimberly']

### The collect action causes data to flow across the network from the worker nodes to the master (where you are running the jupyter notebook, or your data analysis)


In [2]:
#a csv of id, genders
rawGendersRDD = sc.textFile("s3n://insight-spark-after-dark/gender-sm.csv")
rawGendersRDD.collect()

[u'10001,M',
 u'10002,M',
 u'10003,M',
 u'10004,M',
 u'10005,M',
 u'10006,M',
 u'90001,F',
 u'90002,F',
 u'90003,F',
 u'90004,F',
 u'90005,F',
 u'90006,F']

In [3]:
def rec_tup(record):
    tokens = record.split(",")
    return (int(tokens[0]), str(tokens[1]))

### Moving toward a join action, we can use the map() method to create Key/Value pairs from the tuple

In [4]:
usersRDD = rawUsersRDD.map(rec_tup)
usersRDD.collect()

[(10001, 'Tony'),
 (10002, 'Mike'),
 (10003, 'Pat'),
 (10004, 'Chris'),
 (10005, 'Paco'),
 (10006, 'Eddie'),
 (90001, 'Lisa'),
 (90002, 'Cindy'),
 (90003, 'Paula'),
 (90004, 'Leslie'),
 (90005, 'Allman'),
 (90006, 'Kimberly')]

In [5]:
gendersRDD = rawGendersRDD.map(rec_tup)
gendersRDD.collect()

[(10001, 'M'),
 (10002, 'M'),
 (10003, 'M'),
 (10004, 'M'),
 (10005, 'M'),
 (10006, 'M'),
 (90001, 'F'),
 (90002, 'F'),
 (90003, 'F'),
 (90004, 'F'),
 (90005, 'F'),
 (90006, 'F')]

### Now that we have two RDDs with Key/Value pairs, use the join method to join the RDDs based on the Key

In [6]:
usersWithGenderJoinedRDD = usersRDD.join(gendersRDD)
usersWithGenderJoinedRDD.collect()

[(10004, ('Chris', 'M')),
 (90004, ('Leslie', 'F')),
 (90005, ('Allman', 'F')),
 (90001, ('Lisa', 'F')),
 (10005, ('Paco', 'M')),
 (10001, ('Tony', 'M')),
 (90006, ('Kimberly', 'F')),
 (10002, ('Mike', 'M')),
 (90002, ('Cindy', 'F')),
 (10006, ('Eddie', 'M')),
 (10003, ('Pat', 'M')),
 (90003, ('Paula', 'F'))]

# Next Steps

Here are some further questions to get to grips with Spark basics

### Task 1: There exists another zipped csv, s3n://insight-spark-after-dark/gender.csv.gz. Create an RDD called genders_raw

In [13]:
genders_raw = sc.textFile("s3n://insight-spark-after-dark/gender.csv.gz")
genders_raw.take(5)

[u'1,F', u'2,F', u'3,U', u'4,F', u'5,F']

### Task 2: Parse each of the record  from the genderRDDD so that we have a tuple of (gender(string), id(int)) and show 


In [51]:
genders = genders_raw.map(rec_tup)
genders.take(5)

[(1, 'F'), (2, 'F'), (3, 'U'), (4, 'F'), (5, 'F')]

### Task 3: Calculate the distribution of Males, Females, and Unknowns and show the results


Nahiyan's solution. key=gender, value=True, then will sum

In [32]:
genderBool = lambda x: (x[1], 1)

In [33]:
[(g[0], sum(g[1]))
 for g in genders.map(genderBool)\
                .groupByKey().collect()]

[('U', 83164), ('M', 76441), ('F', 61365)]

Nahiyan used reduceByKey - smart

In [43]:
counts = genders.map(genderBool)\
        .reduceByKey(lambda a,b: a+b)\
        .collect()

d_counts = {k:v for k,v in counts}
d_counts

{'F': 61365, 'M': 76441, 'U': 83164}

### Task 4: Calculate the sum of all the IDs for each gender


In [45]:
swap = lambda x: (x[1], x[0])

sums = genders.map(swap).reduceByKey(lambda a,b: a+b).collect()
d_sums = {k:v for k,v in sums}
d_sums

{'F': 4325473098, 'M': 6982853010, 'U': 13105654827}

### Task 5: Calculate the average of all the IDs for each gender


In [47]:
avgs = {}
for gender, count in d_counts.items():
    avgs[gender] = d_sums[gender] / count

avgs

{'F': 70487, 'M': 91349, 'U': 157588}

### Task 6: Only return records from parsed_rdd which have ids that are a multiple of 5 and show the first 5 results


In [53]:
genders.filter(lambda x: (x[0] % 5)==0).collect()

[(5, 'F'),
 (10, 'M'),
 (15, 'U'),
 (20, 'M'),
 (25, 'F'),
 (30, 'M'),
 (35, 'F'),
 (40, 'F'),
 (45, 'M'),
 (50, 'F'),
 (55, 'F'),
 (60, 'F'),
 (65, 'F'),
 (70, 'M'),
 (75, 'F'),
 (80, 'M'),
 (85, 'M'),
 (90, 'F'),
 (95, 'M'),
 (100, 'F'),
 (105, 'F'),
 (110, 'F'),
 (115, 'F'),
 (120, 'M'),
 (125, 'U'),
 (130, 'M'),
 (135, 'F'),
 (140, 'F'),
 (145, 'F'),
 (150, 'M'),
 (155, 'U'),
 (160, 'F'),
 (165, 'F'),
 (170, 'M'),
 (175, 'M'),
 (180, 'F'),
 (185, 'F'),
 (190, 'F'),
 (195, 'F'),
 (200, 'M'),
 (205, 'M'),
 (210, 'F'),
 (215, 'F'),
 (220, 'F'),
 (225, 'M'),
 (230, 'M'),
 (235, 'U'),
 (240, 'M'),
 (245, 'M'),
 (250, 'M'),
 (255, 'M'),
 (260, 'F'),
 (265, 'M'),
 (270, 'M'),
 (275, 'F'),
 (280, 'F'),
 (285, 'M'),
 (290, 'F'),
 (295, 'M'),
 (300, 'M'),
 (305, 'U'),
 (310, 'F'),
 (315, 'M'),
 (320, 'F'),
 (325, 'M'),
 (330, 'F'),
 (335, 'M'),
 (340, 'F'),
 (345, 'M'),
 (350, 'U'),
 (355, 'M'),
 (360, 'M'),
 (365, 'M'),
 (370, 'F'),
 (375, 'M'),
 (380, 'F'),
 (385, 'F'),
 (390, 'M'),
 (395,

what happens if I forgot that columns were swapped?

In [54]:
genders.filter(lambda x: (x[1] % 5)==0).collect()

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 48.0 failed 4 times, most recent failure: Lost task 0.3 in stage 48.0 (TID 64, 172.31.33.144): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "<ipython-input-54-49e2b6ef7599>", line 1, in <lambda>
TypeError: not all arguments converted during string formatting

	at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:138)
	at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:179)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:97)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
	at org.apache.spark.scheduler.Task.run(Task.scala:70)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1264)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1263)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1263)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
	at scala.Option.foreach(Option.scala:236)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1457)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1418)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
