In [18]:
import pandas as pd

In [21]:
# Use Pandas to visualize the data
pd.read_csv("./baby-names.csv").head(3)

Unnamed: 0,year,name,prop,sex,soundex
0,1880,John,0.081541,boy,J500
1,1880,William,0.080511,boy,W450
2,1880,James,0.050057,boy,J520


> *For Big Data, we may have problems bringing in the data to the system. Here's where `Apache Spark` comes in.*

# Create new RDD 

In [1]:
# Create new RDD
baby_names = sc.textFile("./baby-names.csv")
baby_names

MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:-2

# Spark Transformation Concepts
Another new RDD created by calling transformation `map`

In [2]:
# Another new RDD created by calling transformation map
# Transform the baby_names RDD into rows RDD by applying function `map`.

rows = baby_names.map(lambda line: line.split(","))
rows.cache()

PythonRDD[2] at RDD at PythonRDD.scala:43

In [20]:
# Loop over the data and print the rows as => FirstName (1980)
for row in rows.take(rows.count()): 
    print("{}, ({})".format(row[1],row[0]))

"name", ("year")
"John", (1880)
"William", (1880)
"James", (1880)
"Charles", (1880)
"George", (1880)
"Frank", (1880)
"Joseph", (1880)
"Thomas", (1880)
"Henry", (1880)
"Robert", (1880)
"Edward", (1880)
"Harry", (1880)
"Walter", (1880)
"Arthur", (1880)
"Fred", (1880)
"Albert", (1880)
"Samuel", (1880)
"David", (1880)
"Louis", (1880)
"Joe", (1880)
"Charlie", (1880)
"Clarence", (1880)
"Richard", (1880)
"Andrew", (1880)
"Daniel", (1880)
"Ernest", (1880)
"Will", (1880)
"Jesse", (1880)
"Oscar", (1880)
"Lewis", (1880)
"Peter", (1880)
"Benjamin", (1880)
"Frederick", (1880)
"Willie", (1880)
"Alfred", (1880)
"Sam", (1880)
"Roy", (1880)
"Herbert", (1880)
"Jacob", (1880)
"Tom", (1880)
"Elmer", (1880)
"Carl", (1880)
"Lee", (1880)
"Howard", (1880)
"Martin", (1880)
"Michael", (1880)
"Bert", (1880)
"Herman", (1880)
"Jim", (1880)
"Francis", (1880)
"Harvey", (1880)
"Earl", (1880)
"Eugene", (1880)
"Ralph", (1880)
"Ed", (1880)
"Claude", (1880)
"Edwin", (1880)
"Ben", (1880)
"Charley", (1880)
"Paul", (1880)
"

# Actions

* `Actions` aggregates all the `RDD` elements. Sunch functions include `reduce()`, `collect()`
* It returns final result to the driver program

In [21]:
sc.parallelize([2, 3, 4]).flatMap(lambda x: [x,x,x]).collect()

[2, 2, 2, 3, 3, 3, 4, 4, 4]

In [22]:
sc.parallelize([2, 3, 4]).map(lambda x: [x,x,x]).collect()

[[2, 2, 2], [3, 3, 3], [4, 4, 4]]

## `Filter()`

In [23]:
rows.filter(lambda line: "Bernard" in line).collect()

[]

## Map Partitions
> Way to partition operations into the certain number of clusters for parallelize operations.

Example:
Create numbers 1 ~ 9. Distribute the numbers over three clusters (partitions) and sum each distribution.
* Partition 1: 1+2+3 = 6
* Partition 2: 4+5+6 = 15
* Partition 3: 7+8+9 = 24

In [26]:
# Create numbers 
one_thru_9 = range(1,10)

# Distribute the numbers over 3 partitions/clusters
parallel = sc.parallelize(one_thru_9, 3)

# Define a function that compute the addition over each cluster
def add_numbers(iterator):
    yield sum(iterator)
    
# Call the function over a parallel cluster
parallel.mapPartitions(add_numbers).collect()

[6, 15, 24]

In [32]:
# If we do not specify the number of clusters we have:
parallel = sc.parallelize(one_thru_9)

# We check how many partitions/clusters (4)
# 1+2 = 3,    3+4 = 7,   5+6 = 11,   7+8+9 = 24

parallel.mapPartitions(add_numbers).collect()


[3, 7, 11, 24]

In [34]:
# We have dual cores or 4 partitions
print(sc.defaultParallelism)

4


In [38]:
one = sc.parallelize(range(1,10))
print(one.collect())

two = sc.parallelize(range(10,21))
print(two.collect())

[1, 2, 3, 4, 5, 6, 7, 8, 9]
[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]


In [40]:
# Find Union of both
one.union(two).collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

In [49]:
# Find intersection
print(one.collect())
three = sc.parallelize(range(5,15))
print(three.collect())

one.intersection(three).collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9]
[5, 6, 7, 8, 9, 10, 11, 12, 13, 14]


[8, 9, 5, 6, 7]

In [50]:
# Find distinct values
one.union(three).distinct().collect()

[8, 1, 9, 2, 10, 11, 3, 12, 4, 5, 13, 14, 6, 7]

In [51]:
rows.collect()

[[u'"year"', u'"name"', u'"prop"', u'"sex"', u'"soundex"'],
 [u'1880', u'"John"', u'0.081541', u'"boy"', u'"J500"'],
 [u'1880', u'"William"', u'0.080511', u'"boy"', u'"W450"'],
 [u'1880', u'"James"', u'0.050057', u'"boy"', u'"J520"'],
 [u'1880', u'"Charles"', u'0.045167', u'"boy"', u'"C642"'],
 [u'1880', u'"George"', u'0.043292', u'"boy"', u'"G620"'],
 [u'1880', u'"Frank"', u'0.02738', u'"boy"', u'"F652"'],
 [u'1880', u'"Joseph"', u'0.022229', u'"boy"', u'"J210"'],
 [u'1880', u'"Thomas"', u'0.021401', u'"boy"', u'"T520"'],
 [u'1880', u'"Henry"', u'0.020641', u'"boy"', u'"H560"'],
 [u'1880', u'"Robert"', u'0.020404', u'"boy"', u'"R163"'],
 [u'1880', u'"Edward"', u'0.019965', u'"boy"', u'"E363"'],
 [u'1880', u'"Harry"', u'0.018175', u'"boy"', u'"H600"'],
 [u'1880', u'"Walter"', u'0.014822', u'"boy"', u'"W436"'],
 [u'1880', u'"Arthur"', u'0.013504', u'"boy"', u'"A636"'],
 [u'1880', u'"Fred"', u'0.013251', u'"boy"', u'"F630"'],
 [u'1880', u'"Albert"', u'0.012609', u'"boy"', u'"A416"'],
 [u