In [2]:
# Echo the `sc` object so the notebook shows it is available
# (presumably the SparkContext provided by the PySpark kernel; confirm).
sc

In [4]:
# Read the CSV as an RDD of raw text lines and mark it for caching; later cells reuse rdd0.
rdd0 = sc.textFile("Popular_Baby_Names.csv").cache()
# Split the first line on commas and pair each piece with its position to list the column indices.
list(enumerate(rdd0.first().split(',')))

[(0, 'Year of Birth'),
 (1, 'Gender'),
 (2, 'Ethnicity'),
 (3, "Child's First Name"),
 (4, 'Count'),
 (5, 'Rank')]

### 1) Map and FlatMap

In [12]:
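# map emits exactly one output per input row, so the header row comes back as None;
# to skip the header entirely, use mapPartitionsWithIndex instead.
# flatMap differs from map in that it flattens the returned iterables into the output.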
def func1(row):
    """Return the (year, count) string pair for a data row, or None for the header.

    The row is split on plain commas; fields 0 and 4 are returned unparsed.
    A row counts as the header when field 0 is 'Year of Birth' or field 4 is 'Count'.
    """
    parts = row.split(",")
    looks_like_header = parts[0] == 'Year of Birth' or parts[4] == 'Count'
    if looks_like_header:
        return None
    return (parts[0], parts[4])
# Apply func1 to every line of rdd0 and fetch the first ten results.
rdd0.map(func1).take(10)

[None,
 ('2016', '172'),
 ('2016', '112'),
 ('2016', '104'),
 ('2016', '99'),
 ('2016', '99'),
 ('2016', '79'),
 ('2016', '59'),
 ('2016', '57'),
 ('2016', '56')]

### 2) mapPartitionsWithIndex

In [15]:
def func1(index,rows):
    """Yield (year, count) string pairs from one partition's CSV lines.

    Args:
        index: partition number supplied by mapPartitionsWithIndex.
        rows: iterator over the partition's raw text lines.

    Yields:
        (field[0], field[4]) for every parsed CSV row, both left as strings.
    """
    import csv  # kept local to the function, as in the original cell
    if index == 0:
        # The first line of partition 0 is treated as the header and dropped.
        # The default argument keeps an empty partition from raising StopIteration
        # inside this generator, which Python 3.7+ converts into a RuntimeError.
        next(rows, None)
    for field in csv.reader(rows):
        yield (field[0], field[4])
# Run func1 once per partition (it receives the partition index) and fetch the first ten results.
rdd0.mapPartitionsWithIndex(func1).take(10)

[('2016', '172'),
 ('2016', '112'),
 ('2016', '104'),
 ('2016', '99'),
 ('2016', '99'),
 ('2016', '79'),
 ('2016', '59'),
 ('2016', '57'),
 ('2016', '56'),
 ('2016', '56')]

In [45]:
# Number of partitions rdd0 is split into.
rdd0.getNumPartitions()

2

### 3) Filter

In [21]:
### Get the rank 1 names for all the years
def func1(index,rows):
    """Yield (year, (name, rank)) records from one partition's CSV lines.

    Args:
        index: partition number supplied by mapPartitionsWithIndex.
        rows: iterator over the partition's raw text lines.

    Yields:
        (fields[0], (fields[3], fields[5])) for every parsed CSV row; all values stay strings.
    """
    import csv  # kept local to the function, as in the original cell
    if index == 0:
        # Drop the first line of partition 0 (treated as the header).  The default
        # argument avoids a StopIteration -> RuntimeError (PEP 479) on an empty partition.
        next(rows, None)
    for fields in csv.reader(rows):
        yield (fields[0], (fields[3], fields[5]))
# Keep only the records whose rank field equals the string '1' and collect them to the driver.
rdd0.mapPartitionsWithIndex(func1).filter(lambda x :x[1][1] == '1').collect()

[('2016', ('Olivia', '1')),
 ('2016', ('Ava', '1')),
 ('2016', ('Isabella', '1')),
 ('2016', ('Olivia', '1')),
 ('2016', ('Ethan', '1')),
 ('2016', ('Noah', '1')),
 ('2016', ('Liam', '1')),
 ('2016', ('Joseph', '1')),
 ('2015', ('Olivia', '1')),
 ('2015', ('Madison', '1')),
 ('2015', ('Isabella', '1')),
 ('2015', ('Emma', '1')),
 ('2015', ('Olivia', '1')),
 ('2015', ('Jayden', '1')),
 ('2015', ('Noah', '1')),
 ('2015', ('Liam', '1')),
 ('2015', ('David', '1')),
 ('2014', ('Liam', '1')),
 ('2013', ('Olivia', '1')),
 ('2013', ('David', '1')),
 ('2012', ('CHLOE', '1')),
 ('2012', ('EMMA', '1')),
 ('2012', ('RYAN', '1')),
 ('2012', ('JOSEPH', '1')),
 ('2011', ('SOPHIA', '1')),
 ('2011', ('MADISON', '1')),
 ('2011', ('ISABELLA', '1')),
 ('2011', ('ESTHER', '1')),
 ('2011', ('ETHAN', '1')),
 ('2011', ('JAYDEN', '1')),
 ('2011', ('MICHAEL', '1'))]

### 4) GroupBy
The “groupBy” transformation groups the elements of the original RDD. It produces a set of key-value pairs in which each key is the output of a user-supplied function and the value is the collection of all items for which the function returns that key.

In [31]:
## Group names by first three letters
def func1(index,rows):
    """Yield the first-name field from each CSV line of one partition.

    Args:
        index: partition number supplied by mapPartitionsWithIndex.
        rows: iterator over the partition's raw text lines.

    Yields:
        fields[3] (a string) for every parsed CSV row.
    """
    import csv  # kept local to the function, as in the original cell
    if index == 0:
        # Drop the first line of partition 0 (treated as the header).  The default
        # argument avoids a StopIteration -> RuntimeError (PEP 479) on an empty partition.
        next(rows, None)
    for fields in csv.reader(rows):
        yield fields[3]
# Group the names under the key x[0:3], i.e. their first three characters.
rdd1= rdd0.mapPartitionsWithIndex(func1).groupBy(lambda x :x[0:3])

In [32]:
# Turn each group's values into a list so the names themselves are printed.
print([(k, list(v)) for (k, v) in rdd1.take(10)])

[('EIT', ['EITAN', 'EITAN']), ('LEE', ['LEELA']), ('GUS', ['GUSTAVO', 'GUSTAVO']), ('KEI', ['KEIRA', 'KEITH', 'KEILY', 'KEITH']), ('MAS', ['MASON', 'MASON', 'MASON', 'MASON', 'MASON', 'MASON', 'MASON', 'MASON']), ('Das', ['Dashiell']), ('KIE', ['KIERAN', 'KIERAN']), ('Ada', ['Ada', 'Adam', 'Adam', 'Adam', 'Adam', 'Ada', 'Adam', 'Adam', 'Adam', 'Adam', 'Ada', 'Adam', 'Adam', 'Adan', 'Adam', 'Ada', 'Ada', 'Adam', 'Adam', 'Adan', 'Adam']), ('HAD', ['HADLEY', 'HADASSAH', 'HADASSAH', 'HADASSA']), ('Cad', ['Caden', 'Caden', 'Caden'])]


### 5) ReduceByKey/GroupByKey 
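reduceByKey shuffles less data than groupByKey because it combines the values for each key within every partition before the shuffle.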

In [None]:
## Get the average count for a name over all the years 
def func1(index,rows):
    """Yield (name, count) pairs from one partition's CSV lines.

    Args:
        index: partition number supplied by mapPartitionsWithIndex.
        rows: iterator over the partition's raw text lines.

    Yields:
        (fields[3], int(fields[4])) for every parsed CSV row.

    Raises:
        ValueError: if a row's count field cannot be converted with int().
    """
    import csv  # kept local to the function, as in the original cell
    if index == 0:
        # Drop the first line of partition 0 (treated as the header).  The default
        # argument avoids a StopIteration -> RuntimeError (PEP 479) on an empty partition.
        next(rows, None)
    for fields in csv.reader(rows):
        yield (fields[3], int(fields[4]))
def func2(values):
    """Return the arithmetic mean of ``values``.

    ``values`` must support both iteration and ``len()`` and must not be empty.
    """
    total = sum(values)
    return total / len(values)
# Average count per name via groupByKey, then swap each pair to (average, name) so that
# sortByKey(False) orders by average, descending.  take(10) returns a plain Python list.
rdd1 = (
    rdd0.mapPartitionsWithIndex(func1)
    .groupByKey()
    .mapValues(func2)
    .map(lambda pair: (pair[1], pair[0]))
    .sortByKey(False)
    .take(10)
)

In [40]:
# Despite its name, rdd1 holds the list returned by take(10) at this point, not an RDD.
rdd1

[(251.66666666666666, 'Moshe'),
 (231.0, 'MOSHE'),
 (178.36363636363637, 'Ethan'),
 (177.25, 'Chaya'),
 (176.0, 'CHAYA'),
 (174.5, 'CAMILA'),
 (165.33333333333334, 'Miriam'),
 (163.14285714285714, 'ETHAN'),
 (159.07142857142858, 'Liam'),
 (158.0, 'CHAIM')]

In [42]:
def func1(index,rows):
    """Yield (name, (count, 1)) records from one partition's CSV lines.

    The trailing 1 lets a later reduce step add up both the counts and the number
    of records per name, so an average can be derived from the two sums.

    Args:
        index: partition number supplied by mapPartitionsWithIndex.
        rows: iterator over the partition's raw text lines.

    Yields:
        (fields[3], (int(fields[4]), 1)) for every parsed CSV row.

    Raises:
        ValueError: if a row's count field cannot be converted with int().
    """
    import csv  # kept local to the function, as in the original cell
    if index == 0:
        # Drop the first line of partition 0 (treated as the header).  The default
        # argument avoids a StopIteration -> RuntimeError (PEP 479) on an empty partition.
        next(rows, None)
    for fields in csv.reader(rows):
        yield (fields[3], (int(fields[4]), 1))
# Sum (count, 1) pairs per name, divide total count by number of records to get the
# average, then swap to (average, name) so sortByKey(False) orders by average, descending.
# take(10) returns a plain Python list.
rdd1 = (
    rdd0.mapPartitionsWithIndex(func1)
    .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
    .mapValues(lambda sums: sums[0] / sums[1])
    .map(lambda pair: (pair[1], pair[0]))
    .sortByKey(False)
    .take(10)
)

In [43]:
# Despite its name, rdd1 holds the list returned by take(10) at this point, not an RDD.
rdd1

[(251.66666666666666, 'Moshe'),
 (231.0, 'MOSHE'),
 (178.36363636363637, 'Ethan'),
 (177.25, 'Chaya'),
 (176.0, 'CHAYA'),
 (174.5, 'CAMILA'),
 (165.33333333333334, 'Miriam'),
 (163.14285714285714, 'ETHAN'),
 (159.07142857142858, 'Liam'),
 (158.0, 'CHAIM')]

### 6) Distinct

In [44]:
# NOTE(review): func1 is whichever definition ran last in the kernel. Judging by the
# execution counts that is the In [42] version yielding (name, (count, 1)), so this counts
# distinct (name, (count, 1)) records rather than distinct names; confirm that is intended.
rdd0.mapPartitionsWithIndex(func1).distinct().count()

10266

### 7) Action: Reduce

In [46]:
# range(1,100) covers 1 through 99 (the upper bound is exclusive).
num_rdd = sc.parallelize(range(1,100))
# Fold the elements pairwise with addition to get their sum.
num_rdd.reduce(lambda x,y: x+y)

4950

### 8) Min and Max

In [50]:
def func1(index,rows):
    """Yield the rank field, as an int, from each CSV line of one partition.

    Args:
        index: partition number supplied by mapPartitionsWithIndex.
        rows: iterator over the partition's raw text lines.

    Yields:
        int(fields[5]) for every parsed CSV row.

    Raises:
        ValueError: if a row's rank field cannot be converted with int().
    """
    import csv  # kept local to the function, as in the original cell
    if index == 0:
        # Drop the first line of partition 0 (treated as the header).  The default
        # argument avoids a StopIteration -> RuntimeError (PEP 479) on an empty partition.
        next(rows, None)
    for fields in csv.reader(rows):
        yield int(fields[5])
# RDD of the integer values produced by the func1 defined just above (field 5 of each row).
rdd1= rdd0.mapPartitionsWithIndex(func1)

In [51]:
# Largest value in rdd1.
rdd1.max()

102

In [52]:
# Smallest value in rdd1.
rdd1.min()

1

### DataFrames

In [53]:
# Load the same CSV as a DataFrame: the first line supplies the column names and the
# column types are inferred from the data.
df = spark.read.csv('Popular_Baby_Names.csv', header=True, inferSchema=True)
# Print the leading rows as a text table.
df.show()

+-------------+------+--------------------+------------------+-----+----+
|Year of Birth|Gender|           Ethnicity|Child's First Name|Count|Rank|
+-------------+------+--------------------+------------------+-----+----+
|         2016|FEMALE|ASIAN AND PACIFIC...|            Olivia|  172|   1|
|         2016|FEMALE|ASIAN AND PACIFIC...|             Chloe|  112|   2|
|         2016|FEMALE|ASIAN AND PACIFIC...|            Sophia|  104|   3|
|         2016|FEMALE|ASIAN AND PACIFIC...|             Emily|   99|   4|
|         2016|FEMALE|ASIAN AND PACIFIC...|              Emma|   99|   4|
|         2016|FEMALE|ASIAN AND PACIFIC...|               Mia|   79|   5|
|         2016|FEMALE|ASIAN AND PACIFIC...|         Charlotte|   59|   6|
|         2016|FEMALE|ASIAN AND PACIFIC...|             Sarah|   57|   7|
|         2016|FEMALE|ASIAN AND PACIFIC...|            Hannah|   56|   8|
|         2016|FEMALE|ASIAN AND PACIFIC...|          Isabella|   56|   8|
|         2016|FEMALE|ASIAN AND PACIFI