# Narrow Transformation

# Map

In [1]:
# Read the CSV as an RDD with one string element per line of the file.
# `sc` is not defined in this file; presumably the SparkContext supplied by the notebook session.
baby_names = sc.textFile("baby_names.csv")

In [2]:
# Split every CSV line on commas so each record becomes a list of string fields.
rows = baby_names.map(lambda record: record.split(','))

In [3]:
# Fetch every record as a Python list; the header line comes back as the first record.
rows.collect()

[['State', 'Gender', 'Year', 'Name', 'Frequency'],
 ['CA', 'female', '2012', 'Sophia', '3643'],
 ['CA', 'female', '2011', 'Sophia', '3566'],
 ['CA', 'female', '2013', 'Sophia', '3465'],
 ['CA', 'female', '2010', 'Isabella', '3366'],
 ['CA', 'female', '2010', 'Sophia', '3360'],
 ['CA', 'male', '2010', 'Jacob', '3351'],
 ['CA', 'female', '2014', 'Sophia', '3179'],
 ['CA', 'male', '2011', 'Jacob', '3171'],
 ['CA', 'male', '2010', 'Daniel', '3133'],
 ['CA', 'female', '2012', 'Isabella', '3109'],
 ['CA', 'female', '2011', 'Isabella', '3060'],
 ['CA', 'male', '2012', 'Jacob', '2978'],
 ['CA', 'female', '2015', 'Sophia', '2954'],
 ['CA', 'male', '2011', 'Daniel', '2907'],
 ['CA', 'male', '2013', 'Jacob', '2887'],
 ['CA', 'male', '2012', 'Jayden', '2868'],
 ['CA', 'female', '2015', 'Mia', '2861'],
 ['CA', 'male', '2010', 'Anthony', '2860'],
 ['CA', 'female', '2016', 'Mia', '2794'],
 ['CA', 'female', '2013', 'Isabella', '2793']]

In [4]:
# Print the fourth field (the name column) of every record.
# collect() already returns all records, so there is no need to run a separate
# count() job just to tell take() how many to fetch.
for row in rows.collect():
    print(row[3])

Name
Sophia
Sophia
Sophia
Isabella
Sophia
Jacob
Sophia
Jacob
Daniel
Isabella
Isabella
Jacob
Sophia
Daniel
Jacob
Jayden
Mia
Anthony
Mia
Isabella


# Flat Map

In [6]:
# flatMap flattens each returned list into the result, giving one flat list.
numbers = sc.parallelize([1, 2, 3])
numbers.flatMap(lambda item: [item, item, item]).collect()

[1, 1, 1, 2, 2, 2, 3, 3, 3]

In [7]:
# map, by contrast, keeps each returned list as a single element,
# so the result is a list of lists rather than one flat list.
numbers = sc.parallelize([1, 2, 3])
numbers.map(lambda item: [item, item, item]).collect()

[[1, 1, 1], [2, 2, 2], [3, 3, 3]]

# Filter

In [8]:
# Keep only records whose gender column (index 1) is exactly "male".
# Comparing that one field avoids matching rows where some other column,
# such as the name, happens to equal the same value.
rows.filter(lambda row: row[1] == "male").collect()

[['CA', 'male', '2010', 'Jacob', '3351'],
 ['CA', 'male', '2011', 'Jacob', '3171'],
 ['CA', 'male', '2010', 'Daniel', '3133'],
 ['CA', 'male', '2012', 'Jacob', '2978'],
 ['CA', 'male', '2011', 'Daniel', '2907'],
 ['CA', 'male', '2013', 'Jacob', '2887'],
 ['CA', 'male', '2012', 'Jayden', '2868'],
 ['CA', 'male', '2010', 'Anthony', '2860']]

# Union

In [10]:
# Three small RDDs of 3-field tuples used as inputs for the union example.
rdd1 = sc.parallelize([
    (1, 'jan', 2016),
    (3, 'nov', 2014),
    (16, 'feb', 2014),
])
rdd2 = sc.parallelize([
    (5, 'dec', 2014),
    (17, 'sep', 2015),
])
rdd3 = sc.parallelize([
    (6, 'dec', 2011),
    (16, 'may', 2015),
])

In [12]:
# Chain the three RDDs end to end: rdd1's elements, then rdd2's, then rdd3's.
firstTwo = rdd1.union(rdd2)
rddUnion = firstTwo.union(rdd3)

In [17]:
# Fetch the combined elements as a Python list.
rddUnion.collect()

[(1, 'jan', 2016),
 (3, 'nov', 2014),
 (16, 'feb', 2014),
 (5, 'dec', 2014),
 (17, 'sep', 2015),
 (6, 'dec', 2011),
 (16, 'may', 2015)]

# Wide Transformation

# Intersection

In [18]:
# rdd1 holds 1..9 and rdd2 holds 5..14 (range excludes its end), so they share 5..9.
# Note: this rebinds the rdd1 and rdd2 names used in the Union example.
rdd1 = sc.parallelize(range(1, 10))
rdd2 = sc.parallelize(range(5, 15))

In [19]:
# Elements present in both RDDs; the collected result is not in sorted order.
rdd1.intersection(rdd2).collect()

[6, 8, 5, 7, 9]

# Distinct

In [20]:
# uniques holds 1..8 and duplicates holds 5..14, so the values 5..8 occur in both.
uniques = sc.parallelize(range(1, 9))
duplicates = sc.parallelize(range(5, 15))

In [21]:
# Concatenate the two RDDs; union itself does not remove the overlapping values.
mixedSet = uniques.union(duplicates)

In [22]:
# Drop repeated values so each number appears once; the collected result is not in sorted order.
mixedSet.distinct().collect()

[2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13]

# groupByKey

In [23]:
# Pair each record's name (field 3) with its year (field 2), then group the years by name.
# `rows` still contains the header record, so it contributes a ('Name', 'Year') pair too.
nameYearPairs = rows.map(lambda row: (row[3], row[2]))
namesToYear = nameYearPairs.groupByKey()

In [25]:
# Turn each (name, grouped years) pair into a one-entry dict with the years as a plain list.
namesToYear.map(lambda pair: {pair[0]: list(pair[1])}).collect()

[{'Name': ['Year']},
 {'Sophia': ['2012', '2011', '2013', '2010', '2014', '2015']},
 {'Isabella': ['2010', '2012', '2011', '2013']},
 {'Jacob': ['2010', '2011', '2012', '2013']},
 {'Daniel': ['2010', '2011']},
 {'Jayden': ['2012']},
 {'Mia': ['2015', '2016']},
 {'Anthony': ['2010']}]

# reduceByKey

In [26]:
# Drop any line containing 'Frequency' (i.e. the header), then split the rest into fields.
data_lines = baby_names.filter(lambda text: 'Frequency' not in text)
filtered_rows = data_lines.map(lambda text: text.split(','))

In [27]:
# Sum the frequency column (field 4) per name (field 3).
name_counts = filtered_rows.map(lambda fields: (fields[3], int(fields[4])))
name_counts.reduceByKey(lambda left, right: left + right).collect()

[('Sophia', 20167),
 ('Isabella', 12328),
 ('Jacob', 12387),
 ('Daniel', 6040),
 ('Jayden', 2868),
 ('Mia', 5655),
 ('Anthony', 2860)]

# aggregateByKey

In [30]:
# Rebuild the header-free field lists: skip any line containing 'Frequency', then split on commas.
data_lines = baby_names.filter(lambda text: 'Frequency' not in text)
filtered_rows = data_lines.map(lambda text: text.split(','))

In [31]:
# Reduce each record to a (name, frequency) pair with the frequency as an int.
# Note that this rebinds filtered_rows from field lists to pairs.
filtered_rows = filtered_rows.map(lambda fields: (fields[3], int(fields[4])))

In [32]:
# Sum frequencies per name, starting each key's total from 0.
# First function: folds one value into the running total (accumulator first, value second).
# Second function: merges two running totals.
# The values were already converted to int when the pairs were built, so no
# further int() conversion is needed here.
filtered_rows.aggregateByKey(
    0,
    lambda total, value: total + value,
    lambda total_a, total_b: total_a + total_b,
).collect()

[('Sophia', 20167),
 ('Isabella', 12328),
 ('Jacob', 12387),
 ('Daniel', 6040),
 ('Jayden', 2868),
 ('Mia', 5655),
 ('Anthony', 2860)]

# sortByKey

In [33]:
# Drop any line containing 'Year' (i.e. the header), then split the rest into fields.
data_lines = baby_names.filter(lambda text: 'Year' not in text)
filtered_rows = data_lines.map(lambda text: text.split(','))

In [34]:
# Reduce each record to a (name, year) pair with the year as an int.
# Note that this rebinds filtered_rows from field lists to pairs.
filtered_rows = filtered_rows.map(lambda fields: (fields[3], int(fields[2])))

In [35]:
# Sort pairs by name in ascending order. Only the key is sorted: the years
# within one name keep whatever order they arrived in.
filtered_rows.sortByKey().collect()

[('Anthony', 2010),
 ('Daniel', 2010),
 ('Daniel', 2011),
 ('Isabella', 2010),
 ('Isabella', 2012),
 ('Isabella', 2011),
 ('Isabella', 2013),
 ('Jacob', 2010),
 ('Jacob', 2011),
 ('Jacob', 2012),
 ('Jacob', 2013),
 ('Jayden', 2012),
 ('Mia', 2015),
 ('Mia', 2016),
 ('Sophia', 2012),
 ('Sophia', 2011),
 ('Sophia', 2013),
 ('Sophia', 2010),
 ('Sophia', 2014),
 ('Sophia', 2015)]

In [36]:
# Same sort by name, but in descending order.
filtered_rows.sortByKey(ascending=False).collect()

[('Sophia', 2012),
 ('Sophia', 2011),
 ('Sophia', 2013),
 ('Sophia', 2010),
 ('Sophia', 2014),
 ('Sophia', 2015),
 ('Mia', 2015),
 ('Mia', 2016),
 ('Jayden', 2012),
 ('Jacob', 2010),
 ('Jacob', 2011),
 ('Jacob', 2012),
 ('Jacob', 2013),
 ('Isabella', 2010),
 ('Isabella', 2012),
 ('Isabella', 2011),
 ('Isabella', 2013),
 ('Daniel', 2010),
 ('Daniel', 2011),
 ('Anthony', 2010)]

# Join

In [37]:
 names1 = sc.parallelize(("abe", "abby", "apple")).map(lambda a: (a, 1))

In [38]:
# Right-hand join input: each name becomes a (name, 1) pair so the name acts as the key.
rightNames = sc.parallelize(("apple", "beatty", "beatrice"))
names2 = rightNames.map(lambda name: (name, 1))

In [39]:
# Inner join on the key: only 'apple' is a key in both RDDs, so it is the only result.
names1.join(names2).collect()

[('apple', (1, 1))]

In [40]:
# Keep every key from names1; keys with no match in names2 get None as the right-hand value.
names1.leftOuterJoin(names2).collect()

[('abby', (1, None)), ('abe', (1, None)), ('apple', (1, 1))]

In [41]:
# Keep every key from names2; keys with no match in names1 get None as the left-hand value.
names1.rightOuterJoin(names2).collect()

[('apple', (1, 1)), ('beatty', (None, 1)), ('beatrice', (None, 1))]