In [None]:
# map vs. flatMap

# The map transformation applies a function to each row in a DataFrame/Dataset and returns a new transformed Dataset with exactly one output element per input element.
# 1 => 1
# The flatMap transformation applies a function to every element, flattens the collections it returns, and yields a new transformed Dataset.
# Each input element can produce zero or more output elements, so the result usually (but not necessarily) has more rows than the input. It is also referred to as a one-to-many transformation function.
# 1 => Many
# One of the use cases of flatMap() is to flatten a column that contains arrays, lists, or any other nested collection.

import pyspark

# Obtain a SparkContext via getOrCreate() (see PySpark docs: reuses a running
# context when there is one instead of always constructing a new one).
sc = pyspark.SparkContext.getOrCreate()

# map: every (label, csv_string) record yields exactly one output element,
# here the list produced by splitting the CSV string.
rdd = sc.parallelize([("name", "joe,sarah,tom"), ("car", "hyundai")])
result = rdd.map(lambda record: record[1].split(","))
# Expected result.collect():
# [['joe', 'sarah', 'tom'], ['hyundai']]

# flatMap: same split function, but the per-record lists are flattened so the
# output is a single sequence of individual strings.
rdd = sc.parallelize([("name", "joe,sarah,tom"), ("car", "hyundai")])
result = rdd.flatMap(lambda record: record[1].split(","))
# Expected result.collect():
# ['joe', 'sarah', 'tom', 'hyundai']


test_file = "file:///home/jovyan/work/sample/lorem_ipsum.txt"
lines = sc.textFile(test_file)
# Split each line on whitespace and flatten the pieces into one RDD of words.
words = lines.flatMap(lambda line: line.split())

# Alternative kept for reference: count on the driver with countByValue().
# word_count = words.countByValue()
# for word, count in word_count.items():
#     print(f"{word}: {count}")

# Sorted variant: build (word, 1) pairs and sum them per word, then swap each
# pair to (count, word) so that sortByKey() orders the entries by count.
word_count = (
    words
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
)
sorted_word_count = (
    word_count
    .map(lambda pair: (pair[1], pair[0]))
    .sortByKey()
)
for count, word in sorted_word_count.collect():
    print(f"{count}: {word}")