# Intro to Spark

In [0]:
# Distribute a small list, square every element, and bring the results
# back to the driver.
rdd = sc.parallelize([1, 2, 3, 4])
squares = rdd.map(lambda n: n * n)
squared_rdd = squares.collect()  # despite the name, collect() returns a plain Python list

squared_rdd

In [0]:
# map() is a transformation: it only describes the computation.
rdd1 = sc.parallelize([1, 2, 3, 4]).map(lambda n: n * n)      # squares
rdd2 = sc.parallelize([1, 2, 3, 4]).map(lambda n: n * n * n)  # cubes

# collect() is an action, so only rdd2 is processed here;
# rdd1 has no action attached and is never processed.
rdd2.collect()

In [0]:
# Two actions on an RDD that is not persisted.
rdd = sc.parallelize([1, 2, 3, 4]).map(lambda n: n * n)

total = rdd.count()      # first action
print('Count: ', total)
rdd.collect()            # second action: rdd is processed a second time

In [0]:
# Same two actions as before, but the mapped RDD is marked with persist()
# so the work done for the first action can be reused.
rdd = sc.parallelize([1, 2, 3, 4]).map(lambda n: n * n).persist()

total = rdd.count()      # first action: rdd is processed here
print('Count: ', total)
rdd.collect()            # second action: rdd is not processed again

In [0]:
from operator import add

rdd = sc.parallelize([
    "Hello world this is Doron",
    "I'm doing a test on this platform",
    "To count Hello world words",
])

# Word count: split each line on single spaces, pair every word with 1,
# then sum the 1s per word.
# reduceByKey(add) is similar to reduceByKey(lambda a, b: a + b).
word_count = (
    rdd
    .flatMap(lambda sentence: sentence.split(" "))
    .map(lambda token: (token, 1))
    .reduceByKey(add)
    .collect()
)
word_count

In [0]:
# Rank the (word, count) pairs by count, highest first, then keep the first five.
sorted_words = list(word_count)
sorted_words.sort(key=lambda pair: pair[1], reverse=True)
top_5_words = sorted_words[:5]
top_5_words

## Spark SQL

In [0]:
# Load the diamonds sample dataset from the /databricks-datasets mount.
# header=True      -> take column names from the first row of the file.
# inferSchema=True -> let Spark detect column types. Without it every column
#                     (including price) is read as a string, and the numeric
#                     expressions on price would rely on implicit casts.
#                     Inference costs one extra pass over the file.
df = spark.read.csv(
    "/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv",
    header=True,
    inferSchema=True,
)
df.show()  # show() prints the first 20 rows by default

In [0]:
# Project the cut, the price, and a derived "price + 10" column; print five rows.
price_plus_10 = df['price'] + 10
df.select('cut', 'price', price_plus_10).show(5)

In [0]:
# Keep only rows whose price is above 300 and print the first five.
price_above_300 = df['price'] > 300
df.filter(price_above_300).show(5)

In [0]:
# Number of rows per value of the "color" column.
color_counts = df.groupBy("color").count()
color_counts.show()

In [0]:
# Register the DataFrame as a temporary view named "diamonds" so it can be
# queried with spark.sql(); an existing view with that name is replaced.
df.createOrReplaceTempView("diamonds")

In [0]:
# Query the "diamonds" temp view with plain SQL; the result is a DataFrame.
query = """
SELECT * FROM diamonds WHERE price > 330
"""
sql_df = spark.sql(query)
sql_df.show(5)

In [0]:
# Persist the query result as JSON. Spark creates a *directory* of part files
# at this path rather than a single .json file.
# mode("overwrite") keeps the cell re-runnable: the default save mode
# ("errorifexists") raises once the output path exists from a previous run.
sql_df.write.format("json").mode("overwrite").save("diamonds_output.json")