In [6]:
from pyspark.sql import SparkSession

# Build (or reuse) a SparkSession running in local mode, then take its
# SparkContext, which is the entry point for the RDD API used below.
spark = (
    SparkSession.builder
    .appName("RDDExamples")
    .master("local[*]")
    .getOrCreate()
)

sc = spark.sparkContext

In [7]:
# Distribute a small in-memory Python list as an RDD.
nums = sc.parallelize([1, 2, 3, 4, 5])

# Alternative source: a text file, where each line becomes one record.
# text_rdd = sc.textFile("/path/to/file.txt")

## ACTION METHODS

Transformations on an RDD are lazy: nothing is actually computed until an action method (such as `collect`, `count`, or `take`) is called.

In [9]:
# Action: runs the computation and returns all elements of the RDD as a Python list.
nums.collect()

[1, 2, 3, 4, 5]

In [10]:
# Action: returns the number of elements in the RDD.
nums.count()

5

In [12]:
# Action: returns the first three elements of the RDD
# (not the three largest; that would be nums.top(3)).
nums.take(3)

[1, 2, 3]

In [13]:
# map and filter are transformations: they only describe the computation
# and return a new RDD without running anything yet.
squares = nums.map(lambda n: n ** 2)
even_squares = squares.filter(lambda sq: sq % 2 == 0)

# collect is an action: it triggers the job and returns the result.
even_squares.collect()

[4, 16]

In [15]:
# Create an RDD from an in-memory list of sentences (parallelization).
lines = sc.parallelize([
    "spark makes big data simple",
    "rdds are resilient distributed datasets",
    "spark runs fast",
])

# The transformations below are lazy: they only build a DAG (Directed Acyclic
# Graph), i.e. a logical execution plan. Nothing runs until an action is called.
word_counts = (
    lines
    # Split every line on whitespace; flatMap flattens the per-line word lists
    # into a single RDD of words:
    #   ["spark", "makes", "big", "data", "simple", "rdds", "are", ...]
    .flatMap(lambda sentence: sentence.split())
    # Lowercase each word and pair it with an initial count of 1:
    #   [("spark", 1), ("makes", 1), ("big", 1), ...]
    .map(lambda word: (word.lower(), 1))
    # Sum the counts of identical keys (words):
    #   [("spark", 2), ("makes", 1), ("big", 1), ...]
    .reduceByKey(lambda total, count: total + count)
)

# Action: triggers execution of the plan and returns up to 10 (word, count) pairs.
# The order of the pairs may vary, because RDDs are not ordered.
word_counts.take(10)

[('big', 1),
 ('are', 1),
 ('resilient', 1),
 ('distributed', 1),
 ('datasets', 1),
 ('runs', 1),
 ('fast', 1),
 ('spark', 2),
 ('makes', 1),
 ('data', 1)]