# RDD

* Immutable: operations on it generate another RDD
* Transformation vs. Action: transformations are lazy (they only describe the computation); actions trigger the computation immediately
* RDD = Low Level. Dataset and DataFrame: High Level
* Dataset: Java/Scala only. DataFrame: also available from Python (PySpark)

In [5]:
from pyspark.sql import SparkSession

# Create a SparkSession, or reuse the one that already exists.
spark = (
    SparkSession.builder
    .appName("ExemploRDD")
    .getOrCreate()
)


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/09/08 08:13:48 WARN Utils: Your hostname, WIN-NJTBBD8GS0T, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/09/08 08:13:48 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/08 08:13:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/09/08 08:13:53 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [None]:
# Use the SparkContext to parallelize a local list.
#
# parallelize() is one way to build an RDD from a local collection
# (for example, a Python list). It does two main things:
#   1. Splits (parallelizes) the local data into partitions.
#   2. Distributes those partitions across the nodes of the Spark cluster.
rdd = spark.sparkContext.parallelize(list(range(11)))

print(rdd.take(3))     # first three elements
print(rdd.top(3))      # three largest elements, in descending order
print(rdd.collect())   # every element of the RDD

# max, min, count, mean, stdev, ... are called the same way.

[0, 1, 2]


                                                                                

[10, 9, 8]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [None]:
# Filter: keep only the elements for which the predicate is true (here, even numbers).
(
    rdd
    .filter(lambda n: n % 2 == 0)
    .collect()
)

                                                                                

[0, 2, 4, 6, 8, 10]

In [14]:
# Sample: draw a random subset of the RDD without replacement.
# `fraction` is the expected share of elements kept, not an exact count,
# so the result size can vary around 50%.
# A fixed seed makes the cell reproducible across Restart & Run All;
# without it every run returns a different subset.
rdd.sample(withReplacement=False, fraction=0.5, seed=42).collect()

[1, 2, 3, 5, 6, 7, 10]

In [15]:
# Map: apply a function to every element (here, square each number).
(
    rdd
    .map(lambda n: n * n)
    .collect()
)

                                                                                

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100]

In [21]:
# Two overlapping integer RDDs: 0..10 and 5..14.
nums1 = spark.sparkContext.parallelize(list(range(11)))
nums2 = spark.sparkContext.parallelize(list(range(5, 15)))

# Union: duplicates are NOT removed, so shared values show up twice.
print(nums1.union(nums2).collect())

# Intersection: slow, because it has to compare data held in different slices.
print(nums1.intersection(nums2).collect())

# Subtraction: elements of nums1 that are not in nums2.
print(nums1.subtract(nums2).collect())

# Cartesian product: every (nums1 element, nums2 element) pair.
cartesian = nums1.cartesian(nums2)
print(cartesian.collect())
# countByValue() reports how many times each pair occurs.
print(cartesian.countByValue())

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]


                                                                                

[5, 6, 7, 8, 9, 10]


                                                                                

[0, 1, 2, 3, 4]


                                                                                

[(0, 5), (0, 6), (0, 7), (0, 8), (0, 9), (0, 10), (0, 11), (0, 12), (0, 13), (0, 14), (1, 5), (1, 6), (1, 7), (1, 8), (1, 9), (1, 10), (1, 11), (1, 12), (1, 13), (1, 14), (2, 5), (2, 6), (2, 7), (2, 8), (2, 9), (2, 10), (2, 11), (2, 12), (2, 13), (2, 14), (3, 5), (3, 6), (3, 7), (3, 8), (3, 9), (3, 10), (3, 11), (3, 12), (3, 13), (3, 14), (4, 5), (4, 6), (4, 7), (4, 8), (4, 9), (4, 10), (4, 11), (4, 12), (4, 13), (4, 14), (5, 5), (5, 6), (5, 7), (5, 8), (5, 9), (5, 10), (5, 11), (5, 12), (5, 13), (5, 14), (6, 5), (6, 6), (6, 7), (6, 8), (6, 9), (6, 10), (6, 11), (6, 12), (6, 13), (6, 14), (7, 5), (7, 6), (7, 7), (7, 8), (7, 9), (7, 10), (7, 11), (7, 12), (7, 13), (7, 14), (8, 5), (8, 6), (8, 7), (8, 8), (8, 9), (8, 10), (8, 11), (8, 12), (8, 13), (8, 14), (9, 5), (9, 6), (9, 7), (9, 8), (9, 9), (9, 10), (9, 11), (9, 12), (9, 13), (9, 14), (10, 5), (10, 6), (10, 7), (10, 8), (10, 9), (10, 10), (10, 11), (10, 12), (10, 13), (10, 14)]




defaultdict(<class 'int'>, {(0, 5): 1, (0, 6): 1, (0, 7): 1, (0, 8): 1, (0, 9): 1, (0, 10): 1, (0, 11): 1, (0, 12): 1, (0, 13): 1, (0, 14): 1, (1, 5): 1, (1, 6): 1, (1, 7): 1, (1, 8): 1, (1, 9): 1, (1, 10): 1, (1, 11): 1, (1, 12): 1, (1, 13): 1, (1, 14): 1, (2, 5): 1, (2, 6): 1, (2, 7): 1, (2, 8): 1, (2, 9): 1, (2, 10): 1, (2, 11): 1, (2, 12): 1, (2, 13): 1, (2, 14): 1, (3, 5): 1, (3, 6): 1, (3, 7): 1, (3, 8): 1, (3, 9): 1, (3, 10): 1, (3, 11): 1, (3, 12): 1, (3, 13): 1, (3, 14): 1, (4, 5): 1, (4, 6): 1, (4, 7): 1, (4, 8): 1, (4, 9): 1, (4, 10): 1, (4, 11): 1, (4, 12): 1, (4, 13): 1, (4, 14): 1, (5, 5): 1, (5, 6): 1, (5, 7): 1, (5, 8): 1, (5, 9): 1, (5, 10): 1, (5, 11): 1, (5, 12): 1, (5, 13): 1, (5, 14): 1, (6, 5): 1, (6, 6): 1, (6, 7): 1, (6, 8): 1, (6, 9): 1, (6, 10): 1, (6, 11): 1, (6, 12): 1, (6, 13): 1, (6, 14): 1, (7, 5): 1, (7, 6): 1, (7, 7): 1, (7, 8): 1, (7, 9): 1, (7, 10): 1, (7, 11): 1, (7, 12): 1, (7, 13): 1, (7, 14): 1, (8, 5): 1, (8, 6): 1, (8, 7): 1, (8, 8): 1, (8, 9): 

                                                                                

In [25]:
# Keys and values: pair RDDs whose elements are (key, value) tuples.
purchases = spark.sparkContext.parallelize([(1, 100), (2, 300), (3, 120), (4, 250), (5, 78)])
debts = spark.sparkContext.parallelize([(1, 20), (2, 300)])

# Other available commands to try:
#   purchases.keys(), purchases.values()
#   purchases.countByKey(), purchases.countByValue()
#   sum_1_purchases = purchases.mapValues(lambda s: s + 1)
#   purchases.join(debts).collect()

# Purchases whose key has no matching key in debts.
# This call is what produces the cell's recorded output, so it must be live code.
purchases.subtractByKey(debts).collect()

                                                                                

[(3, 120), (4, 250), (5, 78)]