# Spark RDD Operations

### ✅ Step 1: Creating the SparkSession and SparkContext

In [5]:
from pyspark.sql import SparkSession

# Build the session for this notebook (getOrCreate reuses one if it already
# exists); jobs are submitted to YARN.
spark = (
    SparkSession.builder
    .appName("RDD_Examples")
    .master("yarn")
    .getOrCreate()
)

# The RDD examples below go through the SparkContext owned by the session.
sc = spark.sparkContext


25/06/24 05:35:27 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


### ✅ Step 2: Dummy Datasets

In [6]:
# Dataset 1: six first names (strings)
rdd1 = sc.parallelize(
    ["Alice", "Bob", "Charlie", "David", "Eve", "Frank"]
)

# Dataset 2: small integers; 2 and 3 each appear twice, which gives
# distinct() and countByValue() some duplicates to work on
rdd2 = sc.parallelize(
    [1, 2, 3, 4, 5, 6, 3, 2]
)

# Dataset 3: (name, score) pairs; "Alice" and "Bob" have two scores each,
# so the by-key operations have something to combine
rdd_kv = sc.parallelize(
    [("Alice", 85), ("Bob", 90), ("Charlie", 70), ("Alice", 95), ("Bob", 88)]
)


### 🔄 Transformations

##### 1. Map

In [7]:
# One output element per input element: square every number.
rdd2.map(lambda n: n * n).collect()

                                                                                

[1, 4, 9, 16, 25, 36, 9, 4]

##### 2. FlatMap

In [8]:
# Each name is expanded into its characters and the per-name lists are
# flattened into one result. `list` is passed directly; no lambda wrapper needed.
rdd1.flatMap(list).collect()

                                                                                

['A',
 'l',
 'i',
 'c',
 'e',
 'B',
 'o',
 'b',
 'C',
 'h',
 'a',
 'r',
 'l',
 'i',
 'e',
 'D',
 'a',
 'v',
 'i',
 'd',
 'E',
 'v',
 'e',
 'F',
 'r',
 'a',
 'n',
 'k']

##### 3. MapPartitions

In [12]:
def process_partition(iterator):
    """Yield a single value: the sum of every element produced by ``iterator``.

    An iterator with no elements yields 0.
    """
    partition_total = sum(iterator)
    yield partition_total
# process_partition yields once per call, so the result holds one sum per
# partition of rdd2 rather than one value per element.
rdd2.mapPartitions(process_partition).collect()

[10, 16]

##### 4. Filter

In [13]:
# Keep only the elements strictly greater than 3.
rdd2.filter(lambda n: n > 3).collect()

[4, 5, 6]

##### 5. Sample

In [14]:
# fraction is the expected share of elements kept, not an exact count, so the
# result size can vary. A fixed seed makes the draw repeatable across re-runs
# (for a given partitioning); without it every run returns a different sample.
# NOTE: the recorded output below predates the seed - re-run to refresh it.
rdd1.sample(withReplacement=False, fraction=0.5, seed=42).collect()

['Alice', 'Charlie', 'Frank']

##### 6. Union

In [15]:
# union concatenates the two RDDs and does not de-duplicate: the repeated
# 3 and 2 from rdd2 are still present in the result.
rdd2.union(sc.parallelize([10,20])).collect()

[1, 2, 3, 4, 5, 6, 3, 2, 10, 20]

##### 7. Intersection

In [17]:
# Only values present in both RDDs survive, each reported once even though
# 2 and 3 occur twice in rdd2; 10 is dropped because rdd2 does not contain it.
rdd2.intersection(sc.parallelize([2, 3, 10])).collect()

                                                                                

[2, 3]

##### 8. Distinct

In [18]:
# Removes duplicate values. The result order is not the input order
# (see the output), so do not rely on it.
rdd2.distinct().collect()

[2, 4, 6, 1, 3, 5]

##### 9. GroupByKey

In [19]:
# Collect all scores that share a name; mapValues(list) converts each key's
# grouped values to a plain list so they display readably.
rdd_kv.groupByKey().mapValues(list).collect()

[('Alice', [85, 95]), ('Bob', [90, 88]), ('Charlie', [70])]

##### 10. ReduceByKey

In [20]:
# Add up the scores for each name.
rdd_kv.reduceByKey(lambda total, score: total + score).collect()

[('Alice', 180), ('Bob', 178), ('Charlie', 70)]

##### 11. Join

In [21]:
# Two pair RDDs keyed by name: one carries a numeric score, the other a
# one-letter string.
rdd_a = sc.parallelize([("Alice", 85), ("Bob", 90)])
rdd_b = sc.parallelize([("Alice", "F"), ("Bob", "M")])

# join matches the pairs on their key and emits
# (key, (value_from_rdd_a, value_from_rdd_b)).
rdd_a.join(rdd_b).collect()

                                                                                

[('Alice', (85, 'F')), ('Bob', (90, 'M'))]

##### 12. Coalesce (reduce # of partitions)

In [22]:
# coalesce(2) asks for the data to be combined into 2 partitions (a
# performance-tuning knob). glom() turns each partition into a list, so the
# result shows one inner list per partition.
# NOTE(review): the MapPartitions cell returned two sums ([10, 16]), which
# suggests rdd2 already had 2 partitions in this run, so coalesce(2) does not
# visibly reduce anything here - consider starting from more partitions.
rdd2.coalesce(2).glom().collect()

[[1, 2, 3, 4], [5, 6, 3, 2]]

### ⚡ Actions

##### 1. Count

In [24]:
# Total number of elements in rdd2; repeated values are each counted.
rdd2.count()

8

##### 2. Collect

In [25]:
# Returns the whole dataset as a Python list. Fine for this six-element RDD;
# on a large dataset prefer take(n) so the full result is not pulled back.
rdd1.collect()

['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank']

##### 3. Take

In [26]:
# First 3 elements, returned as a list.
rdd2.take(3)

[1, 2, 3]

##### 4. Top

In [27]:
# The 3 largest values, returned in descending order.
rdd2.top(3)

[6, 5, 4]

##### 5. CountByValue

In [28]:
# Returns a dict-like mapping of value -> number of occurrences.
rdd2.countByValue()

defaultdict(int, {1: 1, 2: 2, 3: 2, 4: 1, 5: 1, 6: 1})

##### 6. Reduce

In [29]:
# Sum of all elements, built by repeatedly adding two values together.
rdd2.reduce(lambda acc, value: acc + value)

26

##### 7. Fold

In [30]:
# Like reduce, but seeded with an explicit zero value (0 is the identity
# for addition, so the result is the same sum).
rdd2.fold(0, lambda acc, value: acc + value)

26

##### 8. Aggregate

In [31]:
# Compute (sum, count) in one pass. The accumulator is a 2-tuple, so its type
# differs from the element type.
rdd2.aggregate(
    (0, 0),  # zero value: (sum, count)
    # fold one element into a partial (sum, count)
    lambda partial, value: (partial[0] + value, partial[1] + 1),
    # merge two partial (sum, count) results
    lambda left, right: (left[0] + right[0], left[1] + right[1]),
)

(26, 8)

##### 9. Foreach

In [33]:
def print_elem(x):
    """Print ``x`` on its own line, prefixed with ``Element: ``."""
    message = f"Element: {x}"
    print(message)

# foreach is run for its side effect and returns nothing. print_elem executes
# on the executors, so on a cluster (master is "yarn" here) the printed lines
# land in the executor stdout logs, not in this notebook.
# foreachPartition runs on the executors as well; to see the values in the
# driver, bring them back first, e.g. `for x in rdd1.collect(): print(x)`.
rdd1.foreach(print_elem)

In [None]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()