In [1]:
from pyspark import SparkContext, SparkConf

In [2]:
# Configure a local Spark application that uses every available core and
# only logs messages at WARN level or above.
conf = (
    SparkConf()
    .setAppName("HelloRDD")
    .setMaster("local[*]")
    .set("spark.log.level", "WARN")
)
# getOrCreate() returns the already-running context when there is one, so this
# cell can be re-executed in a live kernel without raising
# "Cannot run multiple SparkContexts at once". Note that `conf` is only applied
# when a new context actually has to be created.
sc = SparkContext.getOrCreate(conf=conf)

25/03/13 16:17:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Report the Spark release the context is running on.
print(
    "spark.version ==",
    sc.version,
)

spark.version == 3.5.3


# Transformations

## Map

In [4]:
# Spread nine letters over three partitions, then upper-case every element.
rdd = sc.parallelize(
    ["a", "b", "c", "d", "e", "f", "x", "y", "z"],
    numSlices=3,
)
mappedRDD = rdd.map(lambda letter: letter.upper())

In [5]:
# Display the mapped elements: each input letter in upper case.
mappedRDD.collect()

                                                                                

['A', 'B', 'C', 'D', 'E', 'F', 'X', 'Y', 'Z']

## Filter

In [6]:
# Keep only the letters that compare lower than "f" (string comparison).
rdd = sc.parallelize(
    ["a", "b", "c", "d", "e", "f", "x", "y", "z"],
    numSlices=3,
)
filteredRDD = rdd.filter(lambda letter: letter < "f")

In [7]:
# Display the elements that passed the filter.
filteredRDD.collect()

['a', 'b', 'c', 'd', 'e']

## flatMap

In [8]:
# Each sentence is split on whitespace; flatMap flattens the per-sentence word
# lists into a single RDD of words.
rdd = sc.parallelize(
    [
        "some text here",
        "other text over there",
        "and text",
        "short",
        "and a longer one",
    ],
    numSlices=3,
)
flatmappedRDD = rdd.flatMap(lambda sentence: sentence.split())

In [9]:
# Display the flattened result: one element per whitespace-separated word.
flatmappedRDD.collect()

['some',
 'text',
 'here',
 'other',
 'text',
 'over',
 'there',
 'and',
 'text',
 'short',
 'and',
 'a',
 'longer',
 'one']

## mapPartitions

In [10]:
# The function is invoked once per partition and ignores the partition's
# contents, emitting a single "Hello" for each of the three partitions.
rdd = sc.parallelize(
    ["a", "b", "c", "d", "e", "f", "x", "y", "z"],
    numSlices=3,
)
mapPartitionsRDD = rdd.mapPartitions(lambda _partition: ["Hello"])

In [11]:
# Display the result: one "Hello" for each of the three partitions.
mapPartitionsRDD.collect()

['Hello', 'Hello', 'Hello']

## mapPartitionsWithIndex

In [12]:
# Same idea as mapPartitions, but the function also receives the partition
# index; here only that index is emitted and the data itself is ignored.
rdd = sc.parallelize(
    ["a", "b", "c", "d", "e", "f", "x", "y", "z"],
    numSlices=3,
)
mapPartitionsRDD = rdd.mapPartitionsWithIndex(
    lambda index, _partition: [index]
)

In [13]:
# Display the result: the index of each of the three partitions.
mapPartitionsRDD.collect()

[0, 1, 2]

## Sample

In [14]:
# Sample without replacement; every element is kept with probability ~0.8.
# No seed is supplied, so the sampled subset can change from run to run.
rdd = sc.parallelize(
    ["a", "b", "c", "d", "e", "f", "x", "y", "z"],
    numSlices=3,
)
sampleRDD = rdd.sample(withReplacement=False, fraction=0.8)

In [15]:
# Display the sampled subset. sample() was called without a seed, so the
# elements shown can differ between runs.
sampleRDD.collect()

['a', 'd', 'e', 'f', 'y', 'z']

## Union

In [16]:
# Two RDDs (lower-case and upper-case letters) combined into one.
rdd = sc.parallelize(
    ["a", "b", "c", "d", "e", "f", "x", "y", "z"],
    numSlices=3,
)
rdd2 = sc.parallelize(
    ["A", "B", "C", "D", "E", "F", "X", "Y", "Z"],
    numSlices=3,
)
unionRDD = rdd.union(other=rdd2)

In [17]:
# Display the combined elements of both RDDs.
unionRDD.collect()

['a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'x',
 'y',
 'z',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'X',
 'Y',
 'Z']

## Intersection

In [18]:
# The two inputs overlap on "d", "e" and "f"; intersection keeps only those.
rdd = sc.parallelize(
    ["a", "b", "c", "d", "e", "f"],
    numSlices=3,
)
rdd2 = sc.parallelize(
    ["d", "e", "f", "x", "y", "z"],
    numSlices=3,
)
intersectionRDD = rdd.intersection(other=rdd2)

In [19]:
# Display the elements that are present in both RDDs.
intersectionRDD.collect()

['d', 'f', 'e']

## Distinct

In [20]:
# Input contains repeated letters; distinct() removes the duplicates.
rdd = sc.parallelize(
    ["a", "b", "c", "c", "c", "b", "b", "a"],
    numSlices=3,
)
distinctRDD = rdd.distinct()

In [21]:
# Display the de-duplicated elements.
distinctRDD.collect()

['b', 'a', 'c']

## GroupByKey

In [22]:
# (key, value) pairs; groupByKey gathers all values that share a key.
rdd = sc.parallelize(
    [
        (1, "a"),
        (2, "b"),
        (3, "c"),
        (1, "d"),
        (2, "e"),
    ],
    numSlices=3,
)
groupbyRDD = rdd.groupByKey()

In [23]:
# Display the grouped pairs. Each value is a ResultIterable, so the raw repr
# does not show the grouped items themselves.
groupbyRDD.collect()

[(3, <pyspark.resultiterable.ResultIterable at 0x10555a900>),
 (1, <pyspark.resultiterable.ResultIterable at 0x10526f110>),
 (2, <pyspark.resultiterable.ResultIterable at 0x10526f250>)]

In [24]:
# Expand each group's iterable into a sorted list so the grouped values are
# readable when printed.
for key, values in groupbyRDD.collect():
    print(key, sorted(values))

3 ['c']
1 ['a', 'd']
2 ['b', 'e']


## ReduceByKey

In [25]:
# Values sharing a key are merged pairwise with `+`, which concatenates
# the strings here.
rdd = sc.parallelize(
    [
        (1, "a"),
        (2, "b"),
        (3, "c"),
        (1, "d"),
        (2, "e"),
    ],
    numSlices=3,
)
reducebyRDD = rdd.reduceByKey(lambda left, right: left + right)

In [26]:
# Display one (key, concatenated values) pair per key.
reducebyRDD.collect()

[(3, 'c'), (1, 'ad'), (2, 'be')]

## AggregateByKey

In [27]:
# Sum the integer values per key: start each accumulator at 0, add values
# into it, and add accumulators together when they are merged.
rdd = sc.parallelize(
    [
        ("a", 1),
        ("b", 2),
        ("c", 3),
        ("a", 4),
        ("b", 5),
    ],
    numSlices=3,
)
aggregateByRDD = rdd.aggregateByKey(
    zeroValue=0,
    seqFunc=lambda running, value: running + value,
    combFunc=lambda left, right: left + right,
)

In [28]:
# Display one (key, summed value) pair per key.
aggregateByRDD.collect()

[('b', 7), ('a', 5), ('c', 3)]

## SortByKey

In [29]:
# Order the pairs by key, ascending (the default, spelled out here).
rdd = sc.parallelize(
    [
        ("a", 1),
        ("b", 2),
        ("c", 3),
        ("a", 4),
        ("b", 5),
    ],
    numSlices=3,
)
sortedRDD = rdd.sortByKey(ascending=True)

In [30]:
# Display the pairs ordered by key.
sortedRDD.collect()

[('a', 1), ('a', 4), ('b', 2), ('b', 5), ('c', 3)]

## Join

In [31]:
# Join two pair RDDs on their keys; each result is (key, (left, right)).
rdd = sc.parallelize(
    [
        (1, "a"),
        (2, "b"),
        (3, "c"),
    ]
)
rdd2 = sc.parallelize(
    [
        (2, "B"),
        (1, "A"),
        (3, "C"),
    ]
)
joinedRDD = rdd.join(other=rdd2)

In [32]:
# Display the joined pairs: (key, (value from rdd, value from rdd2)).
joinedRDD.collect()

[(1, ('a', 'A')), (2, ('b', 'B')), (3, ('c', 'C'))]

## Coalesce

In [33]:
# Start with four partitions and reduce them to two.
rdd = sc.parallelize(
    ["a", "b", "c", "d", "e", "f", "g", "h", "i"],
    numSlices=4,
)
coalesceRDD = rdd.coalesce(numPartitions=2)

In [34]:
# Partition counts before and after coalesce, one per line.
print(
    rdd.getNumPartitions(),
    coalesceRDD.getNumPartitions(),
    sep="\n",
)

4
2


## Repartition

In [35]:
# Start with two partitions and redistribute the data over four.
rdd = sc.parallelize(
    ["a", "b", "c", "d", "e", "f", "g", "h", "i"],
    numSlices=2,
)
repartitionRDD = rdd.repartition(numPartitions=4)

In [36]:
# Partition counts before and after repartition, one per line.
print(
    rdd.getNumPartitions(),
    repartitionRDD.getNumPartitions(),
    sep="\n",
)

2
4


In [37]:
# Repartition the two-partition source RDD to the same count it already has.
repartitionRDD2 = rdd.repartition(numPartitions=2)

In [38]:
# Partition counts of the source RDD and both repartitioned RDDs, one per line.
print(
    rdd.getNumPartitions(),
    repartitionRDD.getNumPartitions(),
    repartitionRDD2.getNumPartitions(),
    sep="\n",
)

2
4
2


# Actions

## Reduce

In [39]:
# Fold all elements into one value by repeatedly adding pairs together.
rdd = sc.parallelize(
    [1, 2, 3, 4, 5, 6, 7, 8, 9]
)
sumOfItems = rdd.reduce(lambda total, item: total + item)

In [40]:
# Display the reduced value: the sum of the integers 1 through 9.
sumOfItems

45

## Collect

In [41]:
# Bring every element of the RDD back to the driver program.
rdd = sc.parallelize(
    ["a", "b", "c", "d", "e", "f", "g", "h", "i"],
    numSlices=3,
)
collectResult = rdd.collect()

In [42]:
# Display the collected elements.
collectResult

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']

In [43]:
# Show the Python type of the value collect() returned.
type(collectResult)

list

## Count

In [44]:
# Count how many elements the RDD holds.
rdd = sc.parallelize(
    ["a", "b", "c", "d", "e", "f", "g", "h", "i"],
    numSlices=3,
)
countResult = rdd.count()

In [45]:
# Display the number of elements counted.
countResult

9

## CountByKey

In [46]:
# Count how many pairs exist for each key (the values are not used).
rdd = sc.parallelize(
    [
        ("a", 1),
        ("b", 2),
        ("c", 3),
        ("a", 4),
        ("b", 5),
    ],
    numSlices=3,
)
countResult = rdd.countByKey()

In [47]:
# Display the per-key counts (a mapping from key to number of occurrences).
countResult

defaultdict(int, {'a': 2, 'b': 2, 'c': 1})

## First

In [48]:
# Fetch only the first element of the RDD.
rdd = sc.parallelize(
    ["a", "b", "c", "d", "e", "f", "g", "h", "i"],
    numSlices=3,
)
firstResult = rdd.first()

In [49]:
# Display the first element.
firstResult

'a'

## Take

In [50]:
# Fetch the first three elements of the RDD.
rdd = sc.parallelize(
    [1, 2, 3, 4, 5, 6, 7, 8, 9],
    numSlices=3,
)
takeResult = rdd.take(num=3)

In [51]:
# Display the three elements that were taken.
takeResult

[1, 2, 3]

## TakeSample

In [52]:
# Draw three random elements without replacement. No seed is supplied, so
# the elements drawn can change from run to run.
rdd = sc.parallelize(
    [1, 2, 3, 4, 5, 6, 7, 8, 9],
    numSlices=3,
)
takeResult = rdd.takeSample(withReplacement=False, num=3)

In [53]:
# Display the random sample. takeSample() was called without a seed, so the
# values shown can differ between runs.
takeResult

[1, 7, 9]

## TakeOrdered

In [54]:
# From an unsorted input, fetch the three smallest elements in ascending order.
rdd = sc.parallelize(
    [1, 9, 2, 8, 3, 7, 4, 6, 5],
    numSlices=3,
)
takeResult = rdd.takeOrdered(num=3)

In [55]:
# Show the Python type of the source RDD object.
# NOTE(review): this inspects `rdd`, not `takeResult`; the Collect section
# inspects the action's result instead -- confirm which one was intended here.
type(rdd)

pyspark.rdd.RDD

In [56]:
# Display the three smallest elements returned by takeOrdered().
takeResult

[1, 2, 3]

## Foreach

In [57]:
# Run a side-effecting function on every element. The printing happens
# partition by partition, so lines need not appear in input order.
rdd = sc.parallelize(
    [1, 2, 3, 4, 5, 6, 7, 8, 9],
    numSlices=3,
)
rdd.foreach(lambda item: print("Next item is ", item))

Next item is  4
Next item is  5
Next item is  6
Next item is  7
Next item is  8
Next item is  9
Next item is  1
Next item is  2
Next item is  3


## SaveAs

In [58]:
import os
import shutil

In [59]:
# Output directory (relative to the working directory) for the text-file demo.
textFile = "result/txt/textFile"

# Remove any output left over from a previous run so the save step starts from
# a clean location and the notebook can be re-run.
if os.path.exists(textFile):
    shutil.rmtree(textFile)

In [60]:
# Write the RDD's elements out as text under the output directory.
rdd = sc.parallelize(
    [1, 2, 3, 4, 5, 6, 7, 8, 9],
    numSlices=3,
)
# Reuse `textFile` (defined in the cleanup cell above) instead of repeating the
# literal, so the directory that is cleaned and the directory that is written
# can never drift apart.
rdd.saveAsTextFile(textFile)