## Spark context

In [1]:
# Display the SparkContext used by every cell below.
# NOTE(review): `sc` is never created in this notebook — presumably it is
# injected by the PySpark kernel/shell; confirm before running elsewhere.
sc

## Init

In [2]:
# Build a small local dataset (the integers 0..19) and distribute it as an RDD.
data = list(range(20))
myrdd = sc.parallelize(data)
# Evaluating the RDD itself only shows its handle, not its contents.
myrdd

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:287

## Transformations

In [3]:
def sq(x):
    """Return ``x`` raised to the power of two.

    Defined with ``def`` rather than as a lambda bound to a name, so the
    function carries a real ``__name__`` (useful in tracebacks) and can hold
    this docstring.
    """
    return x**2

In [4]:
# Quick local sanity check of the helper before handing it to Spark.
sq(2)

4

In [5]:
# map applies `sq` to every element of `myrdd` and yields a new RDD.
squared = myrdd.map(sq)
# Displaying the result shows another RDD handle; no values are listed here.
squared

PythonRDD[1] at RDD at PythonRDD.scala:53

In [7]:
# collect() brings the squared values back to the driver as a Python list.
squared.collect()

[0,
 1,
 4,
 9,
 16,
 25,
 36,
 49,
 64,
 81,
 100,
 121,
 144,
 169,
 196,
 225,
 256,
 289,
 324,
 361]

## FlatMap

In [8]:
# map keeps a one-to-one correspondence: each input number becomes exactly
# one output element, here a (square, cube) tuple.
withmap = myrdd.map(lambda n: (n ** 2, n ** 3))
withmap.collect()

[(0, 0),
 (1, 1),
 (4, 8),
 (9, 27),
 (16, 64),
 (25, 125),
 (36, 216),
 (49, 343),
 (64, 512),
 (81, 729),
 (100, 1000),
 (121, 1331),
 (144, 1728),
 (169, 2197),
 (196, 2744),
 (225, 3375),
 (256, 4096),
 (289, 4913),
 (324, 5832),
 (361, 6859)]

In [9]:
# Same function as in the map example, but flatMap unpacks each returned
# tuple, so the square and the cube become two separate elements of the RDD.
withflatmap = myrdd.flatMap(lambda n: (n ** 2, n ** 3))
withflatmap.collect()

[0,
 0,
 1,
 1,
 4,
 8,
 9,
 27,
 16,
 64,
 25,
 125,
 36,
 216,
 49,
 343,
 64,
 512,
 81,
 729,
 100,
 1000,
 121,
 1331,
 144,
 1728,
 169,
 2197,
 196,
 2744,
 225,
 3375,
 256,
 4096,
 289,
 4913,
 324,
 5832,
 361,
 6859]

In [10]:
# Three verses of Italian text, one string per RDD element.
divina = [
    "Nel mezzo del cammin di nostra vita",
    "mi ritrovai per una selva oscura",
    "ché la diritta via era smarrita.",
]
divinardd = sc.parallelize(divina)
# Each verse is split on whitespace and flatMap concatenates the per-verse
# word lists into one flat RDD of words. Punctuation stays attached to the
# word it follows (e.g. the trailing full stop of the last verse).
words = divinardd.flatMap(lambda verse: verse.split())
words.collect()

['Nel',
 'mezzo',
 'del',
 'cammin',
 'di',
 'nostra',
 'vita',
 'mi',
 'ritrovai',
 'per',
 'una',
 'selva',
 'oscura',
 'ché',
 'la',
 'diritta',
 'via',
 'era',
 'smarrita.']

## Filter

In [11]:
# Keep only the elements for which the predicate is truthy: a zero remainder
# modulo 2 means the number is even.
even = myrdd.filter(lambda n: not n % 2)
even.collect()

[0, 2, 4, 6, 8, 10, 12, 14, 16, 18]

## Sample

In [22]:
# Draw a random subset of `myrdd` without replacement.
# `fraction` is the expected share of elements to keep, not an exact count,
# so the size of the result may deviate from 20% of the input.
# A fixed seed makes the draw repeatable under Restart & Run All; re-run this
# cell to refresh any output recorded from an earlier, unseeded execution.
sample = myrdd.sample(withReplacement=False, fraction=0.20, seed=42)
sample.collect()

[6, 7, 8, 14, 19]

## Distinct

In [26]:
# distinct() drops repeated values; as the output shows, the surviving
# elements do not come back in their original order.
distinct = sc.parallelize([1, 2, 2, 3, 3, 3, 4, 4, 4, 4]).distinct()
distinct.collect()

[4, 1, 2, 3]

## Set operations

In [27]:
# Second RDD (10..29) that overlaps `myrdd` (0..19) on the values 10..19.
myrdd2 = sc.parallelize(range(10, 30))
# union concatenates both RDDs and keeps duplicates: the overlapping values
# 10..19 appear twice in the collected result.
union = myrdd.union(myrdd2)
union.collect()



[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29]

In [29]:
# Elements present in both RDDs (10..19). The collected values are not
# sorted, so do not rely on their order.
intersection = myrdd.intersection(myrdd2)
intersection.collect()

                                                                                

[16, 17, 10, 18, 11, 19, 12, 13, 14, 15]

In [30]:
# Elements of `myrdd` that do not occur in `myrdd2` (0..9); as with
# intersection, the collected order is not sorted.
subtract = myrdd.subtract(myrdd2)
subtract.collect()

                                                                                

[0, 8, 1, 9, 2, 3, 4, 5, 6, 7]

In [33]:
# Every (a, b) pair with a taken from `myrdd` and b taken from `myrdd2`.
cartesian = myrdd.cartesian(myrdd2)
# collect() pulls all pairs to the driver: 20 x 20 = 400 tuples here. The
# result size is the product of the two input sizes, so on larger RDDs
# inspect only a few elements instead of collecting everything.
cartesian.collect()

                                                                                

[(0, 10),
 (1, 10),
 (2, 10),
 (3, 10),
 (4, 10),
 (0, 11),
 (0, 12),
 (1, 11),
 (1, 12),
 (2, 11),
 (2, 12),
 (3, 11),
 (3, 12),
 (4, 11),
 (4, 12),
 (0, 13),
 (0, 14),
 (1, 13),
 (1, 14),
 (2, 13),
 (2, 14),
 (3, 13),
 (3, 14),
 (4, 13),
 (4, 14),
 (0, 15),
 (1, 15),
 (2, 15),
 (3, 15),
 (4, 15),
 (0, 16),
 (0, 17),
 (1, 16),
 (1, 17),
 (2, 16),
 (2, 17),
 (3, 16),
 (3, 17),
 (4, 16),
 (4, 17),
 (0, 18),
 (0, 19),
 (1, 18),
 (1, 19),
 (2, 18),
 (2, 19),
 (3, 18),
 (3, 19),
 (4, 18),
 (4, 19),
 (0, 20),
 (1, 20),
 (2, 20),
 (3, 20),
 (4, 20),
 (0, 21),
 (0, 22),
 (1, 21),
 (1, 22),
 (2, 21),
 (2, 22),
 (3, 21),
 (3, 22),
 (4, 21),
 (4, 22),
 (0, 23),
 (0, 24),
 (1, 23),
 (1, 24),
 (2, 23),
 (2, 24),
 (3, 23),
 (3, 24),
 (4, 23),
 (4, 24),
 (0, 25),
 (1, 25),
 (2, 25),
 (3, 25),
 (4, 25),
 (0, 26),
 (0, 27),
 (1, 26),
 (1, 27),
 (2, 26),
 (2, 27),
 (3, 26),
 (3, 27),
 (4, 26),
 (4, 27),
 (0, 28),
 (0, 29),
 (1, 28),
 (1, 29),
 (2, 28),
 (2, 29),
 (3, 28),
 (3, 29),
 (4, 28),
 (4, 29),


## Actions

In [37]:
# count() is an action that returns the number of elements; for the
# cartesian product of two 20-element RDDs that is 20 * 20 = 400.
myrdd.cartesian(myrdd2).count()

                                                                                

400

In [38]:
# countByValue() returns a dict-like mapping from each distinct value to its
# number of occurrences. 0, 1 and 64 are counted twice because they show up
# both as a square and as a cube (0**2 == 0**3, 1**2 == 1**3, 8**2 == 4**3).
withflatmap.countByValue()

defaultdict(int,
            {0: 2,
             1: 2,
             4: 1,
             8: 1,
             9: 1,
             27: 1,
             16: 1,
             64: 2,
             25: 1,
             125: 1,
             36: 1,
             216: 1,
             49: 1,
             343: 1,
             512: 1,
             81: 1,
             729: 1,
             100: 1,
             1000: 1,
             121: 1,
             1331: 1,
             144: 1,
             1728: 1,
             169: 1,
             2197: 1,
             196: 1,
             2744: 1,
             225: 1,
             3375: 1,
             256: 1,
             4096: 1,
             289: 1,
             4913: 1,
             324: 1,
             5832: 1,
             361: 1,
             6859: 1})

In [40]:
# take(3) returns the first three elements of the RDD as a list.
myrdd.take(3)

[0, 1, 2]

In [41]:
# top(3) returns the three largest elements, in descending order.
myrdd.top(3)

[19, 18, 17]