In [None]:
! pip3 install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 40 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 50.6 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=08834f2f05b2eded8672976e4a238a8fc619ba4eeeabf0f4b240b1610bca64f7
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


In [None]:
from pyspark import SparkContext, SparkConf

In [None]:
# Run Spark locally, using all available cores ('local[*]').
conf = (
    SparkConf()
    .setAppName('RDD Practice')
    .setMaster('local[*]')
)

# getOrCreate() reuses an already-running context, so re-executing this cell
# does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

In [None]:
# Generate random data
import random

# Fixed seed so every "Restart & Run All" draws the same sample.
SEED = 42
rng = random.Random(SEED)

# 10 distinct integers from 0..99 (sample() draws without replacement).
random_list = rng.sample(range(0, 100), 10)
print(random_list)

[81, 14, 15, 72, 83, 21, 45, 31, 49, 91]


In [None]:
# Create RDD
# parallelize() takes the data and the number of partitions (numSlices)
rdd_1 = sc.parallelize(random_list, numSlices=4)

print(rdd_1.collect())
# collect() returns the whole RDD to the driver, so use it only on small data;
# take(n) returns just the first n elements instead
print(rdd_1.take(4))

[81, 14, 15, 72, 83, 21, 45, 31, 49, 91]
[81, 14, 15, 72]


In [None]:
# Data distribution in partitions

# getNumPartitions() returns how many partitions the RDD has
print(rdd_1.getNumPartitions())

# glom() turns each partition into a list, showing how the data is distributed
print(rdd_1.glom().collect())

# only the first 2 partitions
print(rdd_1.glom().take(2))

4
[[81, 14], [15, 72], [83, 21], [45, 31, 49, 91]]
[[81, 14], [15, 72]]


In [None]:
# count() returns the number of elements in the RDD

rdd_1.count()

10

In [None]:
# actions trigger the computation on the executors and return the result to the driver

In [None]:
# first() returns the first element of the RDD
rdd_1.first()

81

In [None]:
rdd_1.top(3)

# top(n) returns the n largest elements in descending order
# per the PySpark docs the result list is loaded into the driver's memory,
# so it is meant to be used with a small n
# NOTE(review): the earlier note said the whole dataset is shipped to the
# driver and sorted there; PySpark's implementation appears to keep only the
# n largest values of each partition before merging - confirm against the docs

[91, 83, 81]

In [None]:
# distinct() removes duplicate values; the result order differs from the input order
rdd_1.distinct().collect()

[72, 81, 21, 45, 49, 14, 15, 83, 31, 91]

In [None]:
# map() applies the function to every element (here: doubling each value)
print(rdd_1.collect())
print(rdd_1.map(lambda x: x*2).collect())

[81, 14, 15, 72, 83, 21, 45, 31, 49, 91]
[162, 28, 30, 144, 166, 42, 90, 62, 98, 182]


In [None]:
# filter() keeps only the elements for which the predicate is True (multiples of 4)
rdd_filter = rdd_1.filter(lambda x: x%4==0)
# take(10) returns at most 10 elements; fewer if the RDD is smaller
rdd_filter.take(10)

[72]

In [None]:
# number of elements that passed the filter
rdd_filter.count()

1

In [None]:
# the filtered RDD still has the same number of partitions as rdd_1
rdd_filter.getNumPartitions()

4

In [None]:
# shows which partitions still hold data after the filter
rdd_filter.glom().collect()

[[], [72], [], []]

In [None]:
# coalesce(1) merges the partitions into a single one
# NOTE(review): 'repartioned' is a typo for 'repartitioned'; renaming it also
# requires updating the cell that reads this variable
repartioned_filtered_rdd = rdd_filter.coalesce(1)

In [None]:
# partition layout after coalesce(1)
repartioned_filtered_rdd.glom().collect()

[[72]]

In [None]:
# map() emits exactly one output per input, so each element becomes a 2-item list
rdd_1.map(lambda x: [x+1, x+2]).collect()

[[82, 83],
 [15, 16],
 [16, 17],
 [73, 74],
 [84, 85],
 [22, 23],
 [46, 47],
 [32, 33],
 [50, 51],
 [92, 93]]

In [None]:
# flatMap() flattens the lists returned by the function into individual elements
rdd_flatMap = rdd_1.flatMap(lambda x: [x+1, x+2])
rdd_flatMap.glom().collect()

[[82, 83, 15, 16],
 [16, 17, 73, 74],
 [84, 85, 22, 23],
 [46, 47, 32, 33, 50, 51, 92, 93]]

In [None]:
# repartition(5) redistributes the data into 5 partitions
rdd_flatMap_repartitioned = rdd_flatMap.repartition(5)

In [None]:
# partition layout after repartition(5); some partitions may end up empty
rdd_flatMap_repartitioned.glom().collect()

[[],
 [82, 83, 15, 16],
 [84, 85, 22, 23, 46, 47, 32, 33, 50, 51, 92, 93],
 [16, 17, 73, 74],
 []]

In [None]:
# reduce() combines all elements with the given function (here: their sum)
rdd_1.reduce(lambda x,y: x+y)

502

In [None]:
# partition layout of rdd_1
rdd_1.glom().collect()

[[81, 14], [15, 72], [83, 21], [45, 31, 49, 91]]

In [None]:
# mapPartitions()

def my_udf(p):
    """Yield the sum of one partition's elements.

    Written as a generator so it can be passed to mapPartitions(), which
    consumes an iterable of results per partition. An empty partition
    yields 0.

    Args:
        p: iterator over the numeric elements of a single partition.

    Yields:
        The total of the elements in ``p``.
    """
    # Use the built-in sum() instead of shadowing it with a local accumulator.
    yield sum(p)

# my_udf runs once per partition, producing one sum per partition
rdd_1.mapPartitions(my_udf).collect()

[95, 87, 104, 216]

### Advanced RDD operations

In [None]:
# two small RDDs that overlap in the values 4 and 5
r1 = sc.parallelize([1,2,3,4,5])
r2 = sc.parallelize([4,5,6,7,8])

In [None]:
# union() concatenates both RDDs and keeps duplicates (4 and 5 appear twice)
r1.union(r2).collect()

[1, 2, 3, 4, 5, 4, 5, 6, 7, 8]

In [None]:
# intersection() keeps only the elements present in both RDDs
rdd_inter = r1.intersection(r2)

In [None]:
# values common to r1 and r2
rdd_inter.collect()

[4, 5]

In [None]:
# number of partitions of the intersection result
rdd_inter.getNumPartitions()

4

In [None]:
# partition layout of the intersection result
rdd_inter.glom().collect()

[[4], [5], [], []]

In [None]:
# repartition(1) moves everything into one partition; per the PySpark docs
# repartition() always shuffles the data
rdd_repartitioned = rdd_inter.repartition(1)
rdd_repartitioned.glom().collect()

[[4, 5]]

In [None]:
# coalesce(1) gives the same layout here; per the PySpark docs coalesce()
# does not shuffle by default (shuffle=False)
rdd_repartitioned2 = rdd_inter.coalesce(1)
rdd_repartitioned2.glom().collect()

[[4, 5]]

In [None]:
# number of empty partitions
# glom() turns every partition into a list; filtering and counting happen in
# Spark, so only the final count is returned instead of every partition's
# contents being collected to the driver.
num_empty_partitions = rdd_inter.glom().filter(lambda part: len(part) == 0).count()
print(num_empty_partitions)

2


In [None]:
# takeSample() returns the sampled elements to the driver as a list
# this method should only be used if the resulting array is expected to be small
# as all data is loaded into the driver's node
# a fixed seed makes the sample reproducible across runs
r1.takeSample(False, 5, seed=42)

[3, 4, 5, 1, 2]

In [None]:
# takeOrdered(n) returns the n smallest elements to the driver as a list
r1.takeOrdered(4) # keep n small: the result list is held on the driver

[1, 2, 3, 4]

In [None]:
# negating the key reverses the ordering: the 4 largest values, descending
r1.takeOrdered(4, key = lambda x: -x) # keep n small: the result list is held on the driver

[5, 4, 3, 2]

In [None]:
# sum of all elements of r1
r1.reduce(lambda x, y : x+y)

15

In [None]:
# pair RDD of (key, value) tuples split into 2 partitions
rdd_pair = sc.parallelize([(1,3),(3,7),(3,9),(1,5), (5,6)],2)
rdd_pair.take(5)

[(1, 3), (3, 7), (3, 9), (1, 5), (5, 6)]

In [None]:
# number of partitions requested in parallelize()
rdd_pair.getNumPartitions()

2

In [None]:
# how the pairs are spread over the partitions
rdd_pair.glom().collect()

[[(1, 3), (3, 7)], [(3, 9), (1, 5), (5, 6)]]

In [None]:
# reduceByKey() merges the values of each key with the given function (here: sum per key)
rdd_pair_rbk = rdd_pair.reduceByKey(lambda x,y: x+y)
rdd_pair_rbk.collect()

[(1, 8), (3, 16), (5, 6)]

In [None]:
# partition layout after reduceByKey()
rdd_pair_rbk.glom().collect()

[[], [(1, 8), (3, 16), (5, 6)]]

In [None]:
# coalesce(1) merges the partitions into a single one
rdd_pair_rbk_repartitioned = rdd_pair_rbk.coalesce(1)

In [None]:
# partition layout after coalesce(1)
rdd_pair_rbk_repartitioned.glom().collect()

[[(1, 8), (3, 16), (5, 6)]]

In [None]:
# better visualization
import pandas as pd

# A single collect() keeps every key next to its own value and runs one Spark
# job instead of separate jobs for keys() and values().
df = pd.DataFrame(rdd_pair.collect(), columns=['key', 'value'])

df

Unnamed: 0,key,value
0,1,3
1,3,7
2,3,9
3,1,5
4,5,6


In [None]:
# pandas counterpart of reduceByKey(): group the rows by key and sum the values
df.groupby('key').sum()

Unnamed: 0_level_0,value
key,Unnamed: 1_level_1
1,8
3,16
5,6


In [None]:
# sortByKey(): sum the values per key, then return the pairs ordered by ascending key
(
    rdd_pair
    .reduceByKey(lambda x, y: x + y)
    .sortByKey(ascending=True)
    .collect()
)

[(1, 8), (3, 16), (5, 6)]

In [None]:
# countByKey() counts the elements of each key; dict() gives a plain-dict display
dict(rdd_pair.countByKey())

{1: 2, 3: 2, 5: 1}

In [None]:
# groupByKey() sends every value over the network to group it with its key,
# which generates heavy traffic
# NOT RECOMMENDED for huge data; for aggregations the PySpark docs advise
# reduceByKey() or aggregateByKey() instead
rdd_group = rdd_pair.groupByKey()
rdd_group.getNumPartitions()

2

In [None]:
# each record is a (key, iterable-of-values) pair; materialise the values for printing
for key, values in rdd_group.collect():
    print(key, list(values))

1 [3, 5]
3 [7, 9]
5 [6]


In [None]:
# Second pair RDD: the same records as rdd_pair plus one extra (5, 9) pair.
# The bare rdd_pair.collect() that preceded this line was removed: it was not
# the cell's last expression, so it ran a Spark job whose result was discarded.
rdd_pair_2 = sc.parallelize([(1, 3), (3, 7), (3, 9), (1, 5), (5, 6),(5,9)])

In [None]:
# lookup(key) returns the list of values stored under that key
rdd_pair.lookup(5)

[6]

In [None]:
# show the per-key sums again
rdd_pair_rbk.collect()

[(1, 8), (3, 16), (5, 6)]

In [None]:
# persist() without arguments marks the RDD for caching at the default storage level
rdd_pair_rbk.persist()

PythonRDD[183] at collect at <ipython-input-108-1aca81820f45>:2

In [None]:
# cache() is shorthand for persist() with the default storage level
# NOTE(review): this RDD was already persisted above, so this call looks redundant - confirm
rdd_pair_rbk.cache()

PythonRDD[183] at collect at <ipython-input-108-1aca81820f45>:2

In [None]:
# !apt-get install texlive texlive-xetex texlive-latex-extra pandoc
# !pip install pypandoc

In [None]:
# DIFFERENCE BETWEEN CACHE AND PERSIST
# https://stackoverflow.com/questions/26870537/what-is-the-difference-between-cache-and-persist#:~:text=The%20only%20difference%20between%20cache,MEMORY_ONLY_SER%2C%20MEMORY_AND_DISK_SER%2C%20DISK_ONLY).

In [None]:
# the default storage level for cache() is MEMORY_ONLY
# but with persist() we can specify the storage level
# if the data is very big and the available RAM is small
# then we can use the MEMORY_AND_DISK storage level
# to keep the data in RAM and spill what does not fit to disk

In [None]:
from pyspark import StorageLevel

# small stand-in for a large dataset, split into 4 partitions
huge_rdd = sc.parallelize([1,2,3,4,5,6,7,8,9],4)

# request the MEMORY_AND_DISK storage level explicitly instead of the default
huge_rdd.persist(StorageLevel.MEMORY_AND_DISK)

ParallelCollectionRDD[223] at readRDDFromFile at PythonRDD.scala:274

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# !cp drive/MyDrive/'Colab Notebooks'/WorkingwithRDD.ipynb ./

In [None]:
# !jupyter nbconvert --to PDF "WorkingwithRDD.ipynb"