In [1]:
import math
from pyspark import SparkContext
# Reuse the already-running SparkContext if there is one, so that re-running
# this cell does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()


70709.97100833799


In [2]:
# Build an RDD holding 0..99_999, split across 100 partitions.
nums = sc.parallelize(range(100_000), numSlices=100)
# Double every value.
doubled = nums.map(lambda n: n * 2)
# Keep only the doubled values that are multiples of 4 ...
multiples_of_four = doubled.filter(lambda value: value % 4 == 0)
# ... and add them all up pairwise with reduce.
total = multiples_of_four.reduce(lambda left, right: left + right)
print(math.sqrt(total))

70709.97100833799


## Converting from PySpark to a conventional Python function

Keep in mind that the PySpark (map/reduce) approach is still faster on large datasets!

In [9]:
def big_sum(ls):
    '''Plain-Python version of the Spark pipeline, example 1 (comprehension).

    Doubles every element of ``ls``, keeps the doubled values that are
    divisible by 4, sums them, and returns the square root of that sum.
    '''
    doubled = (x * 2 for x in ls)
    kept = [value for value in doubled if value % 4 == 0]
    return math.sqrt(sum(kept))

big_sum(range(100_000))

70709.97100833799

In [10]:
def f(ls):
    '''Plain-Python version of the Spark pipeline, example 2 (explicit loop).

    Doubles every element of ``ls``, adds up the doubled values that are
    divisible by 4, and returns the square root of that running total.
    '''
    running_total = 0
    for item in ls:
        twice = item * 2
        # skip values whose double is not a multiple of 4
        if twice % 4 != 0:
            continue
        running_total += twice
    return math.sqrt(running_total)
    
    
print(f(range(100000)))

70709.97100833799


### Difference Between `map` and `flatMap`

In [11]:
values = sc.parallelize([1, 2, 3, 4], 2)
# map produces exactly one output element per input element, so each number n
# becomes a single range(n) object and the result is a list of ranges:
# [range(0, 1), range(0, 2), range(0, 3), range(0, 4)]
# (written out, that is the nested list [[0], [0, 1], [0, 1, 2], [0, 1, 2, 3]])
mapped = values.map(range).collect()
print(mapped)
# flatMap splices the items of every range(n) into one flat, 1D list:
# [0, 0, 1, 0, 1, 2, 0, 1, 2, 3]
flattened = values.flatMap(range).collect()
print(flattened)

[range(0, 1), range(0, 2), range(0, 3), range(0, 4)]
[0, 0, 1, 0, 1, 2, 0, 1, 2, 3]
