In [1]:
import findspark

In [2]:
findspark.init()

In [3]:
import pyspark

In [4]:
from pyspark.sql import SparkSession

In [5]:
# Build (or reuse) a local SparkSession that uses every available core.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('RDD_Transformations_Actions')
    .getOrCreate()
)

#### http://localhost:4040/jobs/ (port may vary)

In [6]:
sc = spark.sparkContext

## Create RDD

### From file

In [7]:
lines_rdd = sc.textFile('file:///home/rameshbabug/Documents/projects/internal/spark-playground/src/data/sample.log')

### From List

In [8]:
list = ['Data Engineer', 'Software Engineer', 'Data Analyst', 'Data Scientist', 'UI Developer', 'QA Engineer']

In [9]:
role_rdd = sc.parallelize(list, 2)

## Transformations

### Map

#### Map transformation returns a new RDD by applying a function to each element of this RDD

#### e.g. 1

In [10]:
number_list = [1,2,3,4,5]

In [11]:
number_rdd = sc.parallelize(number_list, 2)

In [12]:
number_sqrt_rdd = number_rdd.map(lambda n: n*n)

#### map transformation will operate on element by element. Number of inputs = Number of outputs. Here we are using anonymous function on map transformation i.e. lambda

In [13]:
number_sqrt_rdd.collect()

[1, 4, 9, 16, 25]

#### e.g. 2

In [14]:
str_list = ['Data Engineer', 'Software Engineer', 'Data Analyst', 'Data Scientist', 'UI Developer', 'QA Engineer']

In [15]:
str_rdd = sc.parallelize(str_list, 2)

In [16]:
def toUpper(element):
    """Return ``element`` upper-cased; ``None`` is passed through unchanged."""
    return None if element is None else element.upper()

In [17]:
upper_str_rdd = str_rdd.map(toUpper)

#### Here we pass a named function to the map transformation; it is applied to the RDD element by element

#### e.g.3

In [18]:
emp_data_rdd = sc.textFile('file:///home/rameshbabug/Documents/projects/internal/spark-playground/src/data/emp_data.csv')

In [20]:
print("Input RDD Rows Count: {}".format(emp_data_rdd.count()))

Input RDD Rows Count: 7


In [21]:
transformed_rdd = emp_data_rdd.map(lambda x: x.split(","))

In [22]:
transformed_rdd.collect()

[['ID', 'NAME', 'SALARY', 'CITY'],
 ['1', 'ABC', '2000', 'Hyd'],
 ['2', 'DEF', '3000', 'Bang'],
 ['3', 'GHI', '4000', 'Pune'],
 ['4', 'PQR', '5000', 'Delhi'],
 ['5', 'XYZ', '3500', 'Hyd'],
 ['6', 'PQR', '4500', 'Pune']]

In [23]:
print("Transformed RDD Rows Count: {}".format(transformed_rdd.count()))

Transformed RDD Rows Count: 7


### Flat Map

#### flatMap is similar to map, because it applies a function to all elements in a RDD. But, flatMap flattens the results

#### The function passed to flatMap returns a sequence for each input element (which is then flattened), whereas the function passed to map returns a single element

#### N input elements can produce M output elements (M does not have to equal N)

#### e.g.1

In [31]:
print("Input RDD Rows Count: {}".format(emp_data_rdd.count()))

Input RDD Rows Count: 7


In [32]:
transformed_rdd = emp_data_rdd.flatMap(lambda x: x.split(","))

In [33]:
print(transformed_rdd.collect())

['ID', 'NAME', 'SALARY', 'CITY', '1', 'ABC', '2000', 'Hyd', '2', 'DEF', '3000', 'Bang', '3', 'GHI', '4000', 'Pune', '4', 'PQR', '5000', 'Delhi', '5', 'XYZ', '3500', 'Hyd', '6', 'PQR', '4500', 'Pune']


In [34]:
print("Transformed RDD Rows Count: {}".format(transformed_rdd.count()))

Transformed RDD Rows Count: 28


#### e.g.2

In [24]:
lines_rdd = sc.textFile('file:///home/rameshbabug/Documents/projects/internal/spark-playground/src/data/simple_text.txt')

In [25]:
words_rdd = lines_rdd.flatMap(lambda line: line.split(" "))

In [26]:
words_rdd.take(5)

['2012-02-03', '18:35:34', 'SampleClass6', '[INFO]', 'everything']

In [27]:
list = ["1,2,3", "4,5,6", "7,8,9"]

In [28]:
numbers_rdd = sc.parallelize(list,3)

In [29]:
numbers_rdd_flat = numbers_rdd.flatMap(lambda str_number: str_number.split(","))

In [30]:
numbers_rdd_flat.collect()

['1', '2', '3', '4', '5', '6', '7', '8', '9']

### Filter

#### Create a new RDD by returning only the elements that satisfy the filter condition

#### e.g. 1

In [35]:
str_list = ['Data Engineer', 'Software Engineer', 'Data Analyst', 'Data Scientist', 'UI Developer', 'QA Engineer']

In [36]:
str_rdd = sc.parallelize(str_list, 2)

In [37]:
data_str_list = str_rdd.filter(lambda line: 'Data' in line)

In [38]:
data_str_list.collect()

['Data Engineer', 'Data Analyst', 'Data Scientist']

#### e.g.2

In [39]:
transformed_rdd = emp_data_rdd.filter(lambda x: 'Hyd' in x)

In [40]:
print(transformed_rdd.collect())

['1,ABC,2000,Hyd', '5,XYZ,3500,Hyd']


### Map Partitions

#### If we want to operate at the partition level instead of the element level, we can use mapPartitions. Because mapPartitions works on a whole partition at a time, it can also give better performance. Its input is the set of current partitions and its output is another set of partitions

#### mapPartitions() can be used as an alternative to map() and foreach(). mapPartitions() is called once for each partition, while map() and foreach() are called for each element in an RDD

#### Spark mapPartitions() provides a facility to do heavy initialization (for example, a database connection) once for each partition instead of doing it for every DataFrame row. This helps the performance of the job when you are dealing with heavyweight initialization on larger datasets.

In [45]:
def process_partition_sum(partition):
    """Yield the sum of all elements in one partition.

    Written for use with ``RDD.mapPartitions``: it consumes the partition's
    iterator and emits exactly one value, so the resulting RDD holds one
    total per partition.

    Args:
        partition: Iterable of values that can be added to an int.

    Yields:
        The accumulated total of the partition (``0`` if it is empty).
    """
    # Named ``total`` rather than ``sum`` so the built-in is not shadowed.
    total = 0
    for element in partition:
        total = total + element
    yield total
    

In [46]:
def process_partition(partition):
    """Yield a single value: the built-in ``sum`` of the partition's elements."""
    partition_total = sum(partition)
    yield partition_total

In [47]:
def process_partition_size(partition):
    """Yield one list holding ``len()`` of every element in the partition.

    The whole partition is summarised by a single list, so the output has
    one entry per partition rather than one per element.
    """
    yield [len(item) for item in partition]

In [59]:
def filter_out_2_from_partition(list_of_lists):
    """Return an iterator over copies of each sub-list with every ``2`` removed.

    All sub-lists of the partition are processed up front; the returned
    iterator walks the already-built results.
    """
    cleaned = [
        [value for value in sub_list if value != 2]
        for sub_list in list_of_lists
    ]
    return iter(cleaned)

#### e.g.1

In [41]:
number_list = [1,2,3,4,5,6,7,8,9]

In [42]:
num_rdd = sc.parallelize(number_list, 3)

In [43]:
num_rdd.getNumPartitions()

3

In [48]:
num_process_rdd = num_rdd.mapPartitions(process_partition_sum)

In [49]:
num_process_rdd.collect() # 1 + 2+ 3 =6, 4+5+6=15, 7+8+9=24

[6, 15, 24]

In [50]:
num_process_rdd = num_rdd.mapPartitions(process_partition)

In [51]:
num_process_rdd.collect()

[6, 15, 24]

#### e.g.2

In [52]:
text_rdd = sc.textFile('file:///home/rameshbabug/Documents/projects/internal/spark-playground/src/data/simple_text.txt', )

In [53]:
text_rdd.getNumPartitions()

2

In [54]:
text_rdd.collect()

['2012-02-03 18:35:34 SampleClass6 [INFO] everything normal for id 577725851',
 '2012-02-03 18:35:34 SampleClass4 [FATAL] system problem at id 1991281254',
 '2012-02-03 18:35:34 SampleClass3 [DEBUG] detail for id 1304807656',
 '2012-02-03 18:35:34 SampleClass3 [WARN] missing id 423340895',
 '2012-02-03 18:35:34 SampleClass5 [TRACE] verbose detail for id 2082654978',
 '2012-02-03 18:35:34 SampleClass0 [ERROR] incorrect id  1886438513',
 '2012-02-03 18:35:34 SampleClass9 [TRACE] verbose detail for id 438634209',
 '2012-02-03 18:35:34 SampleClass8 [DEBUG] detail for id 2074121310',
 '2012-02-03 18:35:34 SampleClass0 [TRACE] verbose detail for id 1505582508',
 '2012-02-03 18:35:34 SampleClass0 [TRACE] verbose detail for id 1903854437',
 '2012-02-03 18:35:34 SampleClass7 [DEBUG] detail for id 915853141']

In [55]:
str_overview_rdd = text_rdd.mapPartitions(process_partition_size)

In [56]:
str_overview_rdd.collect()

[[74, 72, 65, 60, 73, 65], [72, 65, 73, 73, 64]]

#### e.g.3

In [57]:
number_list = [ [1, 2, 3], [3, 2, 4], [5, 2, 7] ]

In [58]:
number_list_rdd = sc.parallelize(number_list)

In [60]:
filtered_lists = number_list_rdd.mapPartitions(filter_out_2_from_partition)

In [61]:
print("Filtered List: {}".format(filtered_lists.collect()))

Filtered List: [[1, 3], [3, 4], [5, 7]]


### mapPartitionsWithIndex

#### Similar to mapPartitions, but also provides a function with an int value to indicate the index position of the partition.

#### https://stackoverflow.com/questions/33655920/when-to-use-mapparitions-and-mappartitionswithindex

In [66]:
def process_partition_sum_with_index(index, partition):
    """Yield ``(index, total)`` for one partition.

    Written for use with ``RDD.mapPartitionsWithIndex``, which supplies the
    partition's index alongside its iterator.

    Args:
        index: Index of the partition being processed.
        partition: Iterable of values that can be added to an int.

    Yields:
        A single ``(index, total)`` tuple; ``total`` is ``0`` for an empty
        partition.
    """
    # Use the built-in sum() directly instead of rebinding the name ``sum``
    # to a hand-rolled local accumulator.
    yield (index, sum(partition))

In [62]:
number_list = [1, 2, 3, 4, 5, 6, 7, 8, 9]

In [63]:
num_rdd = sc.parallelize(number_list, 3)

In [64]:
print("Input RDD: {}".format(num_rdd.collect()))

Input RDD: [1, 2, 3, 4, 5, 6, 7, 8, 9]


In [65]:
print("Number of partitions: {}".format(num_rdd.getNumPartitions()))

Number of partitions: 3


In [67]:
num_process_rdd = num_rdd.mapPartitionsWithIndex(process_partition_sum_with_index)

In [68]:
print("Processed RDD With Index: {}".format(num_process_rdd.collect()))

Processed RDD With Index: [(0, 6), (1, 15), (2, 24)]


### Sampling on RDD

#### Return a random sample subset RDD of the input RDD

#### e.g.1

In [69]:
number_list = [1,2,3,4,5,6,7,8,9]

In [70]:
num_rdd = sc.parallelize(number_list, 3)

In [71]:
num_rdd.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [72]:
sample_num_rdd = num_rdd.sample(withReplacement=False, fraction=0.5, seed=1)

In [73]:
sample_num_rdd.collect()

[1, 3, 5, 8]

#### e.g.2

In [74]:
text_rdd = sc.textFile('file:///home/rameshbabug/Documents/projects/internal/spark-playground/src/data/simple_text.txt')

In [75]:
text_rdd.collect()

['2012-02-03 18:35:34 SampleClass6 [INFO] everything normal for id 577725851',
 '2012-02-03 18:35:34 SampleClass4 [FATAL] system problem at id 1991281254',
 '2012-02-03 18:35:34 SampleClass3 [DEBUG] detail for id 1304807656',
 '2012-02-03 18:35:34 SampleClass3 [WARN] missing id 423340895',
 '2012-02-03 18:35:34 SampleClass5 [TRACE] verbose detail for id 2082654978',
 '2012-02-03 18:35:34 SampleClass0 [ERROR] incorrect id  1886438513',
 '2012-02-03 18:35:34 SampleClass9 [TRACE] verbose detail for id 438634209',
 '2012-02-03 18:35:34 SampleClass8 [DEBUG] detail for id 2074121310',
 '2012-02-03 18:35:34 SampleClass0 [TRACE] verbose detail for id 1505582508',
 '2012-02-03 18:35:34 SampleClass0 [TRACE] verbose detail for id 1903854437',
 '2012-02-03 18:35:34 SampleClass7 [DEBUG] detail for id 915853141']

In [76]:
sample_text_rdd = text_rdd.sample(withReplacement=False, fraction=0.5, seed=1)

In [77]:
sample_text_rdd.collect()

['2012-02-03 18:35:34 SampleClass6 [INFO] everything normal for id 577725851',
 '2012-02-03 18:35:34 SampleClass3 [DEBUG] detail for id 1304807656',
 '2012-02-03 18:35:34 SampleClass8 [DEBUG] detail for id 2074121310',
 '2012-02-03 18:35:34 SampleClass7 [DEBUG] detail for id 915853141']

### Union

#### Return the union of two RDDs

In [78]:
num_rdd1 = sc.parallelize([1,2,3,4,5],2)

In [79]:
num_rdd2 = sc.parallelize([6,7,8,9,1],2)

In [80]:
num_rdd3 = num_rdd1.union(num_rdd2)

In [81]:
num_rdd3.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 1]

### Intersection

#### Similar to union but return the intersection of two RDDs

In [82]:
num_rdd1 = sc.parallelize([1,2,3,4,1])

In [83]:
num_rdd2 = sc.parallelize([6,7,8,6,1])

In [84]:
num_rdd3 = num_rdd1.intersection(num_rdd2)

In [87]:
# num_rdd3.take(2)

### Distinct

#### Return a new RDD with distinct elements within a source RDD

In [88]:
num_list = [1,2,3,4,5,6,7,8,9,1,3,6,1,9]

In [89]:
num_rdd = sc.parallelize(num_list, 3)

In [90]:
num_rdd.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 3, 6, 1, 9]

In [91]:
distinct_num_rdd = num_rdd.distinct()

In [92]:
# distinct_num_rdd.collect()

### Groupbykey

In [None]:
num_rdd = sc.parallelize(["one","two","three","four","two"])

In [None]:
num_words_rdd = num_rdd.flatMap(lambda line: line.split(","))

In [None]:
kv_num_rdd = num_words_rdd.map(lambda element: (element,1)).groupByKey()

In [None]:
# kv_num_rdd.collect()

In [None]:
# for key, value in kv_num_rdd.collect():
#    print('{0} -> {1}'.format(key, value))

In [None]:
# text_words_kv_rdd = text_words_rdd.map(lambda element: (element,1))

In [None]:
# text_words_kv_rdd.collect()

In [None]:
# TODO: unfinished cell - `kv_rdd = ` had no right-hand side (a SyntaxError), so it is commented out until the intended expression is supplied.
# kv_rdd =

In [None]:
# start_time = time.time()
# Word count with groupByKey: split lines into words, pair each word with 1,
# group the 1s per word, then add them up.
text_lines_rdd = sc.textFile('file:///home/rameshbabug/Documents/projects/internal/spark-playground/src/data/sample.log')
text_words_rdd = text_lines_rdd.flatMap(lambda line: line.split(" "))
text_words_kv_rdd = text_words_rdd.map(lambda element: (element,1))
# `lambda (x,y): ...` is Python 2-only syntax (tuple parameters were removed
# in Python 3); mapValues(sum) adds up each key's grouped counts instead.
result_words_kv_rdd = text_words_kv_rdd.groupByKey().mapValues(sum)
result_words_kv_rdd.collect()
# end_time = time.time() - start_time

### Collect

In [None]:
lines_rdd = sc.textFile('file:///home/rameshbabug/Documents/projects/internal/spark-playground/src/data/sample.log')

In [None]:
info_lines_rdd = lines_rdd.filter(lambda line: 'WARN' in line)

In [None]:
info_lines_rdd.collect() # We need to be careful while using collect as it tries to load everything into driver

In [None]:
exception_lines = info_lines_rdd.filter(lambda line: 'ERROR' in line)

In [None]:
for line in exception_lines.collect():
    print(line)