In [None]:
#Resilient Distributed Datasets (RDDs)

# RDDs are the fundamental data structure of Spark: an immutable (read-only), distributed collection of objects that is
# stored in memory or on the disks of the different machines of a cluster. Each RDD is divided into logical partitions,
# which may be processed on different nodes (machines) of the cluster in parallel.

# When we read a file with the "sc" (SparkContext) object, the result is an RDD by default.

# Since RDDs are immutable, you cannot change an original RDD, but you can create new RDDs by performing coarse-grained
# operations, like transformations, on an existing RDD.

# An RDD in Spark can be cached and used again for future transformations, which is a huge benefit for users. 

# RDDs are lazily evaluated, i.e., transformations are not computed until an action actually needs the result. This saves
# a lot of time and improves efficiency.

# RDDs are fault tolerant as well: in case of a failure, lost partitions are recomputed automatically from their lineage.

In [None]:
# Job =====> Stages =====> Tasks (each task runs a stage's transformations, e.g. map and filter, on one partition)

# When you invoke an action on an RDD, a job is created. A job is the main unit of work that is submitted to
# Spark. Jobs are divided into stages depending on how they can be carried out separately (mainly at shuffle boundaries).
# Then, these stages are divided into tasks.

# Transformations are of two types:
# 1. Narrow (e.g. map, filter, etc.)
# 2. Wide (e.g. join, groupBy, groupByKey)

# When we apply a transformation to partitioned data and the data in one partition is not transferred to (does not
# communicate with) other partitions, it is called a narrow transformation.
# Consecutive narrow transformations are all done in one stage, one after the other (map --> filter --> collect).

# Communication/transfer of data between partitions happens in wide transformations.
# Here there are multiple stages: all partitions complete stage 1, then the data is shuffled into an internal shuffled RDD,
# and the wide transformation runs on the transferred data in stage 2 (stage 1: 'map --> filter', stage 2: 'groupBy').


# Wide transformations are costly.
 
# RDDs don't have a strict schema: until the code runs, Spark doesn't know anything about the data, so it cannot optimize
# the processing. That is why Spark SQL and DataFrames come into the picture.

In [None]:
# There are two ways to create RDDs: 
# 1) parallelizing an existing collection in your driver program, 
# 2) referencing a dataset in an external storage system, such as a shared filesystem, HDFS, HBase, or any data source 
#    offering a Hadoop InputFormat.

In [1]:
import pyspark
from pyspark.sql import SparkSession
# Build (or reuse) a Spark session running locally with a single worker thread,
# then take its SparkContext, which is the entry point for the RDD API.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName('Krishna')
    .getOrCreate()
)
sc = spark.sparkContext

In [6]:
# Sample input: the integers 1 through 12 as a plain Python list.
data = list(range(1, 13))
# Distribute the local list as an RDD; no partition count is given, so Spark's default is used.
rd = sc.parallelize(data)

In [7]:
# The second argument (numSlices) is the number of partitions the data is split into; here 3.
rd1 = sc.parallelize(data, numSlices=3)

In [8]:
# Number of partitions rd1 is split into (3 was requested when it was created).
rd1.getNumPartitions()

3

In [None]:
# Create an empty RDD that is still split into partitions.
rdd2 = sc.parallelize([], numSlices=10)  # 10 partitions, all of them empty

In [58]:
e1 = sc.emptyRDD() # create an RDD that contains no elements

In [59]:
# Create RDDs from an external data source (the paths below are placeholders).
rdd2 = sc.textFile("/path/textFile.txt")
# Suggest a minimum of 4 input partitions, skip decoding the lines to unicode strings
# (use_unicode=False), then redistribute the result into 6 partitions.
rdd = sc.textFile('path', minPartitions=4, use_unicode=False).repartition(6)

EmptyRDD[30] at emptyRDD at <unknown>:0

In [60]:
e1.isEmpty() # True when the RDD contains no elements

True

In [None]:
# Reads each file as a whole: every file becomes a single (file path, file content) record in the RDD.
rdd3 = sc.wholeTextFiles("/path/textFile.txt")

In [None]:
Transformations
    map(func)
    filter(func)
    flatMap(func)
    mapPartitions(func)
    mapPartitionsWithIndex(func)
    sample(withReplacement, fraction, seed)
    union(otherDataset)
    intersection(otherDataset)
    distinct([numPartitions])
    groupByKey([numPartitions])
    reduceByKey(func, [numPartitions])
    aggregateByKey(zeroValue, seqFunc, combFunc, [numPartitions])
    sortByKey([ascending], [numPartitions])
    join(otherDataset, [numPartitions])
    cogroup(otherDataset, [numPartitions])
    cartesian(otherDataset)
    pipe(command, [envVars])
    coalesce(numPartitions)
    repartition(numPartitions)
    repartitionAndSortWithinPartitions(partitioner)

In [None]:
Actions
    reduce(func)
    collect()
    count()
    first()
    take(n)
    takeSample(withReplacement, num, [seed])
    takeOrdered(n, [ordering])
    saveAsTextFile(path)
    countByKey()
    foreach(func)

In [None]:
Persistence
    persist()
    cache()

In [None]:
unpersist()
destroy()

In [None]:
Shared variables for two common usage patterns: 
    1) broadcast variables 
    2) accumulators.
    
    sc.broadcast(v)

    sc.accumulator(v)
    

In [None]:
# repartition() - increases or decreases the number of RDD/DataFrame partitions (always performs a full shuffle)
# coalesce() - only decreases the number of partitions, in an efficient way (it avoids a full shuffle)



In [None]:
# Types of RDD:
    
# PairRDDFunctions or PairRDD – a pair RDD holds key-value pairs; this is the most commonly used RDD type.
# ShuffledRDD – 
# DoubleRDD – 
# SequenceFileRDD – 
# HadoopRDD – 
# ParallelCollectionRDD – 

In [None]:
# Shuffle Operations:
    