In [1]:
# Run findspark.init() ahead of the pyspark import in the next cell.
# NOTE(review): presumably this makes the local Spark install importable — confirm for this environment.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
# Build (or reuse, via getOrCreate) a Spark session that runs locally with one worker thread.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Distribute a small Python list as an RDD; the bare name on the last line
# makes the notebook display the RDD's repr.
my_list = [11, 12, 17, 14, 10, 13]
rdd = spark.sparkContext.parallelize(my_list)
rdd

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274

In [3]:
# Short handle to the session's SparkContext, used by the RDD examples that follow.
sc = spark.sparkContext
# Rebinds `rdd` to a new RDD of integers split across four partitions.
rdd = sc.parallelize([1, 2, 3, 4, 4, 4, 2], numSlices=4)

In [4]:
# Report how many partitions the current `rdd` is split into.
print("created numbers rdd with %d partitions" % rdd.getNumPartitions())

created numbers rdd with 4 partitions


In [5]:
# Tokenise the sample sentence on single spaces; the lone ":" becomes its own token.
myCollection = (
    "Spark Unified Computation Engine : Big Data Processing Made Simple"
    .split(" ")
)
myCollection

['Spark',
 'Unified',
 'Computation',
 'Engine',
 ':',
 'Big',
 'Data',
 'Processing',
 'Made',
 'Simple']

In [6]:
# Spread the word list over two partitions, then report the resulting partition count.
words = sc.parallelize(myCollection, numSlices=2)
print("created words rdd with %d partitions" % words.getNumPartitions())

created words rdd with 2 partitions


In [7]:
def startsWithS(individual):
  """Return True when `individual` begins with an uppercase "S", else False."""
  has_s_prefix = individual.startswith("S")
  return has_s_prefix

In [8]:
# Apply filter transformation: keep only the words accepted by startsWithS.
filteredWords = words.filter(startsWithS)
filteredWords.collect()

['Spark', 'Simple']

In [9]:
# Apply map transformation: each word becomes a
# (word, first character, starts-with-"S" flag) tuple.
mapTransformation = words.map(
    lambda word: (
        word,
        word[0],
        word.startswith("S"),
    )
)
display(mapTransformation.collect())

[('Spark', 'S', True),
 ('Unified', 'U', False),
 ('Computation', 'C', False),
 ('Engine', 'E', False),
 (':', ':', False),
 ('Big', 'B', False),
 ('Data', 'D', False),
 ('Processing', 'P', False),
 ('Made', 'M', False),
 ('Simple', 'S', True)]

In [10]:
# Keep only the tuples whose third field (the starts-with-"S" flag) is truthy,
# then show at most five of them. This rebinds `filteredWords`.
filteredWords = mapTransformation.filter(lambda row: row[2])
display(filteredWords.take(5))

[('Spark', 'S', True), ('Simple', 'S', True)]

In [11]:
# Perform sorting: longest words first, using the negated length as the sort key.
words.sortBy(lambda w: -len(w)).collect()

['Computation',
 'Processing',
 'Unified',
 'Engine',
 'Simple',
 'Spark',
 'Data',
 'Made',
 'Big',
 ':']

In [12]:
# COMMAND ----------

# Sort by descending word length and fetch only the first two results.
words.sortBy(lambda w: -len(w)).take(2)

['Computation', 'Processing']

In [13]:
# Perform reduce operation
# Sum of all numbers from 1 to 20
# Check: S_n = n * (a_1 + a_n) / 2 (sum of an arithmetic progression) = 20 * (1 + 20) / 2 = 210

print(
    "Sum of numbers from 1 to 20 is: %d"
    % sc.parallelize(range(1, 21)).reduce(lambda left, right: left + right)
)

Sum of numbers from 1 to 20 is: 210


In [14]:
# COMMAND ----------

# Get the word with the highest length

def wordLengthReducer(leftWord, rightWord):
  """Pick the longer of two words.

  leftWord is returned only when it is strictly longer; on equal lengths
  rightWord is returned.
  """
  return leftWord if len(leftWord) > len(rightWord) else rightWord

# Fold the words pairwise with wordLengthReducer, leaving a single longest word.
words.reduce(wordLengthReducer)

'Computation'

In [15]:
# COMMAND ----------

# Apply a function for every partition

def processingFunction(iter):
  """Wrap one partition's elements between a start and a finish marker.

  Args:
    iter: iterable of sized elements (the parameter name shadows the builtin
      but is kept so the signature stays unchanged).

  Returns:
    list: the start marker string, one (len(element), element) tuple per
    element in iteration order, then the finish marker string.
  """
  return [
    "Starting the processing...",
    *((len(item), item) for item in iter),
    "Finishing the processing!",
  ]

# processingFunction is applied per partition, so its start/finish markers
# show up once for each partition in the collected output.
transformedWords = words.mapPartitions(processingFunction)
for i in transformedWords.collect():
  print(i)


Starting the processing...
(5, 'Spark')
(7, 'Unified')
(11, 'Computation')
(6, 'Engine')
(1, ':')
Finishing the processing!
Starting the processing...
(3, 'Big')
(4, 'Data')
(10, 'Processing')
(4, 'Made')
(6, 'Simple')
Finishing the processing!


In [17]:
# Print every element of the `words` RDD, one per line.
for i in words.collect():
    print(i)

Spark
Unified
Computation
Engine
:
Big
Data
Processing
Made
Simple


In [18]:
# Apply a function for every partition, using the information about the index of the partition

def indexedFunc(partitionIndex, withinPartitionIterator):
  """Label each element of a partition with that partition's index.

  Args:
    partitionIndex: index of the partition being processed.
    withinPartitionIterator: iterable over the partition's elements.

  Returns:
    list of strings of the form "partition: <index> => <element>", in
    iteration order.
  """
  template = "partition: {} => {}"
  labelled = []
  for element in withinPartitionIterator:
    labelled.append(template.format(partitionIndex, element))
  return labelled

# Tag every word with the index of the partition that holds it.
words.mapPartitionsWithIndex(indexedFunc).collect()

['partition: 0 => Spark',
 'partition: 0 => Unified',
 'partition: 0 => Computation',
 'partition: 0 => Engine',
 'partition: 0 => :',
 'partition: 1 => Big',
 'partition: 1 => Data',
 'partition: 1 => Processing',
 'partition: 1 => Made',
 'partition: 1 => Simple']

In [19]:
# Small numeric RDD used to demonstrate common RDD actions.
# Note: this rebinds `rdd`, replacing the RDD bound to that name by earlier cells.
rdd = sc.parallelize([100, 2, 3, 3, 410, 3, 3, 3, 4, 104, 2])
print("Entire rdd: %s" % rdd.collect())
print("Number of elements in the rdd: %d" % rdd.count())
print("Distinct elements in the rdd: %s" % rdd.distinct().collect())
print("First element in the rdd: %d" % rdd.first())
# take(2) returns the first two elements in order, not a random sample
# (RDD.takeSample is the sampling action), so the label says "First".
print("First two elements in the rdd: %s" % rdd.take(2))
print("Frequency of each element in the rdd: %s" % rdd.countByValue())
print("Maximum element in the rdd: %s" % rdd.max())
print("Minimum element in the rdd: %s" % rdd.min())
# takeOrdered sorts ascending by default; negating the key yields the largest values.
print("Bottom 2 elements in the rdd: %s" % rdd.takeOrdered(2))
print("Top 2 elements in the rdd: %s" % rdd.takeOrdered(2, key = lambda x: -x))

Entire rdd: [100, 2, 3, 3, 410, 3, 3, 3, 4, 104, 2]
Number of elements in the rdd: 11
Distinct elements in the rdd: [100, 2, 3, 410, 4, 104]
First element in the rdd: 100
Random two elements in the rdd: [100, 2]
Frequency of each element in the rdd: defaultdict(<class 'int'>, {100: 1, 2: 2, 3: 5, 410: 1, 4: 1, 104: 1})
Maximum element in the rdd: 410
Minimum element in the rdd: 2
Bottom 2 elements in the rdd: [2, 2]
Top 2 elements in the rdd: [410, 104]
