In [1]:
from pyspark.sql import SparkSession

# Single shared session for the whole notebook; getOrCreate() hands back an
# already-running session when there is one instead of starting a second.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

In [2]:
# Five short text lines; several repeat, so the word count further down has
# duplicates to merge.
data = [
    "Project Gutenberg’s",
    "Alice’s Adventures in Wonderland",
    "Project Gutenberg’s",
    "Adventures in Wonderland",
    "Project Gutenberg’s",
]
# Turn the local Python list into an RDD of strings.
rdd = spark.sparkContext.parallelize(data)

# collect() brings every element back to the driver, which is fine for a
# dataset this small.
for element in rdd.collect():
    print(element)

Project Gutenberg’s
Alice’s Adventures in Wonderland
Project Gutenberg’s
Adventures in Wonderland
Project Gutenberg’s


# flatMap

In [3]:
# Split every line on single spaces; flatMap flattens the per-line word lists
# into one RDD holding the individual words.
rdd2 = rdd.flatMap(lambda line: line.split(" "))
for element in rdd2.collect():
    print(element)

Project
Gutenberg’s
Alice’s
Adventures
in
Wonderland
Project
Gutenberg’s
Adventures
in
Wonderland
Project
Gutenberg’s


# map

In [4]:
# Pair each word with an initial count of 1, giving (word, 1) tuples that can
# be aggregated by key.
rdd3 = rdd2.map(lambda word: (word, 1))
for element in rdd3.collect():
    print(element)

('Project', 1)
('Gutenberg’s', 1)
('Alice’s', 1)
('Adventures', 1)
('in', 1)
('Wonderland', 1)
('Project', 1)
('Gutenberg’s', 1)
('Adventures', 1)
('in', 1)
('Wonderland', 1)
('Project', 1)
('Gutenberg’s', 1)


# reduceByKey

In [5]:
# Add up the 1s belonging to each word to obtain (word, total) pairs.
# The collected result is not sorted.
rdd4 = rdd3.reduceByKey(lambda total, count: total + count)
for element in rdd4.collect():
    print(element)

('Gutenberg’s', 3)
('Adventures', 2)
('Wonderland', 2)
('Alice’s', 1)
('in', 2)
('Project', 3)


# map + sortByKey

In [6]:
# Swap (word, count) into (count, word) so the count becomes the key, then
# order the pairs by that key.
rdd5 = (
    rdd4
    .map(lambda pair: (pair[1], pair[0]))
    .sortByKey()
)
for element in rdd5.collect():
    print(element)

(1, 'Alice’s')
(2, 'Adventures')
(2, 'Wonderland')
(2, 'in')
(3, 'Gutenberg’s')
(3, 'Project')


# filter

In [7]:
# Keep only the (count, word) pairs whose word contains a lowercase 'a'.
# The substring test is case-sensitive, so a capital 'A' does not match.
rdd6 = rdd5.filter(lambda pair: 'a' in pair[1])
for element in rdd6.collect():
    print(element)

(2, 'Wonderland')


In [8]:
from pyspark.sql.functions import col, expr

# Each row pairs a 'yyyy-MM-dd' date string with a number of months to add.
data = [("2019-01-23", 1), ("2019-06-24", 2), ("2019-09-20", 3)]

# The SQL expression parses the string with to_date, casts the increment to
# int, and passes both to add_months; the result is exposed as "inc_date".
(
    spark.createDataFrame(data)
    .toDF("date", "increment")
    .select(
        col("date"),
        col("increment"),
        expr("add_months(to_date(date,'yyyy-MM-dd'),cast(increment as int))").alias("inc_date"),
    )
    .show()
)

+----------+---------+----------+
|      date|increment|  inc_date|
+----------+---------+----------+
|2019-01-23|        1|2019-02-23|
|2019-06-24|        2|2019-08-24|
|2019-09-20|        3|2019-12-20|
+----------+---------+----------+

