In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the notebook's SparkSession: getOrCreate() hands back an already
# active session when there is one, otherwise it starts a new session
# configured with the application name below.
session_builder = SparkSession.builder.appName("higher-order function")
spark = session_builder.getOrCreate()

23/05/15 12:50:10 WARN Utils: Your hostname, wedivv-H110M-S2V resolves to a loopback address: 127.0.1.1; using 192.168.1.44 instead (on interface wlp5s0)
23/05/15 12:50:10 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/15 12:50:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
from pyspark.sql.types import *

# Single-column schema: "celsius" holds an array of integer readings per row.
celsius_readings_type = ArrayType(IntegerType())
schema = StructType([StructField("celsius", celsius_readings_type)])

In [4]:
# Two rows of input data. Each row is a one-element list whose only value is
# that row's array of Celsius readings (matching the one-column schema).
t_list = (
    [[35, 36, 32, 30, 40, 42, 38]],
    [[31, 32, 34, 55, 56]],
)

# Build the DataFrame and expose it to Spark SQL under the view name "tC".
t_c = spark.createDataFrame(t_list, schema=schema)
t_c.createOrReplaceTempView("tC")

In [5]:
# Print the DataFrame with show()'s default settings spelled out: at most 20
# rows, and cell text longer than 20 characters is cut off with "...".
t_c.show(n=20, truncate=True)

                                                                                

+--------------------+
|             celsius|
+--------------------+
|[35, 36, 32, 30, ...|
|[31, 32, 34, 55, 56]|
+--------------------+



In [6]:
# transform() applies the lambda to every element of the array and returns a
# new array of the results -- here each Celsius reading is converted to
# Fahrenheit using integer division (`div`).
fahrenheit_query = """
    SELECT celsius, 
            transform(celsius, t -> ((t * 9) div 5) + 32) as fahrenheit
    FROM tC
"""
spark.sql(fahrenheit_query).show()

+--------------------+--------------------+
|             celsius|          fahrenheit|
+--------------------+--------------------+
|[35, 36, 32, 30, ...|[95, 96, 89, 86, ...|
|[31, 32, 34, 55, 56]|[87, 89, 93, 131,...|
+--------------------+--------------------+



In [7]:
# filter() keeps only the array elements for which the lambda is true --
# here the readings strictly above 38 degrees Celsius.
high_readings_query = """
    SELECT celsius,
            filter(celsius, t -> t > 38) as high
    FROM tC
"""
spark.sql(high_readings_query).show()

+--------------------+--------+
|             celsius|    high|
+--------------------+--------+
|[35, 36, 32, 30, ...|[40, 42]|
|[31, 32, 34, 55, 56]|[55, 56]|
+--------------------+--------+



In [8]:
# exists() yields a boolean per row: true when at least one array element
# satisfies the lambda -- here, any reading above 44 degrees Celsius.
threshold_query = """
    SELECT celsius,
            exists(celsius, t -> t > 44) as threshold
    FROM tC
"""
spark.sql(threshold_query).show()

+--------------------+---------+
|             celsius|threshold|
+--------------------+---------+
|[35, 36, 32, 30, ...|    false|
|[31, 32, 34, 55, 56]|     true|
+--------------------+---------+



In [9]:
# Average each row's Celsius readings and convert that average to Fahrenheit.
#
# reduce(array, start, merge, finish) folds the array into a single value:
#   * `merge` is called as (accumulator, element) -- the accumulator comes
#     FIRST. The lambda parameters are named accordingly; with the names
#     swapped the query only works by luck because `+` is commutative.
#   * `finish` receives the final accumulator and produces the result.
#
# NOTE: `div` is integer division, so the average is truncated before it is
# scaled and the scaled value is truncated again; the result is an
# approximation that can be a degree or two below the exact conversion.
spark.sql("""
    SELECT celsius,
            reduce(
                celsius,                      -- First argument: the input collection
                0,                            -- Second argument: the initial value of the accumulator
                (acc, t) -> acc + t,          -- Third argument: merge lambda (accumulator, element) that sums the Celsius temperatures
                acc -> (acc div size(celsius) * 9 div 5) + 32    -- Fourth argument: finish lambda; integer-averages the sum, then converts it to Fahrenheit
                ) as avgFahrenheit
    FROM tC
""").show()


+--------------------+-------------+
|             celsius|avgFahrenheit|
+--------------------+-------------+
|[35, 36, 32, 30, ...|           96|
|[31, 32, 34, 55, 56]|          105|
+--------------------+-------------+

