In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import array_contains, col, array, array_contains, explode
from pyspark.sql.types import StringType, ArrayType, StructType, StructField, FloatType
from pyspark.sql import functions as F
from functools import reduce

# Obtain the notebook's Spark session (an existing one is returned if present,
# otherwise a new session is started) under the application name "example".
spark = SparkSession.builder.appName("example").getOrCreate()

In [3]:
# Array-of-strings type whose elements are declared non-null.
# NOTE(review): not referenced by the schema below, which uses the default
# ArrayType(StringType()) (containsNull=True).
arrayCol = ArrayType(StringType(), containsNull=False)

# Sample rows, in schema order:
# (name, languagesAtSchool, languagesAtWork, currentState, previousState, grade)
data = [
    ("James,,Smith", ["Java", "Scala", "C++"], ["Spark", "Java"], "OH", "CA", [1.0, 2.1, 3.2, 4.3, 5.4]),
    ("Michael,Rose,", ["Spark", "Java", "C++"], ["Spark", "Java"], "NY", "NJ", [2.0, 3.1, 4.2, 5.3, 6.4]),
    ("Robert,,Williams", ["CSharp", "VB"], ["Spark", "Python"], "UT", "NV", [3.0, 4.1, 5.2, 6.3, 7.4]),
]

# Explicit schema: every column is nullable; `grade` is an array of 32-bit floats.
schema = (
    StructType()
    .add("name", StringType(), True)
    .add("languagesAtSchool", ArrayType(StringType()), True)
    .add("languagesAtWork", ArrayType(StringType()), True)
    .add("currentState", StringType(), True)
    .add("previousState", StringType(), True)
    .add("grade", ArrayType(FloatType()), True)
)

# Materialise the DataFrame and display its structure and contents.
df = spark.createDataFrame(data, schema)
df.printSchema()
df.show()

root
 |-- name: string (nullable = true)
 |-- languagesAtSchool: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- languagesAtWork: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- currentState: string (nullable = true)
 |-- previousState: string (nullable = true)
 |-- grade: array (nullable = true)
 |    |-- element: float (containsNull = true)

+----------------+------------------+---------------+------------+-------------+--------------------+
|            name| languagesAtSchool|languagesAtWork|currentState|previousState|               grade|
+----------------+------------------+---------------+------------+-------------+--------------------+
|    James,,Smith|[Java, Scala, C++]|  [Spark, Java]|          OH|           CA|[1.0, 2.1, 3.2, 4...|
|   Michael,Rose,|[Spark, Java, C++]|  [Spark, Java]|          NY|           NJ|[2.0, 3.1, 4.2, 5...|
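|Robert,,Williams|      [CSharp, VB]|[Spark, Python]|          UT|           NV|[3.0, 4.1, 5.2, 6...|
+----------------+------------------+---------------+------------+-------------+--------------------+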

In [6]:
import numpy as np
from pyspark.sql.functions import udf

def _mean_or_none(values):
    """Return the arithmetic mean of an array column value as a Python float.

    Args:
        values: the array cell as delivered to a Python UDF (a list of numbers),
            or None when the cell itself is NULL. Individual elements may be
            None because the schema declares containsNull=True.

    Returns:
        The mean of the non-NULL elements, or None (SQL NULL) when the array is
        NULL, empty, or contains only NULL elements.
    """
    if values is None:
        return None
    # np.mean raises TypeError on None elements, so drop them first.
    present = [v for v in values if v is not None]
    if not present:
        # np.mean([]) would yield nan with a RuntimeWarning; NULL is clearer.
        return None
    return float(np.mean(present))


# Python UDF wrapper; FloatType matches the element type of `grade`.
array_mean = udf(_mean_or_none, FloatType())
df.select(array_mean("grade").alias("avg")).show()

+---+
|avg|
+---+
|3.2|
|4.2|
|5.2|
+---+



In [4]:
from pyspark.sql.functions import expr

# Average of an array column using Spark SQL's higher-order `aggregate`:
#   - start value: 0.0 as double
#   - merge:       running sum of the elements
#   - finish:      sum divided by the number of elements
# The divisor is wrapped in nullif(..., 0) so an empty array yields NULL rather
# than a division by zero (which raises when ANSI SQL mode is enabled).
# A NULL element still makes the whole sum, and hence the average, NULL.
query = """aggregate(`{col}`, CAST(0.0 AS double), (acc, x) -> acc + x, acc -> acc / nullif(size(`{col}`), 0)) AS  `avg_{col}`""".format(col="grade")

# Keep every original column and append the computed `avg_grade`.
df.selectExpr("*", query).show()

+----------------+------------------+---------------+------------+-------------+--------------------+-----------------+
|            name| languagesAtSchool|languagesAtWork|currentState|previousState|               grade|        avg_grade|
+----------------+------------------+---------------+------------+-------------+--------------------+-----------------+
|    James,,Smith|[Java, Scala, C++]|  [Spark, Java]|          OH|           CA|[1.0, 2.1, 3.2, 4...|3.200000047683716|
|   Michael,Rose,|[Spark, Java, C++]|  [Spark, Java]|          NY|           NJ|[2.0, 3.1, 4.2, 5...|              4.2|
|Robert,,Williams|      [CSharp, VB]|[Spark, Python]|          UT|           NV|[3.0, 4.1, 5.2, 6...|              5.2|
+----------------+------------------+---------------+------------+-------------+--------------------+-----------------+



In [None]:
# NULL-tolerant average, built as an unevaluated Column expression.
# The columns _1, _2, _3 are packed into an array<double> and folded with
# `aggregate` over a struct accumulator (sum, n):
#   - merge:  a NULL element adds 0.0 to `sum` and 0.0 to `n`; any other
#             element adds its value to `sum` and 1.0 to `n`
#   - finish: sum / n, i.e. the mean of the non-NULL elements
# NOTE(review): `_1`, `_2`, `_3` are not columns of the `df` built earlier in
# this notebook; presumably this targets a different DataFrame - TODO confirm.
# NOTE(review): when every element is NULL, `n` stays 0.0 and the finish step
# divides by zero - confirm the desired result for that case.
query_null = expr("""aggregate(
                    CAST(array(_1, _2, _3) AS array<double>), 
                    CAST((0.0 as sum, 0.0 as n) AS struct<sum: double, n: double>), 
                    (acc, x) -> (
                        acc.sum + coalesce(x, 0.0), 
                        acc.n + CASE WHEN x IS NULL THEN 0.0 ELSE 1.0 END), 
                    acc -> acc.sum / acc.n)""")