## Window Function

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import Window
import pyspark.sql.types as T

import findspark
findspark.init()

In [2]:
spark = SparkSession.builder.getOrCreate()

In [3]:
df = spark.read.csv('iris_csv.csv', header=True, inferSchema=True)

In [4]:
df.show()

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|variety|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| Setosa|
|         4.9|        3.0|         1.4|        0.2| Setosa|
|         4.7|        3.2|         1.3|        0.2| Setosa|
|         4.6|        3.1|         1.5|        0.2| Setosa|
|         5.0|        3.6|         1.4|        0.2| Setosa|
|         5.4|        3.9|         1.7|        0.4| Setosa|
|         4.6|        3.4|         1.4|        0.3| Setosa|
|         5.0|        3.4|         1.5|        0.2| Setosa|
|         4.4|        2.9|         1.4|        0.2| Setosa|
|         4.9|        3.1|         1.5|        0.1| Setosa|
|         5.4|        3.7|         1.5|        0.2| Setosa|
|         4.8|        3.4|         1.6|        0.2| Setosa|
|         4.8|        3.0|         1.4|        0.1| Setosa|
|         4.3|        3.0|         1.1| 

In [5]:
df.count()

150

In [6]:
windowSpec = Window.partitionBy("variety").orderBy(F.asc("sepal_length"))

data = df.withColumn("average_sepal_length", F.collect_list(F.col("sepal_length")).over(windowSpec)) \
.withColumn("average_sepal_width", F.avg(F.col("sepal_width")).over(windowSpec)) \
.withColumn("average_petal_length", F.avg(F.col("petal_length")).over(windowSpec)) 

In [7]:
data.show()

+------------+-----------+------------+-----------+-------+--------------------+-------------------+--------------------+
|sepal_length|sepal_width|petal_length|petal_width|variety|average_sepal_length|average_sepal_width|average_petal_length|
+------------+-----------+------------+-----------+-------+--------------------+-------------------+--------------------+
|         4.3|        3.0|         1.1|        0.1| Setosa|               [4.3]|                3.0|                 1.1|
|         4.4|        2.9|         1.4|        0.2| Setosa|[4.3, 4.4, 4.4, 4.4]| 3.0250000000000004|               1.275|
|         4.4|        3.0|         1.3|        0.2| Setosa|[4.3, 4.4, 4.4, 4.4]| 3.0250000000000004|               1.275|
|         4.4|        3.2|         1.3|        0.2| Setosa|[4.3, 4.4, 4.4, 4.4]| 3.0250000000000004|               1.275|
|         4.5|        2.3|         1.3|        0.3| Setosa|[4.3, 4.4, 4.4, 4...| 2.8800000000000003|  1.2799999999999998|
|         4.6|        3.

In [8]:
data.count()

150

In [9]:
df.select("variety").distinct().show()

+----------+
|   variety|
+----------+
| Virginica|
|    Setosa|
|Versicolor|
+----------+



In [10]:
data1 = df.groupBy("variety").agg(
F.expr("collect_list(sepal_length)").alias("sepal_length"),
F.expr("avg(sepal_length)").alias("average_sepal_length"))

In [11]:
data1.show()

+----------+--------------------+--------------------+
|   variety|        sepal_length|average_sepal_length|
+----------+--------------------+--------------------+
| Virginica|[6.3, 5.8, 7.1, 6...|   6.587999999999998|
|    Setosa|[5.1, 4.9, 4.7, 4...|   5.005999999999999|
|Versicolor|[7.0, 6.4, 6.9, 5...|               5.936|
+----------+--------------------+--------------------+

