In [1]:
# Using spark context :
# from pyspark import SparkContext, SparkConf
# conf = SparkConf().setAppName('temp')
# sc = SparkContext(conf=conf)
# print(sc)
# data = [1, 2, 3, 4, 5]
# distData = sc.parallelize(data)
# distData.collect()

In [2]:
# Using spark sessiion :
from pyspark.sql import SparkSession
import pyspark.sql.types as T
import pyspark.sql.functions as F
from pyspark.sql import Window

In [3]:
spark = SparkSession.builder.appName("temp").getOrCreate()
spark

In [4]:
l = [
    (1, "sales", 4200),
    (2, "admin", 3100),
    (3, "sales", 4000),
    (4, "sales", 4000),
    (5, "admin", 2700),
    (6, "dev", 3400),
    (7, "dev", 5200),
    (8, "dev", 3700),
    (9, "dev", 4400),
    (10, "dev", 4400)    
]
df = spark.createDataFrame(l, schema=['id', 'dept', 'salary'])
df.show()

+---+-----+------+
| id| dept|salary|
+---+-----+------+
|  1|sales|  4200|
|  2|admin|  3100|
|  3|sales|  4000|
|  4|sales|  4000|
|  5|admin|  2700|
|  6|  dev|  3400|
|  7|  dev|  5200|
|  8|  dev|  3700|
|  9|  dev|  4400|
| 10|  dev|  4400|
+---+-----+------+



In [5]:
# df.collect() is returning a list
df.collect()[3]['dept']
# or 
# df.collect()[3][1]

'sales'

In [6]:
df_group = df.groupBy("dept").agg(F.expr('avg(salary)'), 
                                  F.expr('sum(salary)'),
                                  F.expr('collect_list(salary)').alias("list of salary"))

In [7]:
df_group.show()

+-----+------------------+-----------+--------------------+
| dept|       avg(salary)|sum(salary)|      list of salary|
+-----+------------------+-----------+--------------------+
|  dev|            4220.0|      21100|[3400, 5200, 3700...|
|sales|4066.6666666666665|      12200|  [4200, 4000, 4000]|
|admin|            2900.0|       5800|        [3100, 2700]|
+-----+------------------+-----------+--------------------+



In [8]:
help(Window.partitionBy)

Help on function partitionBy in module pyspark.sql.window:

partitionBy(*cols)
    Creates a :class:`WindowSpec` with the partitioning defined.
    
    .. versionadded:: 1.4



In [9]:
windowSpec = Window.partitionBy('dept') # specify the partition window
df.withColumn("max_salary", F.max(F.col("salary")).over(windowSpec)).show()

+---+-----+------+----------+
| id| dept|salary|max_salary|
+---+-----+------+----------+
|  6|  dev|  3400|      5200|
|  7|  dev|  5200|      5200|
|  8|  dev|  3700|      5200|
|  9|  dev|  4400|      5200|
| 10|  dev|  4400|      5200|
|  1|sales|  4200|      4200|
|  3|sales|  4000|      4200|
|  4|sales|  4000|      4200|
|  2|admin|  3100|      3100|
|  5|admin|  2700|      3100|
+---+-----+------+----------+



In [10]:
# partition by returns every row calculating over the window
df.withColumn("avg(salary)", F.avg(F.col("salary")).over(windowSpec)) \
  .withColumn("sum(salary)", F.sum(F.col("salary")).over(windowSpec)) \
  .withColumn("list of salary", F.collect_list("salary").over(windowSpec)) \
  .show()

+---+-----+------+------------------+-----------+--------------------+
| id| dept|salary|       avg(salary)|sum(salary)|      list of salary|
+---+-----+------+------------------+-----------+--------------------+
|  6|  dev|  3400|            4220.0|      21100|[3400, 5200, 3700...|
|  7|  dev|  5200|            4220.0|      21100|[3400, 5200, 3700...|
|  8|  dev|  3700|            4220.0|      21100|[3400, 5200, 3700...|
|  9|  dev|  4400|            4220.0|      21100|[3400, 5200, 3700...|
| 10|  dev|  4400|            4220.0|      21100|[3400, 5200, 3700...|
|  1|sales|  4200|4066.6666666666665|      12200|  [4200, 4000, 4000]|
|  3|sales|  4000|4066.6666666666665|      12200|  [4200, 4000, 4000]|
|  4|sales|  4000|4066.6666666666665|      12200|  [4200, 4000, 4000]|
|  2|admin|  3100|            2900.0|       5800|        [3100, 2700]|
|  5|admin|  2700|            2900.0|       5800|        [3100, 2700]|
+---+-----+------+------------------+-----------+--------------------+



In [11]:
windowSpec_new = Window.partitionBy('dept').orderBy(F.asc('salary')) # new partition window
df.withColumn("avg(salary)", F.avg(F.col("salary")).over(windowSpec_new)) \
  .withColumn("sum(salary)", F.sum(F.col("salary")).over(windowSpec_new)) \
  .withColumn("list of salary", F.collect_list("salary").over(windowSpec_new)) \
  .show()

+---+-----+------+------------------+-----------+--------------------+
| id| dept|salary|       avg(salary)|sum(salary)|      list of salary|
+---+-----+------+------------------+-----------+--------------------+
|  6|  dev|  3400|            3400.0|       3400|              [3400]|
|  8|  dev|  3700|            3550.0|       7100|        [3400, 3700]|
|  9|  dev|  4400|            3975.0|      15900|[3400, 3700, 4400...|
| 10|  dev|  4400|            3975.0|      15900|[3400, 3700, 4400...|
|  7|  dev|  5200|            4220.0|      21100|[3400, 3700, 4400...|
|  3|sales|  4000|            4000.0|       8000|        [4000, 4000]|
|  4|sales|  4000|            4000.0|       8000|        [4000, 4000]|
|  1|sales|  4200|4066.6666666666665|      12200|  [4000, 4000, 4200]|
|  5|admin|  2700|            2700.0|       2700|              [2700]|
|  2|admin|  3100|            2900.0|       5800|        [2700, 3100]|
+---+-----+------+------------------+-----------+--------------------+



In [12]:
help(windowSpec_new)

Help on WindowSpec in module pyspark.sql.window object:

class WindowSpec(builtins.object)
 |  WindowSpec(jspec)
 |  
 |  A window specification that defines the partitioning, ordering,
 |  and frame boundaries.
 |  
 |  Use the static methods in :class:`Window` to create a :class:`WindowSpec`.
 |  
 |  .. versionadded:: 1.4
 |  
 |  Methods defined here:
 |  
 |  __init__(self, jspec)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  orderBy(self, *cols)
 |      Defines the ordering columns in a :class:`WindowSpec`.
 |      
 |      :param cols: names of columns or expressions
 |      
 |      .. versionadded:: 1.4
 |  
 |  partitionBy(self, *cols)
 |      Defines the partitioning columns in a :class:`WindowSpec`.
 |      
 |      :param cols: names of columns or expressions
 |      
 |      .. versionadded:: 1.4
 |  
 |  rangeBetween(self, start, end)
 |      Defines the frame boundaries, from `start` (inclusive) to `end` (inclusive).
 |      
 |      Bo

In [13]:
# without taking duplicates into count
windowSpec_new_1 = (
    Window.partitionBy('dept')
          .orderBy(F.asc('salary'))
          .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

In [14]:
df.withColumn("avg(salary)", F.avg(F.col("salary")).over(windowSpec_new_1)) \
  .withColumn("sum(salary)", F.sum(F.col("salary")).over(windowSpec_new_1)) \
  .withColumn("list of salary", F.collect_list("salary").over(windowSpec_new_1)) \
  .show()

+---+-----+------+------------------+-----------+--------------------+
| id| dept|salary|       avg(salary)|sum(salary)|      list of salary|
+---+-----+------+------------------+-----------+--------------------+
|  6|  dev|  3400|            3400.0|       3400|              [3400]|
|  8|  dev|  3700|            3550.0|       7100|        [3400, 3700]|
|  9|  dev|  4400|3833.3333333333335|      11500|  [3400, 3700, 4400]|
| 10|  dev|  4400|            3975.0|      15900|[3400, 3700, 4400...|
|  7|  dev|  5200|            4220.0|      21100|[3400, 3700, 4400...|
|  3|sales|  4000|            4000.0|       4000|              [4000]|
|  4|sales|  4000|            4000.0|       8000|        [4000, 4000]|
|  1|sales|  4200|4066.6666666666665|      12200|  [4000, 4000, 4200]|
|  5|admin|  2700|            2700.0|       2700|              [2700]|
|  2|admin|  3100|            2900.0|       5800|        [2700, 3100]|
+---+-----+------+------------------+-----------+--------------------+



In [15]:
F.avg(F.col("salary")).over(windowSpec_new_1)

Column<b'avg(salary) OVER (PARTITION BY dept ORDER BY salary ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)'>

In [16]:
# without taking duplicates into count
windowSpec_new_2 = (
    Window.partitionBy('dept')
          .orderBy(F.asc('salary'))
          .rowsBetween(-1, Window.currentRow) # (start, end)
)
F.avg(F.col("salary")).over(windowSpec_new_2)

Column<b'avg(salary) OVER (PARTITION BY dept ORDER BY salary ASC NULLS FIRST ROWS BETWEEN -1 FOLLOWING AND CURRENT ROW)'>

In [17]:
df.withColumn("avg(salary)", F.avg(F.col("salary")).over(windowSpec_new_2)) \
  .withColumn("sum(salary)", F.sum(F.col("salary")).over(windowSpec_new_2)) \
  .withColumn("list of salary", F.collect_list("salary").over(windowSpec_new_2)) \
  .show()

+---+-----+------+-----------+-----------+--------------+
| id| dept|salary|avg(salary)|sum(salary)|list of salary|
+---+-----+------+-----------+-----------+--------------+
|  6|  dev|  3400|     3400.0|       3400|        [3400]|
|  8|  dev|  3700|     3550.0|       7100|  [3400, 3700]|
|  9|  dev|  4400|     4050.0|       8100|  [3700, 4400]|
| 10|  dev|  4400|     4400.0|       8800|  [4400, 4400]|
|  7|  dev|  5200|     4800.0|       9600|  [4400, 5200]|
|  3|sales|  4000|     4000.0|       4000|        [4000]|
|  4|sales|  4000|     4000.0|       8000|  [4000, 4000]|
|  1|sales|  4200|     4100.0|       8200|  [4000, 4200]|
|  5|admin|  2700|     2700.0|       2700|        [2700]|
|  2|admin|  3100|     2900.0|       5800|  [2700, 3100]|
+---+-----+------+-----------+-----------+--------------+



In [18]:
df_new =  df.withColumn("average_salary", F.avg(F.col('salary')).over(windowSpec_new)) \
            .withColumn("total_salary", F.sum(F.col('salary')).over(windowSpec_new)) \
            .withColumn("rank", F.rank().over(windowSpec_new)) \
            .withColumn("dense_rank", F.dense_rank().over(windowSpec_new)) \
            .withColumn("perc_rank", F.percent_rank().over(windowSpec_new)) 
df_new.show()

+---+-----+------+------------------+------------+----+----------+---------+
| id| dept|salary|    average_salary|total_salary|rank|dense_rank|perc_rank|
+---+-----+------+------------------+------------+----+----------+---------+
|  6|  dev|  3400|            3400.0|        3400|   1|         1|      0.0|
|  8|  dev|  3700|            3550.0|        7100|   2|         2|     0.25|
|  9|  dev|  4400|            3975.0|       15900|   3|         3|      0.5|
| 10|  dev|  4400|            3975.0|       15900|   3|         3|      0.5|
|  7|  dev|  5200|            4220.0|       21100|   5|         4|      1.0|
|  3|sales|  4000|            4000.0|        8000|   1|         1|      0.0|
|  4|sales|  4000|            4000.0|        8000|   1|         1|      0.0|
|  1|sales|  4200|4066.6666666666665|       12200|   3|         2|      1.0|
|  5|admin|  2700|            2700.0|        2700|   1|         1|      0.0|
|  2|admin|  3100|            2900.0|        5800|   2|         2|      1.0|

In [19]:
# pivot and unpivot
help(df.groupBy().pivot)

Help on method pivot in module pyspark.sql.group:

pivot(pivot_col, values=None) method of pyspark.sql.group.GroupedData instance
    Pivots a column of the current :class:`DataFrame` and perform the specified aggregation.
    There are two versions of pivot function: one that requires the caller to specify the list
    of distinct values to pivot on, and one that does not. The latter is more concise but less
    efficient, because Spark needs to first compute the list of distinct values internally.
    
    :param pivot_col: Name of the column to pivot.
    :param values: List of values that will be translated to columns in the output DataFrame.
    
    # Compute the sum of earnings for each year by course with each course as a separate column
    
    >>> df4.groupBy("year").pivot("course", ["dotNET", "Java"]).sum("earnings").collect()
    [Row(year=2012, dotNET=15000, Java=20000), Row(year=2013, dotNET=48000, Java=30000)]
    
    # Or without specifying column values (less efficie

In [20]:
df_new.groupBy("rank").pivot("dept").sum("salary").orderBy(F.asc('rank')).show()

+----+-----+----+-----+
|rank|admin| dev|sales|
+----+-----+----+-----+
|   1| 2700|3400| 8000|
|   2| 3100|3700| null|
|   3| null|8800| 4200|
|   5| null|5200| null|
+----+-----+----+-----+



In [21]:
df_new.groupBy("rank").pivot("dept").agg(
    F.sum("salary").alias("sum_salary"),
    F.avg("average_salary")) \
    .show()

+----+----------------+-------------------------+--------------+-----------------------+----------------+-------------------------+
|rank|admin_sum_salary|admin_avg(average_salary)|dev_sum_salary|dev_avg(average_salary)|sales_sum_salary|sales_avg(average_salary)|
+----+----------------+-------------------------+--------------+-----------------------+----------------+-------------------------+
|   1|            2700|                   2700.0|          3400|                 3400.0|            8000|                   4000.0|
|   3|            null|                     null|          8800|                 3975.0|            4200|       4066.6666666666665|
|   5|            null|                     null|          5200|                 4220.0|            null|                     null|
|   2|            3100|                   2900.0|          3700|                 3550.0|            null|                     null|
+----+----------------+-------------------------+--------------+------------

In [22]:
# another example
df_exp = spark.createDataFrame([("G","X", 1),
                                ("G","Y", 2),
                                ("G","X", 3),
                                ("H","Y", 4),
                                ("H","Z", 5),
                                ],list("ABC"))
df_exp.show()

+---+---+---+
|  A|  B|  C|
+---+---+---+
|  G|  X|  1|
|  G|  Y|  2|
|  G|  X|  3|
|  H|  Y|  4|
|  H|  Z|  5|
+---+---+---+



In [23]:
# group by a column (A), transpose another column to row name 
df_exp_pivot = df_exp.groupBy("A").pivot("B").sum("C")
df_exp_pivot.show()

+---+----+---+----+
|  A|   X|  Y|   Z|
+---+----+---+----+
|  G|   4|  2|null|
|  H|null|  4|   5|
+---+----+---+----+



In [24]:
df_exp_unpivot = df_exp_pivot.selectExpr("A", "stack(3, 'X', X, 'Y', Y, 'Z', Z) as (B, C)").where("C is not null")
df_exp_unpivot.show()

+---+---+---+
|  A|  B|  C|
+---+---+---+
|  G|  X|  4|
|  G|  Y|  2|
|  H|  Y|  4|
|  H|  Z|  5|
+---+---+---+



In [29]:
df_exp_unpivot.selectExpr().show()

++
||
++
||
||
||
||
++



In [26]:
help(df.selectExpr)

Help on method selectExpr in module pyspark.sql.dataframe:

selectExpr(*expr) method of pyspark.sql.dataframe.DataFrame instance
    Projects a set of SQL expressions and returns a new :class:`DataFrame`.
    
    This is a variant of :func:`select` that accepts SQL expressions.
    
    >>> df.selectExpr("age * 2", "abs(age)").collect()
    [Row((age * 2)=4, abs(age)=2), Row((age * 2)=10, abs(age)=5)]
    
    .. versionadded:: 1.3



In [27]:
df_exp.selectExpr("stack(2, 1, 2, 3, 4)").show()

+----+----+
|col0|col1|
+----+----+
|   1|   2|
|   3|   4|
|   1|   2|
|   3|   4|
|   1|   2|
|   3|   4|
|   1|   2|
|   3|   4|
|   1|   2|
|   3|   4|
+----+----+

