In [1]:
import os, sys
os.environ['HADOOP_CONF_DIR'] = '/etc/hadoop/conf'
os.environ['YARN_CONF_DIR'] = '/etc/hadoop/conf'
os.environ['PYSPARK_PYTHON'] =  'python3.9'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'python3.9'
os.environ['HADOOP_USER_NAME']='ssenigov'

from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

In [2]:
conf = SparkConf().setAppName('Pivoting').setMaster('yarn') 
spark = SparkSession.builder.config(conf=conf).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/09 13:10:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/02/09 13:10:52 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/02/09 13:10:52 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/02/09 13:10:53 WARN DomainSocketFactory: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.
25/02/09 13:10:53 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [None]:
# original:
+----+---+-----+
|year|qrt|  num|
+----+---+-----+
|2022|  2|    3|
|2022|  3|    5|
|2022|  3|    1|
|2021|  3|    3|
|2021|  1|    2|
+----+---+-----+

# pivoted:
+----+------------+------------+------------+------------+
|year|qrt1_sum_num|qrt2_sum_num|qrt3_sum_num|qrt4_sum_num|
+----+------------+------------+------------+------------+
|2022|        null|           3|           6|        null|
|2021|           2|        null|           3|        null|
+----+------------+------------+------------+------------+

In [22]:
from pyspark.sql.functions import round, rand, lit
from pyspark.sql.types import IntegerType

# create sample DataFrame and rename the generated column "id" to "num"
df = spark.range(50).withColumnRenamed('id', 'num') 
df = df.withColumn('year', lit(2020) + (3*rand()).cast(IntegerType()))
df = df.withColumn('qrt', (4*rand()).cast(IntegerType()) % 4 + 1)

df.createOrReplaceTempView("tbl")  # create table to address it in sql
df.show()

+---+----+---+
|num|year|qrt|
+---+----+---+
|  0|2020|  2|
|  1|2022|  3|
|  2|2022|  4|
|  3|2020|  1|
|  4|2022|  4|
|  5|2022|  4|
|  6|2020|  3|
|  7|2020|  1|
|  8|2021|  2|
|  9|2020|  2|
| 10|2022|  1|
| 11|2021|  2|
| 12|2022|  2|
| 13|2021|  2|
| 14|2022|  3|
| 15|2022|  2|
| 16|2020|  2|
| 17|2021|  4|
| 18|2021|  1|
| 19|2022|  2|
+---+----+---+
only showing top 20 rows



In [7]:
sql = """
select year    
, sum(case when qrt = 1 then id else null end) as qrt1_sum_id
, sum(case when qrt = 2 then id else null end) as qrt2_sum_id
, sum(case when qrt = 3 then id else null end) as qrt3_sum_id
, sum(case when qrt = 4 then id else null end) as qrt4_sum_id
from tbl
group by year 
order by year desc
"""
spark.sql(sql).show(n=50, truncate=False)

                                                                                

+----+-----------+-----------+-----------+-----------+
|year|qrt1_sum_id|qrt2_sum_id|qrt3_sum_id|qrt4_sum_id|
+----+-----------+-----------+-----------+-----------+
|2022|140        |48         |177        |104        |
|2021|44         |151        |111        |14         |
|2020|70         |87         |51         |228        |
+----+-----------+-----------+-----------+-----------+



In [20]:
sql = """
select year, qrt1, qrt2, qrt3, qrt4, qrt5 
 from tbl 
pivot (
  count(num) as id_cnt
  for qrt in (1 as qrt1, 2 as qrt2, 3 as qrt3, 4 as qrt4, 5 as qrt5)
)
order by year """

spark.sql(sql).show()

+----+----+----+----+----+----+
|year|qrt1|qrt2|qrt3|qrt4|qrt5|
+----+----+----+----+----+----+
|2020|   2|   2|   6|   1|null|
|2021|   5|   4|   5|   6|null|
|2022|   8|   3|   4|   4|null|
+----+----+----+----+----+----+



In [36]:
sql = """
select year, qrt1_cnt, qrt1_sum, qrt2_cnt, qrt2_sum, 
             qrt3_cnt, qrt3_sum, qrt4_cnt, qrt4_sum
       -- or we can put here "*"       
from tbl 
pivot (
  count(1) as cnt,
  sum(num) as sum
  for qrt in (1 as qrt1, 2 as qrt2, 3 as qrt3, 4 as qrt4, 5 as qrt5)
)
order by year """
spark.sql(sql).show()

+----+--------+--------+--------+--------+--------+--------+--------+--------+
|year|qrt1_cnt|qrt1_sum|qrt2_cnt|qrt2_sum|qrt3_cnt|qrt3_sum|qrt4_cnt|qrt4_sum|
+----+--------+--------+--------+--------+--------+--------+--------+--------+
|2020|       3|      31|       5|     105|       4|     101|       1|      35|
|2021|       4|     127|       4|      54|       4|     151|       3|      94|
|2022|       5|     159|       7|     156|       4|      86|       6|     126|
+----+--------+--------+--------+--------+--------+--------+--------+--------+



In [62]:
spark.stop()