In [1]:
import os, sys
# Environment for the PySpark session, set before any Spark object is created:
# Hadoop/YARN client configuration directories, the Python interpreter names
# handed to PySpark (worker and driver), and the Hadoop user name.
os.environ.update({
    'HADOOP_CONF_DIR': '/etc/hadoop/conf',
    'YARN_CONF_DIR': '/etc/hadoop/conf',
    'PYSPARK_PYTHON': 'python3.9',
    'PYSPARK_DRIVER_PYTHON': 'python3.9',
    'HADOOP_USER_NAME': 'ssenigov',
})

from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

In [None]:
# Spark configuration for this notebook: application name 'lateral', YARN as
# the master, adaptive query execution switched off.
conf = SparkConf()
conf.setAppName('lateral')
conf.setMaster('yarn')
conf.set('spark.sql.adaptive.enabled', False)

# Reuse an already running session if there is one, otherwise start a new one.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Print the application id next to a left-justified label.
print('app_id'.ljust(40), spark.sparkContext.applicationId)

In [3]:
# Employee fixture: three rows (emp_id, emp_name, middle_name) built inline
# with a VALUES clause, so no external table is needed.
emps_query = """
   select emp_id, emp_name, middle_name
    from values (1, 'Anna', 'E.'), 
                (2, 'Bob', 'C.C.'), 
                (3, 'Clara', 'R.')
      as (emp_id, emp_name, middle_name) """
df_emps = spark.sql(emps_query)

# Display the fixture by passing the DataFrame to spark.sql as the named
# argument `df`, which the query references as {df}.
emps_preview = spark.sql(" select * from {df} ", df=df_emps)
emps_preview.show()

+------+--------+-----------+
|emp_id|emp_name|middle_name|
+------+--------+-----------+
|     1|    Anna|         E.|
|     2|     Bob|       C.C.|
|     3|   Clara|         R.|
+------+--------+-----------+



In [4]:
# Position-history fixture: one row per (employee, position); `ord` numbers
# the positions of each employee starting at 1.
pos_query = """
   select emp_id, pos_name, ord
    from values
       (1, 'Jr Manager',  1), 
       (1, 'Mid Manager', 2), -- Anna
       
       (2, 'Assist',      1), 
       (2, 'Jr Sales',    2), 
       (2, 'Sales Mngr',  3), -- Bob
       
       (3, 'Office Mngr', 1) -- Clara
          as (emp_id, pos_name, ord) """ 
df_pos = spark.sql(pos_query)

# Display the fixture by passing the DataFrame to spark.sql as the named
# argument `df`, which the query references as {df}.
pos_preview = spark.sql(" select * from {df} ", df=df_pos)
pos_preview.show()

+------+-----------+---+
|emp_id|   pos_name|ord|
+------+-----------+---+
|     1| Jr Manager|  1|
|     1|Mid Manager|  2|
|     2|     Assist|  1|
|     2|   Jr Sales|  2|
|     2| Sales Mngr|  3|
|     3|Office Mngr|  1|
+------+-----------+---+



In [5]:
# Baseline WITHOUT a lateral join: full_name is computed directly in the
# select list of the employees query.
full_name_inline_sql = """
   select 
     emp_name, 
     concat_ws(' ', upper(emp_name), middle_name) full_name
    from {emps} 
   order by emp_id
  """
full_name_inline = spark.sql(full_name_inline_sql, emps=df_emps)
full_name_inline.show(truncate=False)

                                                                                

+--------+---------+
|emp_name|full_name|
+--------+---------+
|Anna    |ANNA E.  |
|Bob     |BOB C.C. |
|Clara   |CLARA R. |
+--------+---------+



In [6]:
# Same result WITH a lateral join: the lateral subquery has no FROM clause of
# its own and reads emp_name / middle_name from the employee row on its left.
full_name_lateral_sql = """
   select emp_name, full_name
    from {emps},
      lateral 
         (select concat_ws(' ', upper(emp_name), middle_name) full_name ) 
   order by emp_id
  """
full_name_lateral = spark.sql(full_name_lateral_sql, emps=df_emps)
full_name_lateral.show(truncate=False)

+--------+---------+
|emp_name|full_name|
+--------+---------+
|Anna    |ANNA E.  |
|Bob     |BOB C.C. |
|Clara   |CLARA R. |
+--------+---------+



In [7]:
# Lateral joins against a second table. Each lateral subquery is correlated
# with the employee row `e` through p.emp_id = e.emp_id and contributes one
# derived column:
#   first_pos   - name of the position with ord = 1
#   pos_list    - all position names of the employee, ordered by ord
#   maxord      - highest ord of the employee (consumed by the next subquery)
#   current_pos - "<ord>. <pos_name>" of the position whose ord equals maxord
#
# Changes compared with the earlier version of this cell:
#   * pos_list is sorted explicitly. collect_list gives no guarantee about
#     element order, so the (ord, pos_name) pairs are collected as structs,
#     sorted with array_sort (structs compare field by field, ord first) and
#     then reduced to the names with transform.
#   * The outer query ends with ORDER BY emp_id, like the other examples in
#     this notebook, so the displayed row order is stable.
#   * The first subquery no longer projects the unused p.ord column.
positions_sql = """
   select emp_id, emp_name, first_pos, pos_list, current_pos
    from {emps} e,
     lateral (select p.pos_name first_pos
               from {pos} p
              where p.emp_id = e.emp_id
                and p.ord = 1),             -- first position
     lateral (select transform(
                       array_sort(collect_list(
                           named_struct('ord', p.ord, 'pos_name', p.pos_name))),
                       x -> x.pos_name) pos_list
               from {pos} p
              where p.emp_id = e.emp_id
              group by p.emp_id),           -- all positions, ordered by ord
     lateral (select max(ord) maxord
               from {pos} p
              where p.emp_id = e.emp_id),   -- current position index
     lateral (select concat_ws('. ', p.ord, p.pos_name) current_pos
               from {pos} p
              where p.emp_id = e.emp_id
                and p.ord = maxord)         -- current position
   order by emp_id
  """

spark.sql(positions_sql, emps=df_emps, pos=df_pos).show(truncate=False)

                                                                                

+------+--------+-----------+------------------------------+--------------+
|emp_id|emp_name|first_pos  |pos_list                      |current_pos   |
+------+--------+-----------+------------------------------+--------------+
|1     |Anna    |Jr Manager |[Jr Manager, Mid Manager]     |2. Mid Manager|
|2     |Bob     |Assist     |[Assist, Jr Sales, Sales Mngr]|3. Sales Mngr |
|3     |Clara   |Office Mngr|[Office Mngr]                 |1. Office Mngr|
+------+--------+-----------+------------------------------+--------------+



In [8]:
# Stop the Spark session created earlier in this notebook.
spark.stop()