In [1]:
import os, sys
# Environment variables for the Hadoop/YARN client and the Python interpreters
# used by PySpark; they are set here, before pyspark is imported further down.
os.environ.update({
    'HADOOP_CONF_DIR': '/etc/hadoop/conf',
    'YARN_CONF_DIR': '/etc/hadoop/conf',
    'PYSPARK_PYTHON': 'python3.9',
    'PYSPARK_DRIVER_PYTHON': 'python3.9',
    'HADOOP_USER_NAME': 'ssenigov',
})

from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

In [4]:
# Spark configuration: application name, YARN as the master, and the legacy
# datetime parser policy.
conf = (
    SparkConf()
    .setMaster('yarn')
    .setAppName('ExplodeArray')
    .set('spark.sql.legacy.timeParserPolicy', 'LEGACY')
)

# Reuse an already running session if there is one, otherwise start a new one.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

# Print the application id next to a label padded to 40 characters.
print(f"{'app_id':<40}", sc.applicationId)

app_id                                   application_1727681258360_0010


In [5]:
from pyspark.sql.types import ArrayType, StructType, StructField, StringType, IntegerType, FloatType, DoubleType, DateType, LongType
from pyspark.sql.functions import explode, col, concat, date_format, to_date, format_string

# Calendar schema: one row per event and month; the days of that month on
# which the event takes place are stored as an array of longs.
schema = StructType([
    StructField('event_name', StringType(), False),
    StructField('year', LongType(), False),
    StructField('month', LongType(), False),
    StructField('event_days', ArrayType(LongType()), False),
])

df_calendar = spark.createDataFrame(
    [
        ('Meeting Project #1', 2024, 10, [1, 8, 22]),
        ('Meeting Project #2', 2024, 11, [4, 12, 25, 26]),
    ],
    schema,
)
df_calendar.printSchema()
df_calendar.show()

# explode() emits one row per element of event_days.
#
# The intermediate date string is built with zero-padded, fixed-width fields
# (yyyyMMdd). Plain concatenation of the numbers is ambiguous: year=2024,
# month=1, day=12 would give '2024112', which a lenient parser reads as
# 2024-11-02, and month=1, day=1 would give '202411', which cannot be parsed
# at all. With padding every value is exactly 8 digits, so the 'yyyyMMdd'
# pattern matches unambiguously regardless of the time parser policy.
df_event_dates = (
    df_calendar
    .select(
        col('event_name'),
        col('year'),
        col('month'),
        explode(col('event_days')).alias('day'),
    )
    .withColumn('str_date', format_string('%04d%02d%02d', col('year'), col('month'), col('day')))
    .withColumn('event_date', to_date(col('str_date'), 'yyyyMMdd'))
    # 'EEEE' renders the full name of the week day.
    .withColumn('week_day', date_format(col('event_date'), 'EEEE'))
    .drop('str_date')
)
df_event_dates.show()

root
 |-- event_name: string (nullable = false)
 |-- year: long (nullable = false)
 |-- month: long (nullable = false)
 |-- event_days: array (nullable = false)
 |    |-- element: long (containsNull = true)



                                                                                

+------------------+----+-----+---------------+
|        event_name|year|month|     event_days|
+------------------+----+-----+---------------+
|Meeting Project #1|2024|   10|     [1, 8, 22]|
|Meeting Project #2|2024|   11|[4, 12, 25, 26]|
+------------------+----+-----+---------------+

+------------------+----+-----+---+----------+--------+
|        event_name|year|month|day|event_date|week_day|
+------------------+----+-----+---+----------+--------+
|Meeting Project #1|2024|   10|  1|2024-10-01| Tuesday|
|Meeting Project #1|2024|   10|  8|2024-10-08| Tuesday|
|Meeting Project #1|2024|   10| 22|2024-10-22| Tuesday|
|Meeting Project #2|2024|   11|  4|2024-11-04|  Monday|
|Meeting Project #2|2024|   11| 12|2024-11-12| Tuesday|
|Meeting Project #2|2024|   11| 25|2024-11-25|  Monday|
|Meeting Project #2|2024|   11| 26|2024-11-26| Tuesday|
+------------------+----+-----+---+----------+--------+



In [None]:
# Stop the SparkSession created earlier in this notebook now that the work is done.
spark.stop()