In [1]:
import pyspark.sql as ps
from pyspark.sql import Row
# Note: the data types live in pyspark.sql.types; `import pyspark.types` is the wrong module path.
import pyspark.sql.types as T
import pyspark.sql.functions as F

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists,
# then echo it as the cell result.
spark = (
    ps.SparkSession.builder
    .appName("TimeIntelli")
    .getOrCreate()
)
spark

In [3]:
# Three differently formatted date strings, each wrapped in a single-field Row,
# distributed as an RDD.
rows = [Row(text) for text in ('2020-01-03', '2020 01 13', '2020 Jan 18')]
myrdd = spark.sparkContext.parallelize(rows)

In [4]:
# Bring the RDD's elements back to the driver to inspect the rows.
myrdd.collect()

[<Row('2020-01-03')>, <Row('2020 01 13')>, <Row('2020 Jan 18')>]

In [5]:
# Schema with a single nullable string column named "date_str".
mySchema = T.StructType([
    T.StructField("date_str", T.StringType(), True),
])

In [6]:
# Build the DataFrame from the RDD using the explicit schema, then preview it.
df = spark.createDataFrame(data=myrdd, schema=mySchema)
df.show()

+-----------+
|   date_str|
+-----------+
| 2020-01-03|
| 2020 01 13|
|2020 Jan 18|
+-----------+



In [7]:
# Print the DataFrame's column names, types and nullability.
df.printSchema()

root
 |-- date_str: string (nullable = true)



In [8]:
# Column expression: the parsed date when 'date_str' matches "yyyy-MM-dd".
# No otherwise() branch is attached yet.
x = F.when(
    F.to_date(F.col('date_str'), "yyyy-MM-dd").isNotNull(),
    F.to_date(F.col('date_str'), "yyyy-MM-dd"),
)
x

Column<b"CASE WHEN (to_date(`date_str`, 'yyyy-MM-dd') IS NOT NULL) THEN to_date(`date_str`, 'yyyy-MM-dd') END">

In [9]:
# Check what kind of object F.when(...) returned.
type(x)

pyspark.sql.column.Column

In [10]:
# Add a "date" column from the when-expression `x`, falling back to the
# literal 'empty-value' when the condition in `x` is not met.
(
    df
    .withColumn("date", x.otherwise('empty-value'))
    .show()
)

+-----------+-----------+
|   date_str|       date|
+-----------+-----------+
| 2020-01-03| 2020-01-03|
| 2020 01 13|empty-value|
|2020 Jan 18|empty-value|
+-----------+-----------+



In [11]:
# Parse 'date_str' into a timestamp by trying each known layout in order and
# keeping the first result that is not null.
#
# F.coalesce returns its first non-null argument, which is exactly what the
# previous nested when(<parse>.isNotNull(), <parse>).otherwise(...) pyramid
# spelled out by hand; "MM dd yyyy" remains the last fallback. To debug a
# single layout, evaluate F.to_timestamp(F.col('date_str'), <fmt>) on its own.
df_time = df.withColumn(
    "date & time",
    F.coalesce(
        *[
            F.to_timestamp(F.col('date_str'), fmt)
            for fmt in (
                "yyyy-MM-dd",
                "yyyy MM dd",
                "yyyy MMM dd",
                "MM dd yyyy",
            )
        ]
    ),
)

In [12]:
# Check the type of the newly added "date & time" column.
df_time.printSchema()

root
 |-- date_str: string (nullable = true)
 |-- date & time: timestamp (nullable = true)



In [13]:
# Parse 'date_str' into a date by trying each known layout in order and
# keeping the first result that is not null.
#
# F.coalesce returns its first non-null argument, which is exactly what the
# previous nested when(<parse>.isNotNull(), <parse>).otherwise(...) pyramid
# spelled out by hand; "MM dd yyyy" remains the last fallback. To debug a
# single layout, evaluate F.to_date(F.col('date_str'), <fmt>) on its own.
df_date = df_time.withColumn(
    "date",
    F.coalesce(
        *[
            F.to_date(F.col('date_str'), fmt)
            for fmt in (
                "yyyy-MM-dd",
                "yyyy MM dd",
                "yyyy MMM dd",
                "MM dd yyyy",
            )
        ]
    ),
)

In [14]:
# Display the original strings next to the parsed timestamp and date columns.
df_date.show()

+-----------+-------------------+----------+
|   date_str|        date & time|      date|
+-----------+-------------------+----------+
| 2020-01-03|2020-01-03 00:00:00|2020-01-03|
| 2020 01 13|2020-01-13 00:00:00|2020-01-13|
|2020 Jan 18|2020-01-18 00:00:00|2020-01-18|
+-----------+-------------------+----------+



In [15]:
# Check the column types after adding the "date" column.
df_date.printSchema()

root
 |-- date_str: string (nullable = true)
 |-- date & time: timestamp (nullable = true)
 |-- date: date (nullable = true)

