In [1]:
import pyspark.sql as ps
from pyspark.sql import Row
# import pyspark.types as T # <= wrong one
import pyspark.sql.types as T
import pyspark.sql.functions as F

In [2]:
# Create the Spark session for this notebook (or reuse one that already exists).
spark = (
    ps.SparkSession.builder
    .appName("TimeIntelli")
    .getOrCreate()
)
spark

In [3]:
# The same kind of value written in three different layouts, one single-field Row each.
rows = [Row(text) for text in ('2020-01-03', '2020 01 13', '2020 Jan 18')]

# Hand the rows to Spark as an RDD.
myrdd = spark.sparkContext.parallelize(rows)

In [4]:
# Bring the RDD contents back to the driver to inspect the rows it holds.
myrdd.collect()

[<Row('2020-01-03')>, <Row('2020 01 13')>, <Row('2020 Jan 18')>]

In [5]:
# Schema with a single nullable string column holding the raw date text.
date_str_field = T.StructField("date_str", T.StringType(), True)
mySchema = T.StructType([date_str_field])

In [6]:
# Build a DataFrame from the RDD using the explicit schema, then preview it.
df = spark.createDataFrame(data=myrdd, schema=mySchema)
df.show()

+-----------+
|   date_str|
+-----------+
| 2020-01-03|
| 2020 01 13|
|2020 Jan 18|
+-----------+



In [7]:
# Print the column names, types and nullability of df.
df.printSchema()

root
 |-- date_str: string (nullable = true)



In [8]:
# Parse date_str with the yyyy-MM-dd layout. The CASE WHEN has no ELSE branch
# yet, so rows that do not parse stay null until .otherwise(...) is added.
iso_date = F.to_date(F.col('date_str'), "yyyy-MM-dd")
x = F.when(iso_date.isNotNull(), iso_date)
x

Column<b"CASE WHEN (to_date(`date_str`, 'yyyy-MM-dd') IS NOT NULL) THEN to_date(`date_str`, 'yyyy-MM-dd') END">

In [9]:
# Check what kind of object the when(...) expression is.
type(x)

pyspark.sql.column.Column

In [10]:
# Close the CASE WHEN with a fallback: rows that did not parse show a
# placeholder text instead of null.
date_or_placeholder = x.otherwise('empty-value')
df.withColumn("date", date_or_placeholder).show()

+-----------+-----------+
|   date_str|       date|
+-----------+-----------+
| 2020-01-03| 2020-01-03|
| 2020 01 13|empty-value|
|2020 Jan 18|empty-value|
+-----------+-----------+



In [11]:
# Layouts that date_str may use, listed in the order they are tried.
timestamp_formats = ["yyyy-MM-dd", "yyyy MM dd", "yyyy MMM dd", "MM dd yyyy"]

# Each to_timestamp() call yields null when its layout does not match, and
# coalesce() returns the first non-null argument. So every row gets the result
# of the first layout that parses it; rows matching none of them stay null.
df_time = df.withColumn(
    "date & time",
    F.coalesce(*[F.to_timestamp(F.col("date_str"), fmt) for fmt in timestamp_formats]),
)

In [12]:
# Check the type of the newly added "date & time" column.
df_time.printSchema()

root
 |-- date_str: string (nullable = true)
 |-- date & time: timestamp (nullable = true)



In [13]:
# Layouts that date_str may use, listed in the order they are tried.
date_formats = ["yyyy-MM-dd", "yyyy MM dd", "yyyy MMM dd", "MM dd yyyy"]

# Each to_date() call yields null when its layout does not match, and
# coalesce() returns the first non-null argument. So every row gets the result
# of the first layout that parses it; rows matching none of them stay null.
df_date = df_time.withColumn(
    "date",
    F.coalesce(*[F.to_date(F.col("date_str"), fmt) for fmt in date_formats]),
)

In [14]:
# Show the parsed timestamp and date columns next to the raw strings.
df_date.show()

+-----------+-------------------+----------+
|   date_str|        date & time|      date|
+-----------+-------------------+----------+
| 2020-01-03|2020-01-03 00:00:00|2020-01-03|
| 2020 01 13|2020-01-13 00:00:00|2020-01-13|
|2020 Jan 18|2020-01-18 00:00:00|2020-01-18|
+-----------+-------------------+----------+



In [15]:
# Check the column types after adding the parsed "date" column.
df_date.printSchema()

root
 |-- date_str: string (nullable = true)
 |-- date & time: timestamp (nullable = true)
 |-- date: date (nullable = true)



In [16]:
# date_sub function -- shift each date 10 days back
df_date.withColumn("date_sub_10", F.date_sub(F.col("date"), 10)).show()

+-----------+-------------------+----------+-----------+
|   date_str|        date & time|      date|date_sub_10|
+-----------+-------------------+----------+-----------+
| 2020-01-03|2020-01-03 00:00:00|2020-01-03| 2019-12-24|
| 2020 01 13|2020-01-13 00:00:00|2020-01-13| 2020-01-03|
|2020 Jan 18|2020-01-18 00:00:00|2020-01-18| 2020-01-08|
+-----------+-------------------+----------+-----------+



In [17]:
# date_add function -- move date
df_date.withColumn("date_add_10", F.date_add("date", 20)).show()

+-----------+-------------------+----------+-----------+
|   date_str|        date & time|      date|date_add_10|
+-----------+-------------------+----------+-----------+
| 2020-01-03|2020-01-03 00:00:00|2020-01-03| 2020-01-23|
| 2020 01 13|2020-01-13 00:00:00|2020-01-13| 2020-02-02|
|2020 Jan 18|2020-01-18 00:00:00|2020-01-18| 2020-02-07|
+-----------+-------------------+----------+-----------+



In [18]:
# datediff function -- number of days from each date up to today
df_date.withColumn("date_diff", F.datediff(F.current_date(), F.col("date"))).show()

+-----------+-------------------+----------+---------+
|   date_str|        date & time|      date|date_diff|
+-----------+-------------------+----------+---------+
| 2020-01-03|2020-01-03 00:00:00|2020-01-03|      306|
| 2020 01 13|2020-01-13 00:00:00|2020-01-13|      296|
|2020 Jan 18|2020-01-18 00:00:00|2020-01-18|      291|
+-----------+-------------------+----------+---------+



In [19]:
# Extract calendar parts from each date.
# Beware: dayofweek() is 1-based and starts on Sunday (1 = Sunday ... 7 = Saturday),
# e.g. Friday 2020-01-03 comes out as 6.
(
    df_date
    .withColumn('year', F.year('date'))
    .withColumn('week', F.weekofyear('date'))
    .withColumn('weekday', F.dayofweek('date'))
    .show()
)

+-----------+-------------------+----------+----+----+-------+
|   date_str|        date & time|      date|year|week|weekday|
+-----------+-------------------+----------+----+----+-------+
| 2020-01-03|2020-01-03 00:00:00|2020-01-03|2020|   1|      6|
| 2020 01 13|2020-01-13 00:00:00|2020-01-13|2020|   3|      2|
|2020 Jan 18|2020-01-18 00:00:00|2020-01-18|2020|   3|      7|
+-----------+-------------------+----------+----+----+-------+



In [20]:
# time intelligence filtering
print(F.col("date") > "2020-01-10") # => is first been trans to spark sql
df_date.where(F.col("date") > "2020-01-10").show()

Column<b'(date > 2020-01-10)'>
+-----------+-------------------+----------+
|   date_str|        date & time|      date|
+-----------+-------------------+----------+
| 2020 01 13|2020-01-13 00:00:00|2020-01-13|
|2020 Jan 18|2020-01-18 00:00:00|2020-01-18|
+-----------+-------------------+----------+

