- We can use unix_timestamp to convert regular date or timestamp to a unix timestamp value

- We can use from_unixtime to convert unix timestamp to regular date or timestamp

In [0]:
from pyspark.sql.functions import *

In [0]:
datetimes = [
    (20140228,"2014-02-28", "2014-02-28 10:00:00.123"),
    (20160329,"2016-03-29", "2016-03-29 11:23:00.234"),
    (20180420,"2018-04-20", "2018-04-20 12:34:00.543"),
    (20190512,"2019-05-12", "2019-05-12 13:21:00.567")
]

In [0]:
datedf = spark.createDataFrame(datetimes).toDF("dateid","date","time")

In [0]:
datedf.show(truncate=False)

+--------+----------+-----------------------+
|dateid  |date      |time                   |
+--------+----------+-----------------------+
|20140228|2014-02-28|2014-02-28 10:00:00.123|
|20160329|2016-03-29|2016-03-29 11:23:00.234|
|20180420|2018-04-20|2018-04-20 12:34:00.543|
|20190512|2019-05-12|2019-05-12 13:21:00.567|
+--------+----------+-----------------------+



In [0]:
help(unix_timestamp)

Help on function unix_timestamp in module pyspark.sql.functions:

unix_timestamp(timestamp: Optional[ForwardRef('ColumnOrName')] = None, format: str = 'yyyy-MM-dd HH:mm:ss') -> pyspark.sql.column.Column
    Convert time string with given pattern ('yyyy-MM-dd HH:mm:ss', by default)
    to Unix time stamp (in seconds), using the default timezone and the default
    locale, returns null if failed.
    
    if `timestamp` is None, then it returns current timestamp.
    
    .. versionadded:: 1.5.0
    
    .. versionchanged:: 3.4.0
        Supports Spark Connect.
    
    Parameters
    ----------
    timestamp : :class:`~pyspark.sql.Column` or str, optional
        timestamps of string values.
    format : str, optional
        alternative format to use for converting (default: yyyy-MM-dd HH:mm:ss).
    
    Returns
    -------
    :class:`~pyspark.sql.Column`
        unix time as long integer.
    
    Examples
    --------
    >>> spark.conf.set("spark.sql.session.timeZone", "America/Lo

In [0]:
datedf. \
    withColumn("unix_date_id", unix_timestamp(col("dateid").cast("string"), "yyyyMMdd")). \
    withColumn("unix_date", unix_timestamp("date", "yyyy-MM-dd")). \
    show(truncate=False)

+--------+----------+-----------------------+------------+----------+
|dateid  |date      |time                   |unix_date_id|unix_date |
+--------+----------+-----------------------+------------+----------+
|20140228|2014-02-28|2014-02-28 10:00:00.123|1393545600  |1393545600|
|20160329|2016-03-29|2016-03-29 11:23:00.234|1459209600  |1459209600|
|20180420|2018-04-20|2018-04-20 12:34:00.543|1524182400  |1524182400|
|20190512|2019-05-12|2019-05-12 13:21:00.567|1557619200  |1557619200|
+--------+----------+-----------------------+------------+----------+



In [0]:
unixtimes = [
    (1393545600, ),
    (1459209600, ),
    (1524182400, ),
    (1557619200, ),
]

In [0]:
unixtimedf = spark.createDataFrame(unixtimes).toDF("unixtime")

In [0]:
unixtimedf.show()

+----------+
|  unixtime|
+----------+
|1393545600|
|1459209600|
|1524182400|
|1557619200|
+----------+



- Get date in yyyyMMdd format and also complete timestamp

In [0]:
unixtimedf. \
    withColumn("date", from_unixtime("unixtime", "yyyyMMdd")). \
    withColumn("time", from_unixtime("unixtime", "yyyy-MM-dd HH:mm:ss.SSS")). \
    show(truncate=False)     

+----------+--------+-----------------------+
|unixtime  |date    |time                   |
+----------+--------+-----------------------+
|1393545600|20140228|2014-02-28 00:00:00.000|
|1459209600|20160329|2016-03-29 00:00:00.000|
|1524182400|20180420|2018-04-20 00:00:00.000|
|1557619200|20190512|2019-05-12 00:00:00.000|
+----------+--------+-----------------------+



- Unix epoch cannot be casted to date, hence this fails

In [0]:
unixtimedf.select(col('unixtime').cast('date')).show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-3134283822221730>, line 1[0m
[0;32m----> 1[0m [43munixtimedf[49m[38;5;241;43m.[39;49m[43mselect[49m[43m([49m[43mcol[49m[43m([49m[38;5;124;43m'[39;49m[38;5;124;43munixtime[39;49m[38;5;124;43m'[39;49m[43m)[49m[38;5;241;43m.[39;49m[43mcast[49m[43m([49m[38;5;124;43m'[39;49m[38;5;124;43mdate[39;49m[38;5;124;43m'[39;49m[43m)[49m[43m)[49m[38;5;241m.[39mshow()

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     47[0m [38;5;28;01mtry[39;00m:
[0;32m---> 48[0m     res [38;5;241m=[39m [43mfunc[49m[43m([49m[38;5;241;43m*[39;49m[43margs[49m[43m,[49m[43m [49m[38;5;

In [0]:
unixtimedf.select(col('unixtime').cast('timestamp')).show()

+-------------------+
|           unixtime|
+-------------------+
|2014-02-28 00:00:00|
|2016-03-29 00:00:00|
|2018-04-20 00:00:00|
|2019-05-12 00:00:00|
+-------------------+

