# Date functions

In [1]:
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession, Row
import pyspark.sql.functions as F
import pyspark.sql.window as W

In [2]:
# Create the Spark session used by every cell below, or reuse one that is
# already running.
spark = (
    SparkSession.builder
    .appName("Spark Training - DF APIs")
    .getOrCreate()
)

In [3]:
# Read the comma-separated orders file with an explicit DDL schema, so that
# order_date is loaded as a timestamp column.
# NOTE(review): the name `ord` shadows Python's built-in ord(); it is kept
# because the later cells of this notebook refer to it.
ord = (
    spark.read
    .format('csv')
    .option('sep', ',')
    .schema('order_id int,order_date timestamp, order_customer_id int, order_status string')
    .load('PracticeFiles/Orders')
)
ord.show(5)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
+--------+-------------------+-----------------+---------------+
only showing top 5 rows



In [5]:
# Add a column holding the date 50 days after each order date.
ord_new = ord.withColumn(
    'new_order_date',
    F.date_add(ord['order_date'], 50),
)
ord_new.show(5)

+--------+-------------------+-----------------+---------------+--------------+
|order_id|         order_date|order_customer_id|   order_status|new_order_date|
+--------+-------------------+-----------------+---------------+--------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|    2013-09-13|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|    2013-09-13|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|    2013-09-13|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|    2013-09-13|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|    2013-09-13|
+--------+-------------------+-----------------+---------------+--------------+
only showing top 5 rows



### 1. current_date(), current_timestamp()
- Returns current date, timestamp

In [7]:
# Tag every row with the date on which the query is evaluated.
(
    ord
    .withColumn('current_date', F.current_date())
    .show(5)
)

+--------+-------------------+-----------------+---------------+------------+
|order_id|         order_date|order_customer_id|   order_status|current_date|
+--------+-------------------+-----------------+---------------+------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|  2022-01-27|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|  2022-01-27|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|  2022-01-27|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|  2022-01-27|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|  2022-01-27|
+--------+-------------------+-----------------+---------------+------------+
only showing top 5 rows



In [9]:
# Tag every row with the timestamp at which the query is evaluated;
# truncate=False keeps the fractional seconds visible.
(
    ord
    .withColumn('current_ts', F.current_timestamp())
    .show(5, truncate=False)
)

+--------+-------------------+-----------------+---------------+-----------------------+
|order_id|order_date         |order_customer_id|order_status   |current_ts             |
+--------+-------------------+-----------------+---------------+-----------------------+
|1       |2013-07-25 00:00:00|11599            |CLOSED         |2022-01-27 14:46:14.179|
|2       |2013-07-25 00:00:00|256              |PENDING_PAYMENT|2022-01-27 14:46:14.179|
|3       |2013-07-25 00:00:00|12111            |COMPLETE       |2022-01-27 14:46:14.179|
|4       |2013-07-25 00:00:00|8827             |CLOSED         |2022-01-27 14:46:14.179|
|5       |2013-07-25 00:00:00|11318            |COMPLETE       |2022-01-27 14:46:14.179|
+--------+-------------------+-----------------+---------------+-----------------------+
only showing top 5 rows



### 2. next_day(date, dayOfWeek)
- Returns the first date that is later than the value of the date column and falls on the specified day of the week
- e.g. return the next monday to a provided date
- dayOfWeek: Mon, Tue, Wed, Thu, Fri, Sat, Sun

In [12]:
# Print the column names and types of the orders DataFrame.
ord.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [16]:
# For each distinct order date, show the next Friday after it.
(
    ord
    .select('order_date', F.next_day(ord.order_date, 'Fri'))
    .distinct()
    .show(5, truncate=False)
)

+-------------------+-------------------------+
|order_date         |next_day(order_date, Fri)|
+-------------------+-------------------------+
|2013-12-31 00:00:00|2014-01-03               |
|2014-06-02 00:00:00|2014-06-06               |
|2013-08-10 00:00:00|2013-08-16               |
|2013-09-01 00:00:00|2013-09-06               |
|2014-01-12 00:00:00|2014-01-17               |
+-------------------+-------------------------+
only showing top 5 rows



### 3. last_day(date)
- Returns the last day of the month which the given date belongs to

In [18]:
# For each distinct order date, show the last day of its month.
(
    ord
    .select('order_date', F.last_day(ord.order_date))
    .distinct()
    .show(5, truncate=False)
)

+-------------------+--------------------+
|order_date         |last_day(order_date)|
+-------------------+--------------------+
|2013-11-30 00:00:00|2013-11-30          |
|2013-12-22 00:00:00|2013-12-31          |
|2013-11-12 00:00:00|2013-11-30          |
|2013-10-19 00:00:00|2013-10-31          |
|2014-05-17 00:00:00|2014-05-31          |
+-------------------+--------------------+
only showing top 5 rows



### 4. dayofweek(col), dayofmonth(col), dayofyear(col), weekofyear(col)

In [19]:
# Extract day-of-week, day-of-month, day-of-year and week-of-year
# for each distinct order date.
(
    ord
    .select(
        'order_date',
        F.dayofweek(ord.order_date),
        F.dayofmonth(ord.order_date),
        F.dayofyear(ord.order_date),
        F.weekofyear(ord.order_date),
    )
    .distinct()
    .show(5, truncate=False)
)

+-------------------+---------------------+----------------------+---------------------+----------------------+
|order_date         |dayofweek(order_date)|dayofmonth(order_date)|dayofyear(order_date)|weekofyear(order_date)|
+-------------------+---------------------+----------------------+---------------------+----------------------+
|2014-01-27 00:00:00|2                    |27                    |27                   |5                     |
|2014-04-03 00:00:00|5                    |3                     |93                   |14                    |
|2014-05-16 00:00:00|6                    |16                    |136                  |20                    |
|2013-09-02 00:00:00|2                    |2                     |245                  |36                    |
|2013-08-01 00:00:00|5                    |1                     |213                  |31                    |
+-------------------+---------------------+----------------------+---------------------+----------------------+

### 5. second(col), minute(col), hour(col), month(col), quarter(col), year(col)

In [20]:
# Extract the seconds component of each distinct order date.
(
    ord
    .select('order_date', F.second(ord.order_date))
    .distinct()
    .show(5, truncate=False)
)

+-------------------+------------------+
|order_date         |second(order_date)|
+-------------------+------------------+
|2013-07-29 00:00:00|0                 |
|2014-07-17 00:00:00|0                 |
|2014-07-22 00:00:00|0                 |
|2013-10-17 00:00:00|0                 |
|2013-12-23 00:00:00|0                 |
+-------------------+------------------+
only showing top 5 rows



### 6. months_between(date1, date2, roundOff=True)
The result is rounded off to 8 digits unless `roundOff` is set to False

In [22]:
# Months between each order date and today (rounded by default).
# order_date is the first argument, so past orders give negative values.
(
    ord
    .select(
        'order_date',
        F.months_between(ord.order_date, F.current_date()),
    )
    .distinct()
    .show(5, truncate=False)
)

+-------------------+------------------------------------------------+
|order_date         |months_between(order_date, current_date(), true)|
+-------------------+------------------------------------------------+
|2013-09-22 00:00:00|-100.16129032                                   |
|2014-05-31 00:00:00|-91.87096774                                    |
|2013-09-24 00:00:00|-100.09677419                                   |
|2013-12-08 00:00:00|-97.61290323                                    |
|2014-06-29 00:00:00|-90.93548387                                    |
+-------------------+------------------------------------------------+
only showing top 5 rows



In [24]:
# Same calculation as above, but with rounding switched off.
(
    ord
    .select(
        'order_date',
        F.months_between(ord.order_date, F.current_date(), roundOff=False),
    )
    .distinct()
    .show(5, truncate=False)
)

+-------------------+-------------------------------------------------+
|order_date         |months_between(order_date, current_date(), false)|
+-------------------+-------------------------------------------------+
|2013-09-17 00:00:00|-100.3225806451613                               |
|2013-12-13 00:00:00|-97.45161290322581                               |
|2014-05-28 00:00:00|-91.96774193548387                               |
|2014-02-06 00:00:00|-95.6774193548387                                |
|2013-10-21 00:00:00|-99.19354838709677                               |
+-------------------+-------------------------------------------------+
only showing top 5 rows



### 7. date_add(start, days), date_sub(start, days), add_months(start, months)
- `date_add` adds and `date_sub` subtracts a number of days; `add_months` adds a number of months

In [28]:
# Shift each distinct order date forward by 5 days and by 5 months.
(
    ord
    .select(
        'order_date',
        F.date_add(ord.order_date, 5).alias('date_plus_5d'),
        F.add_months(ord.order_date, 5).alias('date_plus_5m'),
    )
    .distinct()
    .show(5, truncate=False)
)

+-------------------+------------+------------+
|order_date         |date_plus_5d|date_plus_5m|
+-------------------+------------+------------+
|2013-08-26 00:00:00|2013-08-31  |2014-01-26  |
|2014-03-03 00:00:00|2014-03-08  |2014-08-03  |
|2013-11-14 00:00:00|2013-11-19  |2014-04-14  |
|2013-12-06 00:00:00|2013-12-11  |2014-05-06  |
|2013-08-13 00:00:00|2013-08-18  |2014-01-13  |
+-------------------+------------+------------+
only showing top 5 rows



### 8. datediff(end, start)
Returns number of days from 'start' to 'end'

In [34]:
# Days from today to each order date. order_date is passed as `end` and
# today as `start`, so orders in the past give negative numbers.
(
    ord
    .select(
        'order_date',
        F.current_date(),
        F.datediff('order_date', F.current_date()),
    )
    .distinct()
    .show(5)
)

+-------------------+--------------+------------------------------------+
|         order_date|current_date()|datediff(order_date, current_date())|
+-------------------+--------------+------------------------------------+
|2013-08-27 00:00:00|    2022-01-27|                               -3075|
|2014-06-14 00:00:00|    2022-01-27|                               -2784|
|2013-08-16 00:00:00|    2022-01-27|                               -3086|
|2013-09-13 00:00:00|    2022-01-27|                               -3058|
|2014-04-02 00:00:00|    2022-01-27|                               -2857|
+-------------------+--------------+------------------------------------+
only showing top 5 rows



### 9. date_trunc(format, timestamp)
- Returns timestamp truncated to the unit specified by the format
- format: 'year', 'yyyy', 'yy', 'month', 'mon', 'mm', 'day', 'dd', 'hour', 'minute', 'second', 'week', 'quarter'

In [41]:
# Truncate each order timestamp to the start of its year.
(
    ord
    .select(
        'order_date',
        F.date_trunc('yyyy', ord.order_date).alias('first_day_of_yr'),
    )
    .show(5)
)

+-------------------+-------------------+
|         order_date|    first_day_of_yr|
+-------------------+-------------------+
|2013-07-25 00:00:00|2013-01-01 00:00:00|
|2013-07-25 00:00:00|2013-01-01 00:00:00|
|2013-07-25 00:00:00|2013-01-01 00:00:00|
|2013-07-25 00:00:00|2013-01-01 00:00:00|
|2013-07-25 00:00:00|2013-01-01 00:00:00|
+-------------------+-------------------+
only showing top 5 rows



In [40]:
# Truncate each order timestamp to the start of its month.
(
    ord
    .select(
        'order_date',
        F.date_trunc('mm', ord.order_date).alias('first_day_of_month'),
    )
    .show(5)
)

+-------------------+-------------------+
|         order_date| first_day_of_month|
+-------------------+-------------------+
|2013-07-25 00:00:00|2013-07-01 00:00:00|
|2013-07-25 00:00:00|2013-07-01 00:00:00|
|2013-07-25 00:00:00|2013-07-01 00:00:00|
|2013-07-25 00:00:00|2013-07-01 00:00:00|
|2013-07-25 00:00:00|2013-07-01 00:00:00|
+-------------------+-------------------+
only showing top 5 rows



### 10. date_format(date, format)
Converts a date/timestamp/string to a string in the format specified by the `format` argument

In [44]:
# Preview the orders again before reformatting order_date.
ord.show(5)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
+--------+-------------------+-----------------+---------------+
only showing top 5 rows



In [51]:
# Render order_date as a 'yyyy/MM/dd' string.
(
    ord
    .select(
        'order_date',
        F.date_format(ord.order_date, 'yyyy/MM/dd').alias('reformated_date'),
    )
    .show(5)
)

+-------------------+---------------+
|         order_date|reformated_date|
+-------------------+---------------+
|2013-07-25 00:00:00|     2013/07/25|
|2013-07-25 00:00:00|     2013/07/25|
|2013-07-25 00:00:00|     2013/07/25|
|2013-07-25 00:00:00|     2013/07/25|
|2013-07-25 00:00:00|     2013/07/25|
+-------------------+---------------+
only showing top 5 rows



### 11. unix_timestamp(timestamp=None, format='yyyy-MM-dd HH:mm:ss')
Converts a time string with the given pattern (default yyyy-MM-dd HH:mm:ss) to a Unix timestamp (in seconds)

In [53]:
# Show the current timestamp next to its Unix-time equivalent on a
# single-row DataFrame.
(
    spark.range(1)
    .select(
        F.current_timestamp(),
        F.unix_timestamp(F.current_timestamp()),
    )
    .show()
)

+--------------------+--------------------------------------------------------+
| current_timestamp()|unix_timestamp(current_timestamp(), yyyy-MM-dd HH:mm:ss)|
+--------------------+--------------------------------------------------------+
|2022-01-27 16:10:...|                                              1643299857|
+--------------------+--------------------------------------------------------+



### 12. to_timestamp()
converts a column into timestamp

In [54]:
# One-row DataFrame whose single column `t` holds a timestamp-like string.
df = spark.createDataFrame(
    [('1997-02-28 10:30:00',)],
    schema=['t'],
)
df.show()

+-------------------+
|                  t|
+-------------------+
|1997-02-28 10:30:00|
+-------------------+



In [55]:
# verify that t is not a timestamp
df.printSchema()

root
 |-- t: string (nullable = true)



In [56]:
# Convert the string column `t` to a timestamp.
(
    df
    .select(F.to_timestamp(df['t']))
    .show()
)

+-------------------+
|    to_timestamp(t)|
+-------------------+
|1997-02-28 10:30:00|
+-------------------+



### 13. from_unixtime(timestamp, format='yyyy-MM-dd HH:mm:ss')
Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC) to string representing the timestamp of that moment in the current system time zone in the given format

In [58]:
# One-row DataFrame holding a Unix time (seconds since the epoch).
time_df = spark.createDataFrame(
    [(1428476400,)],
    schema=['unix_time'],
)
time_df.show()

+----------+
| unix_time|
+----------+
|1428476400|
+----------+



In [59]:
# Render the Unix time as a 'yyyy-MM-dd HH:mm:ss' string (the default format).
(
    time_df
    .select(F.from_unixtime('unix_time'))
    .show()
)

+---------------------------------------------+
|from_unixtime(unix_time, yyyy-MM-dd HH:mm:ss)|
+---------------------------------------------+
|                          2015-04-08 08:00:00|
+---------------------------------------------+



### 14. from_utc_timestamp(timestamp, tz)
This is a common function for databases supporting TIMESTAMP WITHOUT TIMEZONE. It takes a timestamp that is timezone-agnostic, interprets it as a timestamp in UTC, and renders it as a timestamp in the given time zone

In [62]:
# One-row DataFrame with a timestamp string `ts` and a zone abbreviation `tz`.
# NOTE(review): this rebinds `df`, replacing the DataFrame from the
# to_timestamp example.
df = spark.createDataFrame(
    [('1997-02-28 10:30:00', 'JST')],
    schema=['ts', 'tz'],
)
df.show()

+-------------------+---+
|                 ts| tz|
+-------------------+---+
|1997-02-28 10:30:00|JST|
+-------------------+---+



In [65]:
# Interpret the zone-less `ts` value as UTC and render it in US Pacific time.
# A region-based zone ID is used rather than the three-letter 'PST': Spark's
# documentation advises against short zone names because they are ambiguous.
(
    df
    .select(
        'ts',
        F.from_utc_timestamp(df.ts, 'America/Los_Angeles').alias('pst_time'),
    )
    .show()
)

+-------------------+-------------------+
|                 ts|           pst_time|
+-------------------+-------------------+
|1997-02-28 10:30:00|1997-02-28 02:30:00|
+-------------------+-------------------+



### 15. to_date(col, format=None)
Converts a column (e.g. a timestamp) to a date, using the optionally specified format

In [66]:
# Preview the orders again before converting order_date.
ord.show(5)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
+--------+-------------------+-----------------+---------------+
only showing top 5 rows



In [67]:
# Print the schema; order_date is still a timestamp at this point.
ord.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [70]:
# Convert order_date from timestamp to date.
# order_date is already a timestamp, so no parse pattern is required.
# If a pattern is ever needed for string input, month is uppercase 'MM';
# lowercase 'mm' means minute-of-hour.
(
    ord
    .select('order_date', F.to_date('order_date'))
    .show(5)
)

+-------------------+-------------------------------+
|         order_date|to_date(order_date, yyyy-mm-dd)|
+-------------------+-------------------------------+
|2013-07-25 00:00:00|                     2013-07-25|
|2013-07-25 00:00:00|                     2013-07-25|
|2013-07-25 00:00:00|                     2013-07-25|
|2013-07-25 00:00:00|                     2013-07-25|
|2013-07-25 00:00:00|                     2013-07-25|
+-------------------+-------------------------------+
only showing top 5 rows



In [None]:
# End of notebook.