In [1]:
# Initialise findspark before pyspark is imported (original ordering kept).
import findspark

findspark.init()

from pyspark.sql import SparkSession

# One local session named "dateApp", reused by every cell below.
spark = (
    SparkSession.builder
    .master("local[3]")
    .appName("dateApp")
    .getOrCreate()
)

## to_date(col,format=None)
+ Converts a string column to a date column

In [205]:
from pyspark.sql.functions import to_date

# Source frame: a single string column holding a date-time value.
df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
df.printSchema()

# Convert once, then inspect both the values and the resulting schema.
date_df = df.select(to_date("t").alias('date'))
date_df.show()
date_df.printSchema()

root
 |-- t: string (nullable = true)

+----------+
|      date|
+----------+
|1997-02-28|
+----------+

root
 |-- date: date (nullable = true)



+ This is the same as casting the string column to date when no format is given

In [210]:
# `col` is used here but the notebook only imports it in a much later cell,
# so this cell fails with NameError on a fresh kernel. Import it locally.
from pyspark.sql.functions import col

# Casting the string column to "date" gives the same result as to_date("t").
df.select(col("t").cast("date")).show()

+----------+
|         t|
+----------+
|1997-02-28|
+----------+



+ Returns null if the format of the string in the column is not default

In [209]:
# Slash-separated value, i.e. not in the layout to_date parses by default.
slash_rows = [('1997/02/28 10:30:00',)]
df_1 = spark.createDataFrame(slash_rows, ['t1'])

# No format is supplied, so the value cannot be parsed and shows as null.
df_1.select(to_date("t1").alias('date')).show()

+----+
|date|
+----+
|null|
+----+



+ The format must match the input column values: 'yyyy/MM/dd HH:mm:ss' in this case

In [199]:
# The pattern must mirror the input exactly: four-digit year ('yyyy') and the
# 24-hour field ('HH'). The previous pattern used 'yyy' and the 12-hour 'hh',
# which cannot represent afternoon times such as 13:30:00.
df_1.select(to_date("t1", "yyyy/MM/dd HH:mm:ss").alias('date')).show()

+----------+
|      date|
+----------+
|1997-02-28|
+----------+



## date_format(date,format)
+ Converts a date/timestamp/string to a string
+ formatted according to the date pattern given as the second argument

In [99]:
from pyspark.sql.functions import date_format

# Single ISO-formatted date held as a string.
df = spark.createDataFrame([('2015-04-08',)], ['dt'])

# Render as month/day/year. The year is spelled 'yyyy' (four digits); the
# previous three-letter 'yyy' was a typo for the conventional pattern.
df.select(date_format('dt', 'MM/dd/yyyy').alias('date')).show()

+----------+
|      date|
+----------+
|04/08/2015|
+----------+



## date_add(start,days) and date_sub(start,days)

In [89]:
from pyspark.sql.functions import date_add,date_sub

df = spark.createDataFrame([('2021-04-08',)], ['dt'])
start = df["dt"]

# Shift forward by one day and by 365 days.
df.select(date_add(start, 1).alias('next_date')).show()
df.select(date_add(start, 365).alias('one_year_later')).show()

# Shift backward by one day.
df.select(date_sub(start, 1).alias('prev_date')).show()

+----------+
| next_date|
+----------+
|2021-04-09|
+----------+

+--------------+
|one_year_later|
+--------------+
|    2022-04-08|
+--------------+

+----------+
| prev_date|
+----------+
|2021-04-07|
+----------+



## datediff(end,start) 

In [188]:
from pyspark.sql.functions import col,datediff

df = spark.createDataFrame([('2021-03-18','2021-07-28')], ['dt1','dt2'])

# datediff takes the end date first, then the start date.
end_col = col("dt2")
start_col = col("dt1")
df.select(datediff(end_col, start_col)).show()

+------------------+
|datediff(dt2, dt1)|
+------------------+
|               132|
+------------------+



In [150]:
# Same computation, passing plain column names instead of Column objects.
day_gap = datediff("dt2", "dt1")
df.select(day_gap).show()

+------------------+
|datediff(dt2, dt1)|
+------------------+
|               132|
+------------------+



## months_between(date1,date2,roundOff=True)

+ Returns number of months between dates date1 and date2.
+ If date1 is later than date2, then the result is positive. 
+ If date1 and date2 are on the same day of month, or both are the last day of month, returns an integer (time of day will be ignored). 
+ The result is rounded off to 8 digits unless roundOff is set to False.

In [151]:
from pyspark.sql.functions import months_between

df = spark.createDataFrame([('2021-03-18','2021-07-28')], ['dt1','dt2'])

# Default call: the fractional month count is rounded.
month_gap = months_between("dt2", "dt1")
df.select("dt2", "dt1", month_gap).show()

+----------+----------+------------------------------+
|       dt2|       dt1|months_between(dt2, dt1, true)|
+----------+----------+------------------------------+
|2021-07-28|2021-03-18|                    4.32258065|
+----------+----------+------------------------------+



In [152]:
# Third argument False switches the rounding off.
month_gap_exact = months_between("dt2", "dt1", False)
df.select("dt2", "dt1", month_gap_exact).show()

+----------+----------+-------------------------------+
|       dt2|       dt1|months_between(dt2, dt1, false)|
+----------+----------+-------------------------------+
|2021-07-28|2021-03-18|               4.32258064516129|
+----------+----------+-------------------------------+



## dayofmonth(col), dayofweek(col) ,dayofyear(col), weekofyear(col)

+ dayofweek numbers the days starting from Sunday (1 = Sunday, ..., 7 = Saturday)

In [190]:
from pyspark.sql.functions import dayofweek,dayofmonth,dayofyear,weekofyear

# One-row frame reused by the day-of-*, last_day and next_day examples.
sample_rows = [('2021-03-18',)]
df = spark.createDataFrame(sample_rows, ['dt1'])

In [191]:
# Show the date next to each of its day/week positions.
date_name = "dt1"
df.select(
    date_name,
    dayofweek(date_name),
    dayofmonth(date_name),
    dayofyear(date_name),
    weekofyear(date_name),
).show()

+----------+--------------+---------------+--------------+---------------+
|       dt1|dayofweek(dt1)|dayofmonth(dt1)|dayofyear(dt1)|weekofyear(dt1)|
+----------+--------------+---------------+--------------+---------------+
|2021-03-18|             5|             18|            77|             11|
+----------+--------------+---------------+--------------+---------------+



## last_day(date) 

+ Returns last_day of the month of the date passed

In [143]:
from pyspark.sql.functions import last_day

In [144]:
# Pair the date with the last day of its month.
month_end = last_day("dt1")
df.select("dt1", month_end).show()

+----------+-------------+
|       dt1|last_day(dt1)|
+----------+-------------+
|2021-03-18|   2021-03-31|
+----------+-------------+



## next_day(date,dayofweek)

+ Returns the first date that is later than the value of the date column and falls on the given day of the week.
+ Day of the week parameter is case insensitive, and accepts:
  “Mon”, “Tue”, “Wed”, “Thu”, “Fri”, “Sat”, “Sun”.

In [159]:
from pyspark.sql.functions import dayofweek,next_day

# The date, its day-of-week number, and the next Thursday after it.
weekday_no = dayofweek("dt1")
following_thu = next_day("dt1", "Thu")
df.select("dt1", weekday_no, following_thu).show()

+----------+--------------+------------------+
|       dt1|dayofweek(dt1)|next_day(dt1, Thu)|
+----------+--------------+------------------+
|2021-03-18|             5|        2021-03-25|
+----------+--------------+------------------+



# Timestamp

## from_unixtime(timestamp,format='uuuu-MM-dd HH:mm:ss')

In [171]:
from pyspark.sql.functions import from_unixtime,unix_timestamp

# Pin the session time zone used by the unix-time conversions below.
session_tz = "America/Los_Angeles"
spark.conf.set("spark.sql.session.timeZone", session_tz)

In [181]:
# A single Unix time value (seconds since the epoch).
epoch_rows = [(1428476400,)]
time_df = spark.createDataFrame(epoch_rows, ['unix_time'])
time_df.show()

+----------+
| unix_time|
+----------+
|1428476400|
+----------+



In [182]:
# Render the Unix time as a timestamp string using the default layout.
ts_col = from_unixtime('unix_time').alias('ts')
time_df_new = time_df.select(ts_col)
time_df_new.show()

+-------------------+
|                 ts|
+-------------------+
|2015-04-08 00:00:00|
+-------------------+



In [183]:
# Convert from a Unix timestamp to an explicitly specified format.
date_only_col = from_unixtime('unix_time', "yyyy/MM/dd").alias('ts1')
time_df_new2 = time_df.select(date_only_col)
time_df_new2.show()

+----------+
|       ts1|
+----------+
|2015/04/08|
+----------+



## unix_timestamp(timestamp=None,format='uuuu-MM-dd HH:mm:ss')
+ Convert time string with given pattern (‘uuuu-MM-dd HH:mm:ss’, by default) to Unix time stamp (in seconds), 
+ using the default timezone and the default locale, 
+ return null if fail.

In [184]:
# Round trip: parse the formatted string back into Unix seconds.
epoch_col = unix_timestamp("ts")
time_df_new.select(epoch_col).show()

+---------------------------------------+
|unix_timestamp(ts, yyyy-MM-dd HH:mm:ss)|
+---------------------------------------+
|                             1428476400|
+---------------------------------------+



In [185]:
# Remove the session time-zone override again.
tz_option = "spark.sql.session.timeZone"
spark.conf.unset(tz_option)

## to_timestamp(col,format=None)
+ Default format is 'yyyy-MM-dd HH:mm:ss'
+ Pass format value if the source value is in any other format

In [130]:
# Two string columns: one in the default 'yyyy-MM-dd HH:mm:ss' layout and one
# slash-separated. The first value previously ended in ':25', which contradicted
# every output recorded in this section (all show 10:30:00 and second = 0).
df = spark.createDataFrame(
    [('1997-02-28 10:30:00', '1997/02/28 08:30:00')],
    ['timestamp_col1', 'timestamp_col2'],
)
df.show()

+-------------------+-------------------+
|     timestamp_col1|     timestamp_col2|
+-------------------+-------------------+
|1997-02-28 10:30:00|1997/02/28 08:30:00|
+-------------------+-------------------+



In [131]:
from pyspark.sql.functions import to_timestamp,col

In [133]:
# Both columns are still plain strings here; nothing has been parsed yet.
df.printSchema()

root
 |-- timestamp_col1: string (nullable = true)
 |-- timestamp_col2: string (nullable = true)



In [134]:
# With no format argument, to_timestamp assumes 'yyyy-MM-dd HH:mm:ss'.
parsed_col1 = to_timestamp("timestamp_col1").alias("timestamp_col1")
df_t1 = df.select(parsed_col1, "timestamp_col2")
df_t1.show()

+-------------------+-------------------+
|     timestamp_col1|     timestamp_col2|
+-------------------+-------------------+
|1997-02-28 10:30:00|1997/02/28 08:30:00|
+-------------------+-------------------+



In [136]:
# Notice the change in datatype of the column passed to to_timestamp:
# timestamp_col1 is now a timestamp, while timestamp_col2 is still a string.
df_t1.printSchema()

root
 |-- timestamp_col1: timestamp (nullable = true)
 |-- timestamp_col2: string (nullable = true)



In [137]:
# timestamp_col2 is slash-separated, i.e. not in the default
# 'yyyy-MM-dd HH:mm:ss' layout. Because no source format is passed,
# to_timestamp cannot parse it and returns null.
unparsed_col2 = to_timestamp("timestamp_col2")
df_t2 = df_t1.select("timestamp_col1", unparsed_col2)
df_t2.show()

+-------------------+------------------------------+
|     timestamp_col1|to_timestamp(`timestamp_col2`)|
+-------------------+------------------------------+
|1997-02-28 10:30:00|                          null|
+-------------------+------------------------------+



In [138]:
# Without a format, to_timestamp behaves the same as cast('timestamp').
cast_df = df.select(
    col("timestamp_col1").cast("timestamp"),
    col("timestamp_col2").cast("timestamp"),
)
cast_df.printSchema()
cast_df.show()

root
 |-- timestamp_col1: timestamp (nullable = true)
 |-- timestamp_col2: timestamp (nullable = true)

+-------------------+--------------+
|     timestamp_col1|timestamp_col2|
+-------------------+--------------+
|1997-02-28 10:30:00|          null|
+-------------------+--------------+



In [82]:
# Supply the source layout explicitly: to_timestamp("col", 'yyyy/MM/dd HH:mm:ss')
df_t1.show()

parsed_col2 = to_timestamp("timestamp_col2", 'yyyy/MM/dd HH:mm:ss').alias("timestamp_col2")
df_t3 = df_t1.select("timestamp_col1", parsed_col2)
df_t3.printSchema()
df_t3.show()

+-------------------+-------------------+
|     timestamp_col1|     timestamp_col2|
+-------------------+-------------------+
|1997-02-28 10:30:00|1997/02/28 08:30:00|
+-------------------+-------------------+

root
 |-- timestamp_col1: timestamp (nullable = true)
 |-- timestamp_col2: timestamp (nullable = true)

+-------------------+-------------------+
|     timestamp_col1|     timestamp_col2|
+-------------------+-------------------+
|1997-02-28 10:30:00|1997-02-28 08:30:00|
+-------------------+-------------------+



## hour(col), minute(col), second(col)

In [139]:
# Starting point for this section: both columns of df_t3 are timestamps.
df_t3.show()

+-------------------+-------------------+
|     timestamp_col1|     timestamp_col2|
+-------------------+-------------------+
|1997-02-28 10:30:00|1997-02-28 08:30:00|
+-------------------+-------------------+



In [145]:
from pyspark.sql.functions import hour,minute,second,month

# Pull the individual time fields (plus the month) out of timestamp_col1.
ts_name = "timestamp_col1"
df_t3.select(
    ts_name,
    hour(ts_name),
    minute(ts_name),
    second(ts_name),
    month(ts_name),
).show()

+-------------------+--------------------+----------------------+----------------------+---------------------+
|     timestamp_col1|hour(timestamp_col1)|minute(timestamp_col1)|second(timestamp_col1)|month(timestamp_col1)|
+-------------------+--------------------+----------------------+----------------------+---------------------+
|1997-02-28 10:30:00|                  10|                    30|                     0|                    2|
+-------------------+--------------------+----------------------+----------------------+---------------------+



## date_trunc(format,timestamp)
+ Returns timestamp truncated to the unit specified by the format.

In [103]:
from pyspark.sql.functions import date_trunc

# One date-time string, reused by the truncation examples below.
trunc_rows = [('1997-02-28 05:02:11',)]
df = spark.createDataFrame(trunc_rows, ['timestamp_col'])

In [107]:
# Truncate to the first instant of the year.
year_start = date_trunc('year', "timestamp_col")
df.select(year_start).show()

+-------------------------------+
|date_trunc(year, timestamp_col)|
+-------------------------------+
|            1997-01-01 00:00:00|
+-------------------------------+



In [108]:
# Truncate to the first instant of the month.
month_start = date_trunc('mon', "timestamp_col")
df.select(month_start).show()

+------------------------------+
|date_trunc(mon, timestamp_col)|
+------------------------------+
|           1997-02-01 00:00:00|
+------------------------------+



In [109]:
# Truncate to midnight of the same day.
day_start = date_trunc('day', "timestamp_col")
df.select(day_start).show()

+------------------------------+
|date_trunc(day, timestamp_col)|
+------------------------------+
|           1997-02-28 00:00:00|
+------------------------------+

