In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# Build (or reuse) a local SparkSession named 'aggDataFrames' with Hive support enabled.
spark = (
    SparkSession.builder
    .master('local')
    .appName('aggDataFrames')
    .enableHiveSupport()
    .getOrCreate()
)
# Keep a handle on the SparkContext that backs the session.
sc = spark.sparkContext

In [3]:
# Load the retail_db CSV files: the first line of each file is the header
# and column types are inferred from the data.
orders = spark.read.options(header=True, inferSchema=True).csv('data//retail_db//orders.csv')
order_items = spark.read.options(header=True, inferSchema=True).csv('data//retail_db//order_items.csv')
customers = spark.read.options(header=True, inferSchema=True).csv('data//retail_db//customers.csv')
products = spark.read.options(header=True, inferSchema=True).csv('data//retail_db//products.csv')

## current_date()

In [4]:
# One-row, one-column DataFrame; the cells below use it to evaluate date/time expressions.
df = spark.createDataFrame(data=[(1,)])

In [5]:
# Evaluate current_date() over the one-row frame and print the result.
df.select(
    current_date()
).show()

+--------------+
|current_date()|
+--------------+
|    2020-04-04|
+--------------+



## current_timestamp()

In [13]:
# Evaluate current_timestamp() over the one-row frame; show() truncates the long value.
df.select(
    current_timestamp()
).show()

+--------------------+
| current_timestamp()|
+--------------------+
|2020-04-04 08:45:...|
+--------------------+



In [12]:
# Fetch the Row itself rather than printing it, so the full datetime value is visible.
df.select(
    current_timestamp()
).take(num=1)

[Row(current_timestamp()=datetime.datetime(2020, 4, 4, 8, 45, 43, 849000))]

## date_format()

### Converts a date/timestamp to a string in the given format

In [15]:
# Format the current timestamp as a string.
# Pattern letters: a = AM/PM marker, HH = hour of day (0-23),
# hh = hour of AM/PM (1-12), S = fraction of second.
df.select(
    date_format(current_timestamp(), 'yyyy-MM-dd a HH:hh:mm:ss.S').alias('date')
).show()

+--------------------+
|                date|
+--------------------+
|2020-04-04 AM 08:...|
+--------------------+



In [16]:
# Same formatting as the previous cell, but take() returns the Row so the
# formatted string is not truncated the way show() truncates it.
df.select(
    date_format(current_timestamp(), 'yyyy-MM-dd a HH:hh:mm:ss.S').alias('date')
).take(num=1)

[Row(date='2020-04-04 AM 08:08:46:14.421')]

## Extraction in date

In [18]:
# Extract the year component of the current timestamp.
df.select(
    year(current_timestamp()).alias('date_year')
).show()

+---------+
|date_year|
+---------+
|     2020|
+---------+



In [19]:
# Extract the month component of the current timestamp.
df.select(
    month(current_timestamp()).alias('date_month')
).show()

+----------+
|date_month|
+----------+
|         4|
+----------+



### Functions available for extracting date/time parts
year(), quarter(), month(), dayofmonth(), dayofweek(), dayofyear(), hour(), minute(), second(), weekofyear()



## date_add

In [24]:
# Show each order date next to the date one day later.
orders.select(
    orders['order_date'],
    date_add(orders['order_date'], 1),
).show(5)

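+-------------------+-----------------------+
|         order_date|date_add(order_date, 1)|
+-------------------+-----------------------+
|2013-07-25 00:00:00|             2013-07-26|
|2013-07-25 00:00:00|             2013-07-26|
|2013-07-25 00:00:00|             2013-07-26|
|2013-07-25 00:00:00|             2013-07-26|
|2013-07-25 00:00:00|             2013-07-26|
+-------------------+-----------------------+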
only showing top 5 rows



In [26]:
# date_add also accepts a negative day count, which moves the date backwards.
orders.select(
    orders['order_date'],
    date_add(orders['order_date'], -1),
).show(5)

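+-------------------+------------------------+
|         order_date|date_add(order_date, -1)|
+-------------------+------------------------+
|2013-07-25 00:00:00|              2013-07-24|
|2013-07-25 00:00:00|              2013-07-24|
|2013-07-25 00:00:00|              2013-07-24|
|2013-07-25 00:00:00|              2013-07-24|
|2013-07-25 00:00:00|              2013-07-24|
+-------------------+------------------------+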
only showing top 5 rows



## date_sub

In [25]:
# Show each order date next to the date one day earlier.
orders.select(
    orders['order_date'],
    date_sub(orders['order_date'], 1),
).show(5)

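+-------------------+-----------------------+
|         order_date|date_sub(order_date, 1)|
+-------------------+-----------------------+
|2013-07-25 00:00:00|             2013-07-24|
|2013-07-25 00:00:00|             2013-07-24|
|2013-07-25 00:00:00|             2013-07-24|
|2013-07-25 00:00:00|             2013-07-24|
|2013-07-25 00:00:00|             2013-07-24|
+-------------------+-----------------------+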
only showing top 5 rows



In [28]:
# date_sub also accepts a negative day count, which moves the date forwards.
orders.select(
    orders['order_date'],
    date_sub(orders['order_date'], -1),
).show(5)

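+-------------------+------------------------+
|         order_date|date_sub(order_date, -1)|
+-------------------+------------------------+
|2013-07-25 00:00:00|              2013-07-26|
|2013-07-25 00:00:00|              2013-07-26|
|2013-07-25 00:00:00|              2013-07-26|
|2013-07-25 00:00:00|              2013-07-26|
|2013-07-25 00:00:00|              2013-07-26|
+-------------------+------------------------+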
only showing top 5 rows



## datediff

In [30]:
# Number of days from (order_date - 1 day) to order_date; datediff takes (end, start).
orders.select(
    datediff(
        orders['order_date'],
        date_sub(orders['order_date'], 1),
    )
).show(5)

+---------------------------------------------+
|datediff(order_date, date_sub(order_date, 1))|
+---------------------------------------------+
|                                            1|
|                                            1|
|                                            1|
|                                            1|
|                                            1|
+---------------------------------------------+
only showing top 5 rows



## add_months

In [33]:
# Show each order date next to the date one month later.
orders.select(
    orders['order_date'],
    add_months(orders['order_date'], 1),
).show(5)

+-------------------+-------------------------+
|         order_date|add_months(order_date, 1)|
+-------------------+-------------------------+
|2013-07-25 00:00:00|               2013-08-25|
|2013-07-25 00:00:00|               2013-08-25|
|2013-07-25 00:00:00|               2013-08-25|
|2013-07-25 00:00:00|               2013-08-25|
|2013-07-25 00:00:00|               2013-08-25|
+-------------------+-------------------------+
only showing top 5 rows



In [35]:
# add_months also accepts a negative month count, which moves the date backwards.
orders.select(
    orders['order_date'],
    add_months(orders['order_date'], -1),
).show(5)

+-------------------+--------------------------+
|         order_date|add_months(order_date, -1)|
+-------------------+--------------------------+
|2013-07-25 00:00:00|                2013-06-25|
|2013-07-25 00:00:00|                2013-06-25|
|2013-07-25 00:00:00|                2013-06-25|
|2013-07-25 00:00:00|                2013-06-25|
|2013-07-25 00:00:00|                2013-06-25|
+-------------------+--------------------------+
only showing top 5 rows



## months_between

In [39]:
# Months between the order date and a date one month plus ten days after it;
# the extra ten days show up as the fractional part of the result.
orders.select(
    months_between(
        date_add(add_months(orders['order_date'], 1), 10),
        orders['order_date'],
    ).alias('months_btwn')
).show(5)

+-----------+
|months_btwn|
+-----------+
| 1.32258065|
| 1.32258065|
| 1.32258065|
| 1.32258065|
| 1.32258065|
+-----------+
only showing top 5 rows



## next_day
#### Returns the first date after the given date that falls on the specified day of the week

In [41]:
# Show each order date next to the first Sunday that follows it.
orders.select(
    orders['order_date'],
    next_day(orders['order_date'], 'sun').alias('Next_Sunday'),
).show(5)

+-------------------+-----------+
|         order_date|Next_Sunday|
+-------------------+-----------+
|2013-07-25 00:00:00| 2013-07-28|
|2013-07-25 00:00:00| 2013-07-28|
|2013-07-25 00:00:00| 2013-07-28|
|2013-07-25 00:00:00| 2013-07-28|
|2013-07-25 00:00:00| 2013-07-28|
+-------------------+-----------+
only showing top 5 rows



## last_day
#### Gets the last day of the month

In [42]:
# Show each order date next to the last day of its month.
orders.select(
    orders['order_date'],
    last_day(orders['order_date']).alias('Last_day_of_the_month'),
).show(5)

+-------------------+---------------------+
|         order_date|Last_day_of_the_month|
+-------------------+---------------------+
|2013-07-25 00:00:00|           2013-07-31|
|2013-07-25 00:00:00|           2013-07-31|
|2013-07-25 00:00:00|           2013-07-31|
|2013-07-25 00:00:00|           2013-07-31|
|2013-07-25 00:00:00|           2013-07-31|
+-------------------+---------------------+
only showing top 5 rows



## trunc

In [45]:
# Truncate each order date to the first day of its year and of its month.
orders.select(
    trunc(orders['order_date'], 'year'),
    trunc(orders['order_date'], 'month'),
).show(5)



+-----------------------+------------------------+
|trunc(order_date, year)|trunc(order_date, month)|
+-----------------------+------------------------+
|             2013-01-01|              2013-07-01|
|             2013-01-01|              2013-07-01|
|             2013-01-01|              2013-07-01|
|             2013-01-01|              2013-07-01|
|             2013-01-01|              2013-07-01|
+-----------------------+------------------------+
only showing top 5 rows



# Unix Date/Time

## from_unixtime 
#### Converts seconds since the Unix epoch (1 Jan 1970, UTC) to a timestamp string in the given format

In [46]:
# Single-row frame holding one epoch-seconds value in column 'unix_time'.
time_df = spark.createDataFrame(data=[(1428476400,)], schema=['unix_time'])
# Render the epoch seconds as a string in a custom pattern and collect the Row.
time_df.select(
    from_unixtime('unix_time', format="yyyy~MM~dd HH:mm:ss.SSSSSS").alias('ts')
).collect()

[Row(ts='2015~04~08 07:00:00.000000')]

In [47]:
# Same conversion as the previous cell, printed with show() (which truncates the long string).
time_df.select(
    from_unixtime('unix_time', format="yyyy~MM~dd HH:mm:ss.SSSSSS").alias('ts')
).show()

+--------------------+
|                  ts|
+--------------------+
|2015~04~08 07:00:...|
+--------------------+



## unix_timestamp 
#### Converts a date/timestamp into seconds since the Unix epoch (1 Jan 1970, UTC)

In [48]:
# Convert each order date to epoch seconds using the default 'yyyy-MM-dd HH:mm:ss' pattern.
orders.select(
    unix_timestamp(orders['order_date'])
).show(5)

+-----------------------------------------------+
|unix_timestamp(order_date, yyyy-MM-dd HH:mm:ss)|
+-----------------------------------------------+
|                                     1374710400|
|                                     1374710400|
|                                     1374710400|
|                                     1374710400|
|                                     1374710400|
+-----------------------------------------------+
only showing top 5 rows

