# Dates and Timestamps

### Start a simple Spark Session

In [1]:
import org.apache.spark.sql.SparkSession

import org.apache.spark.sql.SparkSession


In [2]:
// Obtain the SparkSession used by every cell below. getOrCreate() reuses a session
// that is already running and only builds a new one when none exists.
val spark = SparkSession.builder().getOrCreate()

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@344e862b


### Create a DataFrame by reading a CSV file with the Spark Session

In [3]:
// Load the CitiGroup price data. "header" takes the column names from the first row,
// and "inferSchema" lets Spark detect column types (Date becomes a timestamp, prices doubles).
val df = spark.read.options(
  Map("header" -> "true", "inferSchema" -> "true")
).csv("CitiGroup2006_2008")

df: org.apache.spark.sql.DataFrame = [Date: timestamp, Open: double ... 4 more fields]


### Show Schema

In [4]:
// Print the column names and inferred types. printSchema() only produces console
// output, so it is called with explicit parentheses.
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)



### Lots of options here

http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.functions

For example: <code>add_months(startDate: org.apache.spark.sql.Column, numMonths: Int): org.apache.spark.sql.Column</code>

In [6]:
// Preview the first 5 rows of the raw data.
df.show(5)

+-------------------+-----+-----+-----+-----+-------+
|               Date| Open| High|  Low|Close| Volume|
+-------------------+-----+-----+-----+-----+-------+
|2006-01-03 00:00:00|490.0|493.8|481.1|492.9|1537660|
|2006-01-04 00:00:00|488.6|491.0|483.5|483.8|1871020|
|2006-01-05 00:00:00|484.4|487.8|484.0|486.2|1143160|
|2006-01-06 00:00:00|488.8|489.0|482.0|486.2|1370250|
|2006-01-09 00:00:00|486.0|487.4|483.0|483.9|1680740|
+-------------------+-----+-----+-----+-----+-------+
only showing top 5 rows



#### month

In [8]:
// Extract the month number from the Date timestamp column.
// `month` is one of the org.apache.spark.sql.functions helpers. The $"Date" column
// syntax presumably relies on spark.implicits._ being in scope in this notebook;
// confirm that before running the cell in another environment.
df.select(month($"Date")).show(5)

+-----------+
|month(Date)|
+-----------+
|          1|
|          1|
|          1|
|          1|
|          1|
+-----------+
only showing top 5 rows



Or, equivalently, refer to the column through the DataFrame itself:

In [10]:
// Same query, but the column is looked up through the DataFrame with df("Date")
// instead of the $"..." string interpolator. The output is identical.
df.select(month(df("Date"))).show(5)

+-----------+
|month(Date)|
+-----------+
|          1|
|          1|
|          1|
|          1|
|          1|
+-----------+
only showing top 5 rows



#### year

In [11]:
// Extract the calendar year from the Date timestamp column.
df.select(year($"Date")).show(5)

+----------+
|year(Date)|
+----------+
|      2006|
|      2006|
|      2006|
|      2006|
|      2006|
+----------+
only showing top 5 rows



### Practical Example

In [12]:
// Append a Year column derived from each row's Date so the data can be grouped by year.
val df2 = df.withColumn("Year", year(df("Date")))

df2: org.apache.spark.sql.DataFrame = [Date: timestamp, Open: double ... 5 more fields]


In [13]:
// Confirm that the Year column now appears after Volume.
df2.show(5)

+-------------------+-----+-----+-----+-----+-------+----+
|               Date| Open| High|  Low|Close| Volume|Year|
+-------------------+-----+-----+-----+-----+-------+----+
|2006-01-03 00:00:00|490.0|493.8|481.1|492.9|1537660|2006|
|2006-01-04 00:00:00|488.6|491.0|483.5|483.8|1871020|2006|
|2006-01-05 00:00:00|484.4|487.8|484.0|486.2|1143160|2006|
|2006-01-06 00:00:00|488.8|489.0|482.0|486.2|1370250|2006|
|2006-01-09 00:00:00|486.0|487.4|483.0|483.9|1680740|2006|
+-------------------+-----+-----+-----+-----+-------+----+
only showing top 5 rows



### Mean per Year (notice the large drop in 2008!)

In [14]:
// Average every numeric column within each Year group. Called with no arguments, the
// aggregation covers all numeric columns, which is why the grouping key itself also
// appears as avg(Year) in the result.
val dfavgs = df2.groupBy("Year").avg()

dfavgs: org.apache.spark.sql.DataFrame = [Year: int, avg(Open): double ... 5 more fields]


In [15]:
// Display the per-year averages, one row per year.
dfavgs.show()

+----+------------------+-----------------+------------------+------------------+--------------------+---------+
|Year|         avg(Open)|        avg(High)|          avg(Low)|        avg(Close)|         avg(Volume)|avg(Year)|
+----+------------------+-----------------+------------------+------------------+--------------------+---------+
|2007| 478.8549800796812|483.7444223107569|472.82892430278906| 477.8203984063745|   4107307.721115538|   2007.0|
|2006|489.29402390438236|492.4109163346613|486.12868525896414| 489.2697211155379|  1544542.6294820716|   2006.0|
|2008|191.67707509881424|197.3620553359684|  185.007628458498|190.48893280632404|1.3218876802371541E7|   2008.0|
+----+------------------+-----------------+------------------+------------------+--------------------+---------+



In [16]:
// Keep only the year and its mean closing price. The aggregated column is selected by
// its generated name, "avg(Close)". Rows appear in the order the aggregation returned
// them and are not sorted by Year.
dfavgs.select("Year","avg(Close)").show()

+----+------------------+
|Year|        avg(Close)|
+----+------------------+
|2007| 477.8203984063745|
|2006| 489.2697211155379|
|2008|190.48893280632404|
+----+------------------+



### Closing Spark Session

In [17]:
// Shut down the Spark session now that the notebook is finished with it.
spark.stop()

## Thank You!