# Working with dates

## Download and install Spark

In [None]:
# Shell escape: list the contents of the current working directory.
!ls

In [None]:
# One-time environment setup, left commented out; uncomment to run.
# The steps refresh apt, install a headless OpenJDK 8, download and unpack the
# Spark 2.3.1 (Hadoop 2.7) tarball, and pip-install findspark.
#!apt-get update
#!apt-get install openjdk-8-jdk-headless -qq > /dev/null
#!wget -q http://archive.apache.org/dist/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz
#!tar xf spark-2.3.1-bin-hadoop2.7.tgz
#!pip install -q findspark

## Setup environment

In [1]:
import os

import findspark

# Keep this call ahead of the pyspark imports: findspark.init() locates the
# Spark installation and makes its pyspark package importable.
findspark.init()

import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Reuse a running context/session if one exists, otherwise start them.
# The context is obtained first, then the session, as in the original cell.
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()

# Last expression of the cell: display the session summary.
spark

## Downloading and preprocessing Chicago's Reported Crime Data

In [None]:
# Left commented out; uncomment to download the CSV export from the
# data.cityofchicago.org URL below and then list the directory.
#!wget https://data.cityofchicago.org/api/views/ijzp-q8t2/rows.csv?accessType=DOWNLOAD
#!ls -l

In [None]:
# Left commented out; uncomment to rename the downloaded file (whose name
# still carries the URL query string) to reported-crimes.csv, then list the
# directory to confirm.
#!mv rows.csv\?accessType\=DOWNLOAD reported-crimes.csv
#!ls -l

In [None]:
from pyspark.sql.functions import to_timestamp, col, lit

# Load the crime CSV (first row is the header), parse the textual 'Date'
# column as a 12-hour-clock timestamp with AM/PM marker, and keep only the
# rows whose parsed date is earlier than 2018-11-12.
# NOTE(review): the file is read from two directories up, while the
# commented-out rename cell would leave reported-crimes.csv in the working
# directory -- confirm the intended location.
rc = (
    spark.read.csv('../../reported-crimes.csv', header=True)
    .withColumn('Date', to_timestamp(col('Date'), 'MM/dd/yyyy hh:mm:ss a'))
    .filter(col('Date') < lit('2018-11-12'))
)

# Preview the first five rows.
rc.show(5)

## Working with dates

In [2]:
from pyspark.sql.functions import to_date, to_timestamp, lit

**2019-12-25 13:30:00**

In [3]:
# One-row DataFrame with a single string column, 'Christmas', holding the
# date-time written as year-month-day with a 24-hour clock.
df = spark.createDataFrame(
    [("2019-12-25 13:30:00",)],
    schema=["Christmas"],
)
df.show(n=1)


+-------------------+
|          Christmas|
+-------------------+
|2019-12-25 13:30:00|
+-------------------+



In [12]:
# IPython help lookup: show the documentation for to_timestamp.
# NOTE(review): interactive-only syntax; consider dropping it from the final notebook.
to_timestamp?

In [13]:
# Parse the 'Christmas' string twice with the same pattern: once as a date
# and once as a full timestamp, shown side by side.
df.select(
    to_date("Christmas", format="yyyy-MM-dd HH:mm:ss"),
    to_timestamp("Christmas", format="yyyy-MM-dd HH:mm:ss"),
).show(n=1)

+-------------------------------------------+------------------------------------------------+
|to_date(`Christmas`, 'yyyy-MM-dd HH:mm:ss')|to_timestamp(`Christmas`, 'yyyy-MM-dd HH:mm:ss')|
+-------------------------------------------+------------------------------------------------+
|                                 2019-12-25|                             2019-12-25 13:30:00|
+-------------------------------------------+------------------------------------------------+



**25/Dec/2019 13:30:00**

In [14]:
# Rebuild the one-row 'Christmas' frame, this time with the date written as
# day/abbreviated-month-name/year followed by a 24-hour time.
df = spark.createDataFrame(
    [("25/Dec/2019 13:30:00",)],
    schema=["Christmas"],
)
df.show(n=1)



+--------------------+
|           Christmas|
+--------------------+
|25/Dec/2019 13:30:00|
+--------------------+



In [15]:
# Same conversion as before, with a pattern that matches the
# day/month-name/year layout ('MMM' is the abbreviated month name).
df.select(
    to_date("Christmas", format="dd/MMM/yyyy HH:mm:ss"),
    to_timestamp("Christmas", format="dd/MMM/yyyy HH:mm:ss"),
).show(n=1)

+--------------------------------------------+-------------------------------------------------+
|to_date(`Christmas`, 'dd/MMM/yyyy HH:mm:ss')|to_timestamp(`Christmas`, 'dd/MMM/yyyy HH:mm:ss')|
+--------------------------------------------+-------------------------------------------------+
|                                  2019-12-25|                              2019-12-25 13:30:00|
+--------------------------------------------+-------------------------------------------------+



**12/25/2019 01:30:00 PM**

In [22]:
# Rebuild the one-row 'Christmas' frame with a month/day/year date and a
# 12-hour time carrying an AM/PM marker.
df = spark.createDataFrame(
    [("12/25/2019 01:30:00 PM",)],
    schema=["Christmas"],
)
# truncate=False so the full string is printed rather than cut off.
df.show(n=1, truncate=False)




+----------------------+
|Christmas             |
+----------------------+
|12/25/2019 01:30:00 PM|
+----------------------+



In [26]:
# Parse the 12-hour-clock string: 'hh' is the 1-12 hour and 'a' the AM/PM
# marker, so "01:30:00 PM" comes back as 13:30:00 in the timestamp column.
df.select(
    to_date("Christmas", format="MM/dd/yyyy hh:mm:ss a"),
    to_timestamp("Christmas", format="MM/dd/yyyy hh:mm:ss a"),
).show(n=1)


+---------------------------------------------+--------------------------------------------------+
|to_date(`Christmas`, 'MM/dd/yyyy hh:mm:ss a')|to_timestamp(`Christmas`, 'MM/dd/yyyy hh:mm:ss a')|
+---------------------------------------------+--------------------------------------------------+
|                                   2019-12-25|                               2019-12-25 13:30:00|
+---------------------------------------------+--------------------------------------------------+

