In [1]:
from pyspark.sql import SparkSession
# Get the active Spark session, or create one, under the application name "date".
spark = (
    SparkSession.builder
    .appName("date")
    .getOrCreate()
)

In [None]:
# Peek at the beginning of the raw CSV stored in HDFS before loading it with Spark.
! hadoop fs -head  /user/tkm/data/order_wd.csv

###### Creating a DataFrame with inferSchema: a date column in a non-default format (here MM/dd/yy) is inferred as string

In [6]:
# Read the orders CSV (first row is the header) and let Spark infer the column types.
df1 = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("hdfs://localhost:9000/user/tkm/data/order_wd.csv")
)

In [7]:
# Display the first 5 rows of the inferred-schema DataFrame.
df1.show(n=5)

+--------+----------+-----------+---------------+
|order_id|order_date|customer_id|   order_status|
+--------+----------+-----------+---------------+
|       1|  07/25/13|      11599|         CLOSED|
|       2|  07/25/13|        256|PENDING_PAYMENT|
|       3|  07/25/13|      12111|       COMPLETE|
|       4|  07/25/13|       8827|         CLOSED|
|       5|  07/25/13|      11318|       COMPLETE|
+--------+----------+-----------+---------------+
only showing top 5 rows



In [8]:
# Print the column types Spark inferred for df1 (check what order_date was inferred as).
df1.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



###### Creating a DataFrame with an enforced schema: the dates cannot be parsed (order_date becomes null) because the default date format is yyyy-MM-dd

In [10]:
# DDL-style schema string passed to the reader; order_date is declared as a date here.
orders_schema = 'order_id long , order_date date , customer_id integer , order_status string'

In [11]:
# Read the same CSV with the explicit schema from orders_schema and no dateFormat option.
df2 = (
    spark.read
    .format("csv")
    .options(header="true")
    .schema(orders_schema)
    .load("hdfs://localhost:9000/user/tkm/data/order_wd.csv")
)

In [12]:
# Display the first 4 rows to inspect how order_date was read without a dateFormat.
df2.show(n=4)

+--------+----------+-----------+---------------+
|order_id|order_date|customer_id|   order_status|
+--------+----------+-----------+---------------+
|       1|      null|      11599|         CLOSED|
|       2|      null|        256|PENDING_PAYMENT|
|       3|      null|      12111|       COMPLETE|
|       4|      null|       8827|         CLOSED|
+--------+----------+-----------+---------------+
only showing top 4 rows



###### Creating a DataFrame with an enforced schema and the dateFormat option

In [18]:
# Read the CSV with the explicit schema and tell the reader that dates are written as MM/dd/yy.
df3 = (
    spark.read
    .format("csv")
    .schema(orders_schema)
    .options(header="true", dateFormat="MM/dd/yy")
    .load("hdfs://localhost:9000/user/tkm/data/order_wd.csv")
)

In [19]:
# Display the first 4 rows to check order_date after reading with the dateFormat option.
df3.show(n=4)

+--------+----------+-----------+---------------+
|order_id|order_date|customer_id|   order_status|
+--------+----------+-----------+---------------+
|       1|2013-07-25|      11599|         CLOSED|
|       2|2013-07-25|        256|PENDING_PAYMENT|
|       3|2013-07-25|      12111|       COMPLETE|
|       4|2013-07-25|       8827|         CLOSED|
+--------+----------+-----------+---------------+
only showing top 4 rows



In [22]:
# Print df3's schema to check the type of order_date after reading with dateFormat.
df3.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: date (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



###### Recommended approach: first read the column as a string, then convert it to the appropriate type with withColumn

In [20]:
# NOTE: this rebinds orders_schema. The columns are the same as in the earlier
# definition, but order_date is now declared as a string instead of a date.
orders_schema = 'order_id long , order_date string , customer_id integer , order_status string'

In [21]:
# Read the CSV using the current orders_schema; the date conversion is done in a later cell.
df4 = (
    spark.read
    .format("csv")
    .options(header="true")
    .schema(orders_schema)
    .load("hdfs://localhost:9000/user/tkm/data/order_wd.csv")
)

In [23]:
# Print df4's schema to check the type order_date was loaded with.
df4.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [24]:
# Display the first 4 rows of df4, before any date conversion is applied.
df4.show(n=4)

+--------+----------+-----------+---------------+
|order_id|order_date|customer_id|   order_status|
+--------+----------+-----------+---------------+
|       1|  07/25/13|      11599|         CLOSED|
|       2|  07/25/13|        256|PENDING_PAYMENT|
|       3|  07/25/13|      12111|       COMPLETE|
|       4|  07/25/13|       8827|         CLOSED|
+--------+----------+-----------+---------------+
only showing top 4 rows



In [25]:
from pyspark.sql.functions import to_date

In [30]:
# Replace order_date with the result of parsing its MM/dd/yy strings via to_date.
df5 = df4.withColumn(
    "order_date",
    to_date("order_date", format="MM/dd/yy"),
)

In [31]:
# Display the first 4 rows to check the converted order_date values.
df5.show(n=4)

+--------+----------+-----------+---------------+
|order_id|order_date|customer_id|   order_status|
+--------+----------+-----------+---------------+
|       1|2013-07-25|      11599|         CLOSED|
|       2|2013-07-25|        256|PENDING_PAYMENT|
|       3|2013-07-25|      12111|       COMPLETE|
|       4|2013-07-25|       8827|         CLOSED|
+--------+----------+-----------+---------------+
only showing top 4 rows



In [32]:
# Print df5's schema to check the type of order_date after the to_date conversion.
df5.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: date (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [33]:
# Keep the original order_date column and add the to_date result as a new column, order_date_new.
df6 = df4.withColumn(
    "order_date_new",
    to_date("order_date", format="MM/dd/yy"),
)

In [35]:
# Display the first 5 rows to compare order_date with the new order_date_new column.
df6.show(n=5)

+--------+----------+-----------+---------------+--------------+
|order_id|order_date|customer_id|   order_status|order_date_new|
+--------+----------+-----------+---------------+--------------+
|       1|  07/25/13|      11599|         CLOSED|    2013-07-25|
|       2|  07/25/13|        256|PENDING_PAYMENT|    2013-07-25|
|       3|  07/25/13|      12111|       COMPLETE|    2013-07-25|
|       4|  07/25/13|       8827|         CLOSED|    2013-07-25|
|       5|  07/25/13|      11318|       COMPLETE|    2013-07-25|
+--------+----------+-----------+---------------+--------------+
only showing top 5 rows



In [36]:
# Print df6's schema to check the types of order_date and the added order_date_new column.
df6.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_date_new: date (nullable = true)



In [38]:
# Stop the Spark session now that the notebook is finished with it.
spark.stop()