
* We typically process column data using functions from **pyspark.sql.functions**. Let us explore these functions in detail as part of this module.

In [0]:
# Load the retail orders CSV data, applying an explicit DDL-style schema.
orders = (
    spark.read
    .schema('order_id INT, order_date STRING, order_customer_id INT, order_status STRING')
    .csv('/public/retail_db/orders')
)

In [0]:
# Preview the first 20 rows; long values are truncated to 20 characters (the defaults).
orders.show(n=20, truncate=True)

+--------+--------------------+-----------------+---------------+
|order_id|          order_date|order_customer_id|   order_status|
+--------+--------------------+-----------------+---------------+
|       1|2013-07-25 00:00:...|            11599|         CLOSED|
|       2|2013-07-25 00:00:...|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|            12111|       COMPLETE|
|       4|2013-07-25 00:00:...|             8827|         CLOSED|
|       5|2013-07-25 00:00:...|            11318|       COMPLETE|
|       6|2013-07-25 00:00:...|             7130|       COMPLETE|
|       7|2013-07-25 00:00:...|             4530|       COMPLETE|
|       8|2013-07-25 00:00:...|             2911|     PROCESSING|
|       9|2013-07-25 00:00:...|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|             1837|         CLOSED|
|      13|

In [0]:
from pyspark.sql.functions import *

In [0]:
# Print the column names and types of `orders` to confirm the schema applied when reading.
orders.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [0]:
# Show the built-in documentation for date_format (signature, pattern notes and an example).
help(date_format)

Help on function date_format in module pyspark.sql.functions:

date_format(date: 'ColumnOrName', format: str) -> pyspark.sql.column.Column
    Converts a date/timestamp/string to a value of string in the format specified by the date
    format given by the second argument.
    
    A pattern could be for instance `dd.MM.yyyy` and could return a string like '18.03.1993'. All
    pattern letters of `datetime pattern`_. can be used.
    
    .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
    
    .. versionadded:: 1.5.0
    
    Notes
    -----
    Whenever possible, use specialized functions like `year`.
    
    Examples
    --------
    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
    >>> df.select(date_format('dt', 'MM/dd/yyy').alias('date')).collect()
    [Row(date='04/08/2015')]



In [0]:
# Keep every existing column and append `order_month` (yyyyMM) derived from order_date.
(
    orders
    .select(
        '*',
        date_format('order_date', 'yyyyMM').alias('order_month'),
    )
    .show()
)

+--------+--------------------+-----------------+---------------+-----------+
|order_id|          order_date|order_customer_id|   order_status|order_month|
+--------+--------------------+-----------------+---------------+-----------+
|       1|2013-07-25 00:00:...|            11599|         CLOSED|     201307|
|       2|2013-07-25 00:00:...|              256|PENDING_PAYMENT|     201307|
|       3|2013-07-25 00:00:...|            12111|       COMPLETE|     201307|
|       4|2013-07-25 00:00:...|             8827|         CLOSED|     201307|
|       5|2013-07-25 00:00:...|            11318|       COMPLETE|     201307|
|       6|2013-07-25 00:00:...|             7130|       COMPLETE|     201307|
|       7|2013-07-25 00:00:...|             4530|       COMPLETE|     201307|
|       8|2013-07-25 00:00:...|             2911|     PROCESSING|     201307|
|       9|2013-07-25 00:00:...|             5657|PENDING_PAYMENT|     201307|
|      10|2013-07-25 00:00:...|             5648|PENDING_PAYMENT

In [0]:
# Same result as select('*', ...): withColumn appends `order_month` (yyyyMM) to the frame.
(
    orders
    .withColumn(
        'order_month',
        date_format('order_date', 'yyyyMM'),
    )
    .show()
)

+--------+--------------------+-----------------+---------------+-----------+
|order_id|          order_date|order_customer_id|   order_status|order_month|
+--------+--------------------+-----------------+---------------+-----------+
|       1|2013-07-25 00:00:...|            11599|         CLOSED|     201307|
|       2|2013-07-25 00:00:...|              256|PENDING_PAYMENT|     201307|
|       3|2013-07-25 00:00:...|            12111|       COMPLETE|     201307|
|       4|2013-07-25 00:00:...|             8827|         CLOSED|     201307|
|       5|2013-07-25 00:00:...|            11318|       COMPLETE|     201307|
|       6|2013-07-25 00:00:...|             7130|       COMPLETE|     201307|
|       7|2013-07-25 00:00:...|             4530|       COMPLETE|     201307|
|       8|2013-07-25 00:00:...|             2911|     PROCESSING|     201307|
|       9|2013-07-25 00:00:...|             5657|PENDING_PAYMENT|     201307|
|      10|2013-07-25 00:00:...|             5648|PENDING_PAYMENT

In [0]:
# `filter` and `where` are aliases, so either name can be used here.
# date_format returns a string column, so compare against the string '201401'
# rather than the integer 201401, which would rely on an implicit cast.
orders.filter(date_format('order_date', 'yyyyMM') == '201401').show()

+--------+--------------------+-----------------+---------------+
|order_id|          order_date|order_customer_id|   order_status|
+--------+--------------------+-----------------+---------------+
|   25876|2014-01-01 00:00:...|             3414|PENDING_PAYMENT|
|   25877|2014-01-01 00:00:...|             5549|PENDING_PAYMENT|
|   25878|2014-01-01 00:00:...|             9084|        PENDING|
|   25879|2014-01-01 00:00:...|             5118|        PENDING|
|   25880|2014-01-01 00:00:...|            10146|       CANCELED|
|   25881|2014-01-01 00:00:...|             3205|PENDING_PAYMENT|
|   25882|2014-01-01 00:00:...|             4598|       COMPLETE|
|   25883|2014-01-01 00:00:...|            11764|        PENDING|
|   25884|2014-01-01 00:00:...|             7904|PENDING_PAYMENT|
|   25885|2014-01-01 00:00:...|             7253|        PENDING|
|   25886|2014-01-01 00:00:...|             8195|     PROCESSING|
|   25887|2014-01-01 00:00:...|            10062|        PENDING|
|   25888|

In [0]:
# Count orders per month: group on the yyyyMM string derived from order_date.
(
    orders
    .groupBy(date_format('order_date', 'yyyyMM').alias('order_month'))
    .count()
    .show()
)

+-----------+-----+
|order_month|count|
+-----------+-----+
|     201401| 5908|
|     201405| 5467|
|     201312| 5892|
|     201310| 5335|
|     201311| 6381|
|     201307| 1533|
|     201407| 4468|
|     201403| 5778|
|     201404| 5657|
|     201402| 5635|
|     201309| 5841|
|     201406| 5308|
|     201308| 5680|
+-----------+-----+

