In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# Build (or reuse) a local, Hive-enabled Spark session for this notebook
# and keep a handle on its underlying SparkContext.
spark = (
    SparkSession.builder
    .master('local')
    .appName('operDataFrames')
    .enableHiveSupport()
    .getOrCreate()
)
sc = spark.sparkContext

In [3]:
# Every retail_db file is read the same way: the first row holds the column
# names and the column types are inferred from the data.
csv_options = dict(header=True, inferSchema=True)

orders = spark.read.csv('data//retail_db//orders.csv', **csv_options)
order_items = spark.read.csv('data//retail_db//order_items.csv', **csv_options)
customers = spark.read.csv('data//retail_db//customers.csv', **csv_options)
products = spark.read.csv('data//retail_db//products.csv', **csv_options)

# Built-in functions 

### Most of these functions come from `pyspark.sql.functions`; the rest are built-in methods on DataFrame columns

In [9]:
# Alias the functions module so its built-ins can be referenced as F.<name>.
import pyspark.sql.functions as F
# Print the module's built-in help text (a long listing of everything it exposes).
help(F)

Help on module pyspark.sql.functions in pyspark.sql:

NAME
    pyspark.sql.functions - A collections of builtin functions

CLASSES
    builtins.object
        builtins.str
        PandasUDFType
    
    class PandasUDFType(builtins.object)
     |  Pandas UDF Types. See :meth:`pyspark.sql.functions.pandas_udf`.
     |  
     |  Data descriptors defined here:
     |  
     |  __dict__
     |      dictionary for instance variables (if defined)
     |  
     |  __weakref__
     |      list of weak references to the object (if defined)
     |  
     |  ----------------------------------------------------------------------
     |  Data and other attributes defined here:
     |  
     |  GROUPED_AGG = 202
     |  
     |  GROUPED_MAP = 201
     |  
     |  SCALAR = 200
    
    basestring = class str(object)
     |  str(object='') -> str
     |  str(bytes_or_buffer[, encoding[, errors]]) -> str
     |  
     |  Create a new string object from the given object. If encoding or
     |  errors is

### Prefer the full notation (e.g. `orders.order_date`) when referring to column names

## substring

In [7]:
# Slice the order date text: characters 1-7 give "yyyy-MM", and the two
# characters starting at position 9 give the day (positions are 1-based).
month_part = F.substring(orders.order_date, 1, 7).alias('Month')
day_part = F.substring(orders.order_date, 9, 2).alias('day')
orders.select(month_part, day_part).show(5)

+-------+---+
|  Month|day|
+-------+---+
|2013-07| 25|
|2013-07| 25|
|2013-07| 25|
|2013-07| 25|
|2013-07| 25|
+-------+---+
only showing top 5 rows



In [10]:
# Column.substr(start, length) is the method form of substring().
order_date_col = orders['order_date']
orders.select(
    order_date_col.substr(1, 7).alias('Month'),
    order_date_col.substr(9, 2).alias('Day'),
).show(5)

+-------+---+
|  Month|Day|
+-------+---+
|2013-07| 25|
|2013-07| 25|
|2013-07| 25|
|2013-07| 25|
|2013-07| 25|
+-------+---+
only showing top 5 rows



## substring_index

In [12]:
# Keep everything that precedes the second '-' delimiter in the order date.
up_to_second_dash = F.substring_index(orders.order_date, '-', 2)
orders.select(up_to_second_dash).show(5)

+---------------------------------+
|substring_index(order_date, -, 2)|
+---------------------------------+
|                          2013-07|
|                          2013-07|
|                          2013-07|
|                          2013-07|
|                          2013-07|
+---------------------------------+
only showing top 5 rows



## instr

In [15]:
# 1-based position of the first occurrence of '07' within the order date text.
position_of_07 = F.instr(orders.order_date, '07')
orders.select(position_of_07).show(5)

+---------------------+
|instr(order_date, 07)|
+---------------------+
|                    6|
|                    6|
|                    6|
|                    6|
|                    6|
+---------------------+
only showing top 5 rows



## split

In [13]:
# Split the order date on '-' and take the first piece (the year).
date_pieces = F.split(orders.order_date, '-')
orders.select(date_pieces.getItem(0).alias('year')).show(5)

+----+
|year|
+----+
|2013|
|2013|
|2013|
|2013|
|2013|
+----+
only showing top 5 rows



## concat

In [14]:
# Join status and id into one string; the separator must be wrapped in lit()
# so it is treated as a literal value rather than a column name.
status_and_id = F.concat(orders.order_status, F.lit(','), orders.order_id)
orders.select(status_and_id).show(5)

+---------------------------------+
|concat(order_status, ,, order_id)|
+---------------------------------+
|                         CLOSED,1|
|                PENDING_PAYMENT,2|
|                       COMPLETE,3|
|                         CLOSED,4|
|                       COMPLETE,5|
+---------------------------------+
only showing top 5 rows



## concat_ws() 
### Very useful for concatenating columns with a field delimiter

In [18]:
# concat_ws takes the delimiter first, then the columns to join with it.
delimited_row = F.concat_ws(
    '!',
    orders.order_status,
    orders.order_id,
    orders.order_customer_id,
)
orders.select(delimited_row).show(5)

+-------------------------------------------------------+
|concat_ws(!, order_status, order_id, order_customer_id)|
+-------------------------------------------------------+
|                                         CLOSED!1!11599|
|                                   PENDING_PAYMENT!2...|
|                                       COMPLETE!3!12111|
|                                          CLOSED!4!8827|
|                                       COMPLETE!5!11318|
+-------------------------------------------------------+
only showing top 5 rows



## reverse

In [20]:
# Reverse the characters of each order status.
status_backwards = F.reverse(orders.order_status)
orders.select(status_backwards).show(5)

+---------------------+
|reverse(order_status)|
+---------------------+
|               DESOLC|
|      TNEMYAP_GNIDNEP|
|             ETELPMOC|
|               DESOLC|
|             ETELPMOC|
+---------------------+
only showing top 5 rows



## length

In [21]:
# Number of characters in each order id when rendered as text.
id_length = F.length(orders.order_id)
orders.select(id_length).show(5)

+----------------+
|length(order_id)|
+----------------+
|               1|
|               1|
|               1|
|               1|
|               1|
+----------------+
only showing top 5 rows



## when

In [22]:
# Label each order id as EVEN or ODD (the equivalent of a SQL CASE WHEN).
is_even = (orders['order_id'] % 2) == 0
parity = F.when(is_even, 'EVEN').otherwise('ODD')
orders.select('order_id', parity).show(5)

+--------+-----------------------------------------------------+
|order_id|CASE WHEN ((order_id % 2) = 0) THEN EVEN ELSE ODD END|
+--------+-----------------------------------------------------+
|       1|                                                  ODD|
|       2|                                                 EVEN|
|       3|                                                  ODD|
|       4|                                                 EVEN|
|       5|                                                  ODD|
+--------+-----------------------------------------------------+
only showing top 5 rows



## repeat

In [23]:
# Repeat the text of each order id ten times.
id_times_ten = F.repeat(orders.order_id, 10)
orders.select(id_times_ten).show(5)

+--------------------+
|repeat(order_id, 10)|
+--------------------+
|          1111111111|
|          2222222222|
|          3333333333|
|          4444444444|
|          5555555555|
+--------------------+
only showing top 5 rows



## rpad

In [24]:
# Pad each order id on the right with '*' up to a total width of 15.
right_padded_id = F.rpad(orders.order_id, 15, '*')
orders.select(right_padded_id).show(5)

+---------------------+
|rpad(order_id, 15, *)|
+---------------------+
|      1**************|
|      2**************|
|      3**************|
|      4**************|
|      5**************|
+---------------------+
only showing top 5 rows



## lpad

In [26]:
# Pad each order id on the left with '*' up to a total width of 15.
left_padded_id = F.lpad(orders.order_id, 15, '*')
orders.select(left_padded_id).show(5)

+---------------------+
|lpad(order_id, 15, *)|
+---------------------+
|      **************1|
|      **************2|
|      **************3|
|      **************4|
|      **************5|
+---------------------+
only showing top 5 rows



## lower

In [27]:
# Lower-case every order status.
status_lowercase = F.lower(orders['order_status'])
orders.select(status_lowercase).show(5)

+-------------------+
|lower(order_status)|
+-------------------+
|             closed|
|    pending_payment|
|           complete|
|             closed|
|           complete|
+-------------------+
only showing top 5 rows



## upper

In [28]:
# Upper-case every order status.
status_uppercase = F.upper(orders['order_status'])
orders.select(status_uppercase).show(5)

+-------------------+
|upper(order_status)|
+-------------------+
|             CLOSED|
|    PENDING_PAYMENT|
|           COMPLETE|
|             CLOSED|
|           COMPLETE|
+-------------------+
only showing top 5 rows



## initcap

In [29]:
# Capitalise the first letter of each word; '_' does not start a new word,
# so PENDING_PAYMENT becomes Pending_payment.
status_capitalised = F.initcap(orders.order_status)
orders.select(status_capitalised).show(5)

+---------------------+
|initcap(order_status)|
+---------------------+
|               Closed|
|      Pending_payment|
|             Complete|
|               Closed|
|             Complete|
+---------------------+
only showing top 5 rows



## lit

In [31]:
# lit() turns a Python constant into a column holding that value on every row.
constant_zero = F.lit(0)
constant_text = F.lit('Anything')
orders.select(constant_zero, constant_text).show(5)

+---+--------+
|  0|Anything|
+---+--------+
|  0|Anything|
|  0|Anything|
|  0|Anything|
|  0|Anything|
|  0|Anything|
+---+--------+
only showing top 5 rows



## like

In [33]:
# SQL LIKE: '%' matches any run of characters, so this keeps ids starting with 44.
id_begins_with_44 = orders['order_id'].like('44%')
orders.filter(id_begins_with_44).show(5)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|      44|2013-07-25 00:00:00|            10500|        PENDING|
|     440|2013-07-27 00:00:00|             7290|       COMPLETE|
|     441|2013-07-27 00:00:00|             5239|PENDING_PAYMENT|
|     442|2013-07-27 00:00:00|             8098|       COMPLETE|
|     443|2013-07-27 00:00:00|             8499|         CLOSED|
+--------+-------------------+-----------------+---------------+
only showing top 5 rows



In [39]:
# Leading '%' wildcard: keep ids that end with 44.
id_finishes_with_44 = orders['order_id'].like('%44')
orders.filter(id_finishes_with_44).show(5)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|      44|2013-07-25 00:00:00|            10500|        PENDING|
|     144|2013-07-26 00:00:00|             2158|     PROCESSING|
|     244|2013-07-26 00:00:00|             6910|PENDING_PAYMENT|
|     344|2013-07-26 00:00:00|             2816|        PENDING|
|     444|2013-07-27 00:00:00|            10004|       COMPLETE|
+--------+-------------------+-----------------+---------------+
only showing top 5 rows



In [40]:
# Each '_' matches exactly one character: keep five-character ids ending in 4.
five_chars_ending_in_4 = orders['order_id'].like('____4')
orders.filter(five_chars_ending_in_4).show(5)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|   10004|2013-09-25 00:00:00|             7768|         CLOSED|
|   10014|2013-09-25 00:00:00|            10864|SUSPECTED_FRAUD|
|   10024|2013-09-25 00:00:00|             9678|PENDING_PAYMENT|
|   10034|2013-09-25 00:00:00|             4554|         CLOSED|
|   10044|2013-09-25 00:00:00|             2333|        PENDING|
+--------+-------------------+-----------------+---------------+
only showing top 5 rows



## contains

In [35]:
# Keep ids whose text has '444' anywhere inside it.
id_has_444 = orders['order_id'].contains('444')
orders.where(id_has_444).show(5)

+--------+-------------------+-----------------+------------+
|order_id|         order_date|order_customer_id|order_status|
+--------+-------------------+-----------------+------------+
|     444|2013-07-27 00:00:00|            10004|    COMPLETE|
|    1444|2013-08-01 00:00:00|             8302|    COMPLETE|
|    2444|2013-08-07 00:00:00|             9714|  PROCESSING|
|    3444|2013-08-14 00:00:00|             8549|    COMPLETE|
|    4440|2013-08-20 00:00:00|             5434|    COMPLETE|
+--------+-------------------+-----------------+------------+
only showing top 5 rows



## endswith

In [37]:
# Keep ids whose text finishes with '444'.
id_ends_444 = orders['order_id'].endswith('444')
orders.where(id_ends_444).show(5)

+--------+-------------------+-----------------+------------+
|order_id|         order_date|order_customer_id|order_status|
+--------+-------------------+-----------------+------------+
|     444|2013-07-27 00:00:00|            10004|    COMPLETE|
|    1444|2013-08-01 00:00:00|             8302|    COMPLETE|
|    2444|2013-08-07 00:00:00|             9714|  PROCESSING|
|    3444|2013-08-14 00:00:00|             8549|    COMPLETE|
|    4444|2013-08-20 00:00:00|            11386|    COMPLETE|
+--------+-------------------+-----------------+------------+
only showing top 5 rows



## startswith

In [38]:
# Keep ids whose text begins with '444'.
id_starts_444 = orders['order_id'].startswith('444')
orders.where(id_starts_444).show(5)

+--------+-------------------+-----------------+------------+
|order_id|         order_date|order_customer_id|order_status|
+--------+-------------------+-----------------+------------+
|     444|2013-07-27 00:00:00|            10004|    COMPLETE|
|    4440|2013-08-20 00:00:00|             5434|    COMPLETE|
|    4441|2013-08-20 00:00:00|            10735|      CLOSED|
|    4442|2013-08-20 00:00:00|             6820|  PROCESSING|
|    4443|2013-08-20 00:00:00|             3817|     ON_HOLD|
+--------+-------------------+-----------------+------------+
only showing top 5 rows



## isNotNull

In [41]:
# Keep only the rows that have an order id.
id_present = orders['order_id'].isNotNull()
orders.where(id_present).show(5)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
+--------+-------------------+-----------------+---------------+
only showing top 5 rows



## isNull

In [42]:
# Keep only the rows whose order id is missing.
id_missing = orders['order_id'].isNull()
orders.where(id_missing).show(5)

+--------+----------+-----------------+------------+
|order_id|order_date|order_customer_id|order_status|
+--------+----------+-----------------+------------+
+--------+----------+-----------------+------------+



## isin

In [43]:
# order_id holds numeric ids (the schema is inferred when the CSV is loaded),
# so the membership list uses integer literals throughout. Mixing string and
# int literals makes Spark coerce the whole comparison to a common type.
wanted_order_ids = [1, 2, 3333, 798]
orders.filter(orders.order_id.isin(wanted_order_ids)).show(5)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|     798|2013-07-29 00:00:00|             8709|       COMPLETE|
|    3333|2013-08-12 00:00:00|            10717|         CLOSED|
+--------+-------------------+-----------------+---------------+

