In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
spark = SparkSession.builder.master('local').appName('operDataFrames').enableHiveSupport().getOrCreate()

In [3]:
sc=spark.sparkContext

In [4]:
spark

In [49]:
orders = spark.read.csv('data//retail_db//orders.csv',header=True,inferSchema=True)
order_items = spark.read.csv('data//retail_db//order_items.csv',header=True,inferSchema=True)
customers = spark.read.csv('data//retail_db//customers.csv',header=True,inferSchema=True)
products = spark.read.csv('data//retail_db//products.csv',header=True,inferSchema=True)

# select

In [6]:
orders.select(orders.order_status,'order_status',col('order_status')).show()

+---------------+---------------+---------------+
|   order_status|   order_status|   order_status|
+---------------+---------------+---------------+
|         CLOSED|         CLOSED|         CLOSED|
|PENDING_PAYMENT|PENDING_PAYMENT|PENDING_PAYMENT|
|       COMPLETE|       COMPLETE|       COMPLETE|
|         CLOSED|         CLOSED|         CLOSED|
|       COMPLETE|       COMPLETE|       COMPLETE|
|       COMPLETE|       COMPLETE|       COMPLETE|
|       COMPLETE|       COMPLETE|       COMPLETE|
|     PROCESSING|     PROCESSING|     PROCESSING|
|PENDING_PAYMENT|PENDING_PAYMENT|PENDING_PAYMENT|
|PENDING_PAYMENT|PENDING_PAYMENT|PENDING_PAYMENT|
| PAYMENT_REVIEW| PAYMENT_REVIEW| PAYMENT_REVIEW|
|         CLOSED|         CLOSED|         CLOSED|
|PENDING_PAYMENT|PENDING_PAYMENT|PENDING_PAYMENT|
|     PROCESSING|     PROCESSING|     PROCESSING|
|       COMPLETE|       COMPLETE|       COMPLETE|
|PENDING_PAYMENT|PENDING_PAYMENT|PENDING_PAYMENT|
|       COMPLETE|       COMPLETE|       COMPLETE|


#### But while using functions it is applicable to specify the full notation, i.e., dataframe.column_name or using col as below

In [7]:
orders.select(lower(orders.order_status),lower(col('order_status'))).show()
#Note: orders.select(lower('order_status')).show() will not work

+-------------------+-------------------+
|lower(order_status)|lower(order_status)|
+-------------------+-------------------+
|             closed|             closed|
|    pending_payment|    pending_payment|
|           complete|           complete|
|             closed|             closed|
|           complete|           complete|
|           complete|           complete|
|           complete|           complete|
|         processing|         processing|
|    pending_payment|    pending_payment|
|    pending_payment|    pending_payment|
|     payment_review|     payment_review|
|             closed|             closed|
|    pending_payment|    pending_payment|
|         processing|         processing|
|           complete|           complete|
|    pending_payment|    pending_payment|
|           complete|           complete|
|             closed|             closed|
|    pending_payment|    pending_payment|
|         processing|         processing|
+-------------------+-------------

# alias
#### alias should be enclosed within select 

### Note : Please do not give alias the same name as built-in functions 

In [8]:
orders.select(orders.order_status.alias('Status_alias')).show()
#Note: orders.select('order_status').alias('Status_alias').show()  will not work

+---------------+
|   Status_alias|
+---------------+
|         CLOSED|
|PENDING_PAYMENT|
|       COMPLETE|
|         CLOSED|
|       COMPLETE|
|       COMPLETE|
|       COMPLETE|
|     PROCESSING|
|PENDING_PAYMENT|
|PENDING_PAYMENT|
| PAYMENT_REVIEW|
|         CLOSED|
|PENDING_PAYMENT|
|     PROCESSING|
|       COMPLETE|
|PENDING_PAYMENT|
|       COMPLETE|
|         CLOSED|
|PENDING_PAYMENT|
|     PROCESSING|
+---------------+
only showing top 20 rows



In [9]:
orders.select(orders.order_status.alias('Status_alias')).distinct().show()
#Note: orders.select('order_status').distinct().alias('Status_alias').show() will not work

+---------------+
|   Status_alias|
+---------------+
|PENDING_PAYMENT|
|       COMPLETE|
|        ON_HOLD|
| PAYMENT_REVIEW|
|     PROCESSING|
|         CLOSED|
|SUSPECTED_FRAUD|
|        PENDING|
|       CANCELED|
+---------------+



# withcolumn
#### If already existing column name is given, then the new column expression will replace the existing column. (It will not add another column)


In [10]:
orders = orders. \
withColumn('order_id',orders.order_id.cast('bigint')). \
withColumn('order_date',orders.order_date.cast('date')). \
withColumn('order_customer_id',orders.order_customer_id.cast('bigint')). \
withColumn('order_status',orders.order_status.cast('string')) 


#### Given below are the cast notations from pyspark.sql.types

In [None]:
BinaryType: binary
BooleanType: boolean
ByteType: tinyint
DateType: date
DecimalType: decimal(10,0)
DoubleType: double
FloatType: float
IntegerType: int
LongType: bigint
ShortType: smallint
StringType: string
TimestampType: timestamp

# selectExpr

#### SQL like expressions can be used for evaluation

In [11]:
orders.selectExpr('order_id||","||order_date||","||order_customer_id||","||order_status||"," as textdata ').show()

+--------------------+
|            textdata|
+--------------------+
|1,2013-07-25,1159...|
|2,2013-07-25,256,...|
|3,2013-07-25,1211...|
|4,2013-07-25,8827...|
|5,2013-07-25,1131...|
|6,2013-07-25,7130...|
|7,2013-07-25,4530...|
|8,2013-07-25,2911...|
|9,2013-07-25,5657...|
|10,2013-07-25,564...|
|11,2013-07-25,918...|
|12,2013-07-25,183...|
|13,2013-07-25,914...|
|14,2013-07-25,984...|
|15,2013-07-25,256...|
|16,2013-07-25,727...|
|17,2013-07-25,266...|
|18,2013-07-25,120...|
|19,2013-07-25,948...|
|20,2013-07-25,919...|
+--------------------+
only showing top 20 rows



# case
#### CASE as in SQL can be used

In [16]:
orders.selectExpr('CASE WHEN order_status in ("COMPLETE","CLOSED") THEN "COMPLETELD" WHEN order_status = "CANCELED" THEN "CANCEL" ELSE "NONE" END Derived_status').show()

+--------------+
|Derived_status|
+--------------+
|    COMPLETELD|
|          NONE|
|    COMPLETELD|
|    COMPLETELD|
|    COMPLETELD|
|    COMPLETELD|
|    COMPLETELD|
|          NONE|
|          NONE|
|          NONE|
|          NONE|
|    COMPLETELD|
|          NONE|
|          NONE|
|    COMPLETELD|
|          NONE|
|    COMPLETELD|
|    COMPLETELD|
|          NONE|
|          NONE|
+--------------+
only showing top 20 rows



# when
#### Dataframe equivalent of case 

In [18]:
orders.withColumn('Derived_Status',when(orders.order_status.isin('COMPLETED','CLOSED'),'OVER'). \
                  when(orders.order_status.isin('CANCELED'),'PENDING').otherwise('NONE')).show()

+--------+----------+-----------------+---------------+--------------+
|order_id|order_date|order_customer_id|   order_status|Derived_Status|
+--------+----------+-----------------+---------------+--------------+
|       1|2013-07-25|            11599|         CLOSED|          OVER|
|       2|2013-07-25|              256|PENDING_PAYMENT|          NONE|
|       3|2013-07-25|            12111|       COMPLETE|          NONE|
|       4|2013-07-25|             8827|         CLOSED|          OVER|
|       5|2013-07-25|            11318|       COMPLETE|          NONE|
|       6|2013-07-25|             7130|       COMPLETE|          NONE|
|       7|2013-07-25|             4530|       COMPLETE|          NONE|
|       8|2013-07-25|             2911|     PROCESSING|          NONE|
|       9|2013-07-25|             5657|PENDING_PAYMENT|          NONE|
|      10|2013-07-25|             5648|PENDING_PAYMENT|          NONE|
|      11|2013-07-25|              918| PAYMENT_REVIEW|          NONE|
|     

# withColumnRenamed
#### columns can be renamed with this API, where the first argument is the existing column name and the second argument is the new name.

In [20]:
orders.withColumnRenamed('order_status','status_of_order').show()

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|status_of_order|
+--------+----------+-----------------+---------------+
|       1|2013-07-25|            11599|         CLOSED|
|       2|2013-07-25|              256|PENDING_PAYMENT|
|       3|2013-07-25|            12111|       COMPLETE|
|       4|2013-07-25|             8827|         CLOSED|
|       5|2013-07-25|            11318|       COMPLETE|
|       6|2013-07-25|             7130|       COMPLETE|
|       7|2013-07-25|             4530|       COMPLETE|
|       8|2013-07-25|             2911|     PROCESSING|
|       9|2013-07-25|             5657|PENDING_PAYMENT|
|      10|2013-07-25|             5648|PENDING_PAYMENT|
|      11|2013-07-25|              918| PAYMENT_REVIEW|
|      12|2013-07-25|             1837|         CLOSED|
|      13|2013-07-25|             9149|PENDING_PAYMENT|
|      14|2013-07-25|             9842|     PROCESSING|
|      15|2013-07-25|             2568|       CO

# filter

#### filter as in SQL format

In [21]:
orders.filter("order_status = 'COMPLETE'").show()
#Note: Single '=' as in SQL format

+--------+----------+-----------------+------------+
|order_id|order_date|order_customer_id|order_status|
+--------+----------+-----------------+------------+
|       3|2013-07-25|            12111|    COMPLETE|
|       5|2013-07-25|            11318|    COMPLETE|
|       6|2013-07-25|             7130|    COMPLETE|
|       7|2013-07-25|             4530|    COMPLETE|
|      15|2013-07-25|             2568|    COMPLETE|
|      17|2013-07-25|             2667|    COMPLETE|
|      22|2013-07-25|              333|    COMPLETE|
|      26|2013-07-25|             7562|    COMPLETE|
|      28|2013-07-25|              656|    COMPLETE|
|      32|2013-07-25|             3960|    COMPLETE|
|      35|2013-07-25|             4840|    COMPLETE|
|      45|2013-07-25|             2636|    COMPLETE|
|      56|2013-07-25|            10519|    COMPLETE|
|      63|2013-07-25|             1148|    COMPLETE|
|      65|2013-07-25|             5903|    COMPLETE|
|      67|2013-07-25|             1406|    COM

#### filter as in Dataframe format

In [22]:
orders.filter(orders.order_status == 'COMPLETE').show()
#Note: Equality operator '==' for dataframe filter expression 

+--------+----------+-----------------+------------+
|order_id|order_date|order_customer_id|order_status|
+--------+----------+-----------------+------------+
|       3|2013-07-25|            12111|    COMPLETE|
|       5|2013-07-25|            11318|    COMPLETE|
|       6|2013-07-25|             7130|    COMPLETE|
|       7|2013-07-25|             4530|    COMPLETE|
|      15|2013-07-25|             2568|    COMPLETE|
|      17|2013-07-25|             2667|    COMPLETE|
|      22|2013-07-25|              333|    COMPLETE|
|      26|2013-07-25|             7562|    COMPLETE|
|      28|2013-07-25|              656|    COMPLETE|
|      32|2013-07-25|             3960|    COMPLETE|
|      35|2013-07-25|             4840|    COMPLETE|
|      45|2013-07-25|             2636|    COMPLETE|
|      56|2013-07-25|            10519|    COMPLETE|
|      63|2013-07-25|             1148|    COMPLETE|
|      65|2013-07-25|             5903|    COMPLETE|
|      67|2013-07-25|             1406|    COM

#### multiple filters in SQL format filter

In [23]:
orders.filter("order_status = 'COMPLETE' OR order_status = 'CLOSED'").show()

+--------+----------+-----------------+------------+
|order_id|order_date|order_customer_id|order_status|
+--------+----------+-----------------+------------+
|       1|2013-07-25|            11599|      CLOSED|
|       3|2013-07-25|            12111|    COMPLETE|
|       4|2013-07-25|             8827|      CLOSED|
|       5|2013-07-25|            11318|    COMPLETE|
|       6|2013-07-25|             7130|    COMPLETE|
|       7|2013-07-25|             4530|    COMPLETE|
|      12|2013-07-25|             1837|      CLOSED|
|      15|2013-07-25|             2568|    COMPLETE|
|      17|2013-07-25|             2667|    COMPLETE|
|      18|2013-07-25|             1205|      CLOSED|
|      22|2013-07-25|              333|    COMPLETE|
|      24|2013-07-25|            11441|      CLOSED|
|      25|2013-07-25|             9503|      CLOSED|
|      26|2013-07-25|             7562|    COMPLETE|
|      28|2013-07-25|              656|    COMPLETE|
|      32|2013-07-25|             3960|    COM

#### multiple filters in DataFrame format

In [24]:
orders.filter((orders.order_status == 'COMPLETE').__or__(orders.order_status == 'CLOSED')).show()

+--------+----------+-----------------+------------+
|order_id|order_date|order_customer_id|order_status|
+--------+----------+-----------------+------------+
|       1|2013-07-25|            11599|      CLOSED|
|       3|2013-07-25|            12111|    COMPLETE|
|       4|2013-07-25|             8827|      CLOSED|
|       5|2013-07-25|            11318|    COMPLETE|
|       6|2013-07-25|             7130|    COMPLETE|
|       7|2013-07-25|             4530|    COMPLETE|
|      12|2013-07-25|             1837|      CLOSED|
|      15|2013-07-25|             2568|    COMPLETE|
|      17|2013-07-25|             2667|    COMPLETE|
|      18|2013-07-25|             1205|      CLOSED|
|      22|2013-07-25|              333|    COMPLETE|
|      24|2013-07-25|            11441|      CLOSED|
|      25|2013-07-25|             9503|      CLOSED|
|      26|2013-07-25|             7562|    COMPLETE|
|      28|2013-07-25|              656|    COMPLETE|
|      32|2013-07-25|             3960|    COM

###### Examples of few multiple filters

In [25]:
orders.filter("order_status in ('COMPLETE' ,'CLOSED') AND date_format(order_date,'yyyyMM') = '201308'").show()

+--------+----------+-----------------+------------+
|order_id|order_date|order_customer_id|order_status|
+--------+----------+-----------------+------------+
|    1297|2013-08-01|            11607|    COMPLETE|
|    1298|2013-08-01|             5105|      CLOSED|
|    1299|2013-08-01|             7802|    COMPLETE|
|    1302|2013-08-01|             1695|    COMPLETE|
|    1304|2013-08-01|             2059|    COMPLETE|
|    1305|2013-08-01|             3844|    COMPLETE|
|    1307|2013-08-01|             4474|    COMPLETE|
|    1309|2013-08-01|             2367|      CLOSED|
|    1312|2013-08-01|            12291|    COMPLETE|
|    1314|2013-08-01|            10993|    COMPLETE|
|    1315|2013-08-01|             5660|    COMPLETE|
|    1318|2013-08-01|             4212|    COMPLETE|
|    1319|2013-08-01|             3966|    COMPLETE|
|    1320|2013-08-01|            12270|    COMPLETE|
|    1321|2013-08-01|              800|    COMPLETE|
|    1322|2013-08-01|             9264|    COM

In [26]:
orders.filter((orders.order_status.isin('CLOSED','COMPLETE')).__and__(date_format(orders.order_date,'YYYYMM')=='201308')).show()

+--------+----------+-----------------+------------+
|order_id|order_date|order_customer_id|order_status|
+--------+----------+-----------------+------------+
|    1297|2013-08-01|            11607|    COMPLETE|
|    1298|2013-08-01|             5105|      CLOSED|
|    1299|2013-08-01|             7802|    COMPLETE|
|    1302|2013-08-01|             1695|    COMPLETE|
|    1304|2013-08-01|             2059|    COMPLETE|
|    1305|2013-08-01|             3844|    COMPLETE|
|    1307|2013-08-01|             4474|    COMPLETE|
|    1309|2013-08-01|             2367|      CLOSED|
|    1312|2013-08-01|            12291|    COMPLETE|
|    1314|2013-08-01|            10993|    COMPLETE|
|    1315|2013-08-01|             5660|    COMPLETE|
|    1318|2013-08-01|             4212|    COMPLETE|
|    1319|2013-08-01|             3966|    COMPLETE|
|    1320|2013-08-01|            12270|    COMPLETE|
|    1321|2013-08-01|              800|    COMPLETE|
|    1322|2013-08-01|             9264|    COM

In [27]:
orders.filter('order_customer_id >= 1000').filter('order_customer_id != 1000').filter(orders.order_id.between(1000,1999)).show()

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|    1000|2013-07-30|             2321|         CLOSED|
|    1001|2013-07-30|             6650|       COMPLETE|
|    1002|2013-07-30|             5170|SUSPECTED_FRAUD|
|    1003|2013-07-30|             7438|       COMPLETE|
|    1004|2013-07-30|            10701|       COMPLETE|
|    1005|2013-07-30|             7958|     PROCESSING|
|    1006|2013-07-30|             9028|       COMPLETE|
|    1007|2013-07-30|             1417|PENDING_PAYMENT|
|    1008|2013-07-30|             4331|         CLOSED|
|    1009|2013-07-30|             7886|       COMPLETE|
|    1010|2013-07-30|             2512|       COMPLETE|
|    1011|2013-07-30|             7214|       COMPLETE|
|    1012|2013-07-30|             3245|         CLOSED|
|    1013|2013-07-30|             1903|       CANCELED|
|    1014|2013-07-30|            12077|     PROC

# join

#### simple inner join with one column mapping

In [28]:
orders.join(order_items, orders.order_id==order_items.order_item_order_id).show()

+--------+----------+-----------------+---------------+-------------+-------------------+---------------------+-------------------+-------------------+------------------------+
|order_id|order_date|order_customer_id|   order_status|order_item_id|order_item_order_id|order_item_product_id|order_item_quantity|order_item_subtotal|order_item_product_price|
+--------+----------+-----------------+---------------+-------------+-------------------+---------------------+-------------------+-------------------+------------------------+
|       1|2013-07-25|            11599|         CLOSED|            1|                  1|                  957|                  1|             299.98|                  299.98|
|       2|2013-07-25|              256|PENDING_PAYMENT|            2|                  2|                 1073|                  1|             199.99|                  199.99|
|       2|2013-07-25|              256|PENDING_PAYMENT|            3|                  2|                  502|    

#### inner join with multiple columns

In [29]:
orders.join(order_items, (orders.order_id==order_items.order_item_order_id) & (orders.order_id==order_items.order_item_order_id)).show()

+--------+----------+-----------------+---------------+-------------+-------------------+---------------------+-------------------+-------------------+------------------------+
|order_id|order_date|order_customer_id|   order_status|order_item_id|order_item_order_id|order_item_product_id|order_item_quantity|order_item_subtotal|order_item_product_price|
+--------+----------+-----------------+---------------+-------------+-------------------+---------------------+-------------------+-------------------+------------------------+
|       1|2013-07-25|            11599|         CLOSED|            1|                  1|                  957|                  1|             299.98|                  299.98|
|       2|2013-07-25|              256|PENDING_PAYMENT|            2|                  2|                 1073|                  1|             199.99|                  199.99|
|       2|2013-07-25|              256|PENDING_PAYMENT|            3|                  2|                  502|    

#### left join

In [33]:
customers.join(orders, customers.customer_id==orders.order_customer_id, 'left').show()

+-----------+--------------+--------------+--------------+-----------------+--------------------+-------------+--------------+----------------+--------+-------------------+-----------------+---------------+
|customer_id|customer_fname|customer_lname|customer_email|customer_password|     customer_street|customer_city|customer_state|customer_zipcode|order_id|         order_date|order_customer_id|   order_status|
+-----------+--------------+--------------+--------------+-----------------+--------------------+-------------+--------------+----------------+--------+-------------------+-----------------+---------------+
|          1|       Richard|     Hernandez|     XXXXXXXXX|        XXXXXXXXX|  6303 Heather Plaza|  Brownsville|            TX|           78521|   22945|2013-12-13 00:00:00|                1|       COMPLETE|
|          2|          Mary|       Barrett|     XXXXXXXXX|        XXXXXXXXX|9526 Noble Embers...|    Littleton|            CO|           80126|   67863|2013-11-30 00:00:00|

#### right join

In [34]:
customers.join(orders, customers.customer_id==orders.order_customer_id, 'right').show()

+-----------+--------------+--------------+--------------+-----------------+--------------------+-------------+--------------+----------------+--------+-------------------+-----------------+---------------+
|customer_id|customer_fname|customer_lname|customer_email|customer_password|     customer_street|customer_city|customer_state|customer_zipcode|order_id|         order_date|order_customer_id|   order_status|
+-----------+--------------+--------------+--------------+-----------------+--------------------+-------------+--------------+----------------+--------+-------------------+-----------------+---------------+
|      11599|          Mary|        Malone|     XXXXXXXXX|        XXXXXXXXX|8708 Indian Horse...|      Hickory|            NC|           28601|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|        256|         David|     Rodriguez|     XXXXXXXXX|        XXXXXXXXX|7605 Tawny Horse ...|      Chicago|            IL|           60625|       2|2013-07-25 00:00:00|

#### leftanti join - Join to fetch records which exists only in one table
In the below case it will fetch the records which are present only in the orders and not in order_items

In [36]:
customers.join(orders, orders.order_customer_id==customers.customer_id, 'leftanti').show()

+-----------+--------------+--------------+--------------+-----------------+--------------------+-------------------+--------------+----------------+
|customer_id|customer_fname|customer_lname|customer_email|customer_password|     customer_street|      customer_city|customer_state|customer_zipcode|
+-----------+--------------+--------------+--------------+-----------------+--------------------+-------------------+--------------+----------------+
|        219|          Mary|       Harrell|     XXXXXXXXX|        XXXXXXXXX|9016 Foggy Robin ...|             Denver|            CO|           80219|
|        339|          Mary|        Greene|     XXXXXXXXX|        XXXXXXXXX|     4271 Hazy Close|         Long Beach|            CA|           90805|
|        469|         Randy|         Smith|     XXXXXXXXX|        XXXXXXXXX|252 Golden Goose ...|South San Francisco|            CA|           94080|
|       1187|       Dorothy|       Vazquez|     XXXXXXXXX|        XXXXXXXXX| 363 Green Goose Run|   

#### crossJoin

In [37]:
orders.crossJoin(customers.filter(customers.customer_id == 1)).show()

+--------+-------------------+-----------------+---------------+-----------+--------------+--------------+--------------+-----------------+------------------+-------------+--------------+----------------+
|order_id|         order_date|order_customer_id|   order_status|customer_id|customer_fname|customer_lname|customer_email|customer_password|   customer_street|customer_city|customer_state|customer_zipcode|
+--------+-------------------+-----------------+---------------+-----------+--------------+--------------+--------------+-----------------+------------------+-------------+--------------+----------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|          1|       Richard|     Hernandez|     XXXXXXXXX|        XXXXXXXXX|6303 Heather Plaza|  Brownsville|            TX|           78521|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|          1|       Richard|     Hernandez|     XXXXXXXXX|        XXXXXXXXX|6303 Heather Plaza|  Brownsville|        

# distinct
#### distinct will come at the end after the select

In [38]:
orders.select('order_status').distinct().show()

+---------------+
|   order_status|
+---------------+
|PENDING_PAYMENT|
|       COMPLETE|
|        ON_HOLD|
| PAYMENT_REVIEW|
|     PROCESSING|
|         CLOSED|
|SUSPECTED_FRAUD|
|        PENDING|
|       CANCELED|
+---------------+



# countDinstinct
#### countDistinct will be handy 

In [40]:
orders.select(countDistinct('order_status')).show()

+----------------------------+
|count(DISTINCT order_status)|
+----------------------------+
|                           9|
+----------------------------+



# orderBy/sort
#### Note: sort is just an alias to orderBy

In [43]:
orders.orderBy(orders.order_date, orders.order_status.desc()).show()

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|      69|2013-07-25 00:00:00|             2821|SUSPECTED_FRAUD|
|   57770|2013-07-25 00:00:00|             7451|SUSPECTED_FRAUD|
|      20|2013-07-25 00:00:00|             9198|     PROCESSING|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|      84|2013-07-25 00:00:00|             6789|     PROCESSING|
|      34|2013-07-25 00:00:00|             4189|     PROCESSING|
|   57765|2013-07-25 00:00:00|             2876|     PROCESSING|
|      38|2013-07-25 00:00:00|            11586|     PROCESSING|
|      29|2013-07-25 00:00:00|              196|     PROCESSING|
|      81|2013-07-25 00:00:00|              674|     PROCESSING|
|      94|2013-07-25 00:00:00|            11589|     PROCESSING|
|     100|2013-07-25 00:00:00|            12131|     PROCESSING|
|     103|2013-07-25 00:0

Note: Only absolute column name notation or col() works while mentioning in descending order desc()

In [44]:
orders.orderBy('order_date',col('order_status').desc()).show()

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|      69|2013-07-25 00:00:00|             2821|SUSPECTED_FRAUD|
|   57770|2013-07-25 00:00:00|             7451|SUSPECTED_FRAUD|
|      20|2013-07-25 00:00:00|             9198|     PROCESSING|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|      84|2013-07-25 00:00:00|             6789|     PROCESSING|
|      34|2013-07-25 00:00:00|             4189|     PROCESSING|
|   57765|2013-07-25 00:00:00|             2876|     PROCESSING|
|      38|2013-07-25 00:00:00|            11586|     PROCESSING|
|      29|2013-07-25 00:00:00|              196|     PROCESSING|
|      81|2013-07-25 00:00:00|              674|     PROCESSING|
|      94|2013-07-25 00:00:00|            11589|     PROCESSING|
|     100|2013-07-25 00:00:00|            12131|     PROCESSING|
|     103|2013-07-25 00:0

# drop

drop() will only take just the column names and it will not take any other expressions for the column

In [50]:
order_items.join(products,order_items.order_item_product_id==products.product_id).drop('product_price','product_description','product_image').show()

+-------------+-------------------+---------------------+-------------------+-------------------+------------------------+----------+-------------------+--------------------+
|order_item_id|order_item_order_id|order_item_product_id|order_item_quantity|order_item_subtotal|order_item_product_price|product_id|product_category_id|        product_name|
+-------------+-------------------+---------------------+-------------------+-------------------+------------------------+----------+-------------------+--------------------+
|            1|                  1|                  957|                  1|             299.98|                  299.98|       957|                 43|Diamondback Women...|
|            2|                  2|                 1073|                  1|             199.99|                  199.99|      1073|                 48|Pelican Sunstream...|
|            3|                  2|                  502|                  5|              250.0|                    50.0|   

In [52]:
#order_items.join(products, order_item_product_id==products.product_id).drop(products.product_price, products.product_description, products.product_image).show()
# Specifying full name of the column will not work 