In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [None]:
# Build (or reuse) a local-mode SparkSession named 'operDataFrames'
# with Hive support enabled.
spark = (
    SparkSession.builder
    .master('local')
    .appName('operDataFrames')
    .enableHiveSupport()
    .getOrCreate()
)

In [None]:
# Keep a handle on the SparkContext that backs the session.
sc = spark.sparkContext

In [None]:
# Bare expression: as the last expression of a notebook cell, the session
# object is rendered as the cell's output.
spark

In [None]:
# Load the four retail_db CSV files. The first row of each file is the
# header, and column types are inferred from the data.
orders = spark.read.csv(
    'data//retail_db//orders.csv', header=True, inferSchema=True
)
order_items = spark.read.csv(
    'data//retail_db//order_items.csv', header=True, inferSchema=True
)
customers = spark.read.csv(
    'data//retail_db//customers.csv', header=True, inferSchema=True
)
products = spark.read.csv(
    'data//retail_db//products.csv', header=True, inferSchema=True
)

# select

In [None]:
# The same column referenced three ways: DataFrame attribute, plain
# column name, and col().
orders.select(
    orders.order_status,
    'order_status',
    col('order_status'),
).show()

#### When the column is passed to a function, however, it must be given in full notation, i.e. dataframe.column_name, or wrapped in col(), as below

In [None]:
# Apply lower() to the column, once via the DataFrame attribute and once
# via col().
orders.select(
    lower(orders.order_status),
    lower(col('order_status')),
).show()
# Note: orders.select(lower('order_status')).show() will not work
# NOTE(review): this restriction appears to apply to older PySpark releases
# only; newer ones accept a column name here. Verify against your version.

# alias
#### alias should be enclosed within select 

### Note : Please do not give alias the same name as built-in functions 

In [None]:
# alias() is called on the Column inside select(), so the output column
# is named 'Status_alias'.
status_column = orders.order_status.alias('Status_alias')
orders.select(status_column).show()
# Note: orders.select('order_status').alias('Status_alias').show() will not
# rename the column; there alias() is applied to the DataFrame, not to the
# column.

In [None]:
# Alias the column inside select(), then de-duplicate the rows.
(
    orders
    .select(orders.order_status.alias('Status_alias'))
    .distinct()
    .show()
)
# Note: orders.select('order_status').distinct().alias('Status_alias').show()
# will not rename the column; alias() after distinct() is applied to the
# DataFrame, not to the column.

# withColumn
#### If already existing column name is given, then the new column expression will replace the existing column. (It will not add another column)


In [None]:
# Re-declare each column with an explicit type. Because the names already
# exist, withColumn() replaces the columns instead of adding new ones.
orders = (
    orders
    .withColumn('order_id', orders.order_id.cast('bigint'))
    .withColumn('order_date', orders.order_date.cast('date'))
    .withColumn('order_customer_id', orders.order_customer_id.cast('bigint'))
    .withColumn('order_status', orders.order_status.cast('string'))
)


#### Given below are the cast notations from pyspark.sql.types

In [None]:
# Reference table: pyspark.sql.types class name -> the string notation
# accepted by Column.cast().
# The original cell listed these as bare `Name: notation` lines, which Python
# parses as variable annotations and evaluates, raising NameError (e.g. for
# `binary`). A dict keeps the same information and lets the cell run.
CAST_NOTATIONS = {
    'BinaryType': 'binary',
    'BooleanType': 'boolean',
    'ByteType': 'tinyint',
    'DateType': 'date',
    'DecimalType': 'decimal(10,0)',
    'DoubleType': 'double',
    'FloatType': 'float',
    'IntegerType': 'int',
    'LongType': 'bigint',
    'ShortType': 'smallint',
    'StringType': 'string',
    'TimestampType': 'timestamp',
}
# Bare expression so the notebook renders the table as the cell output.
CAST_NOTATIONS
# selectExpr

#### SQL like expressions can be used for evaluation

In [None]:
# Concatenate the four columns into one comma-separated text column using
# the SQL `||` operator.
orders.selectExpr(
    'order_id||","||order_date||","||order_customer_id||","||order_status||"," as textdata '
).show()

# case
#### CASE as in SQL can be used

In [None]:
# Derive a status label with a SQL CASE expression.
# Fix: the label for COMPLETE/CLOSED orders was misspelled "COMPLETELD".
orders.selectExpr(
    'CASE '
    'WHEN order_status in ("COMPLETE","CLOSED") THEN "COMPLETED" '
    'WHEN order_status = "CANCELED" THEN "CANCEL" '
    'ELSE "NONE" '
    'END Derived_status'
).show()

# when
#### Dataframe equivalent of case 

In [None]:
# DataFrame equivalent of CASE: chain when() clauses and finish with
# otherwise() for the default.
# Fix: the status value used throughout this notebook is 'COMPLETE', not
# 'COMPLETED', so the first branch could not match completed orders.
orders.withColumn(
    'Derived_Status',
    when(orders.order_status.isin('COMPLETE', 'CLOSED'), 'OVER')
    .when(orders.order_status.isin('CANCELED'), 'PENDING')
    .otherwise('NONE'),
).show()

# withColumnRenamed
#### columns can be renamed with this API, where the first argument is the existing column name and the second argument is the new name.

In [None]:
# Rename order_status (existing name) to status_of_order (new name).
(
    orders
    .withColumnRenamed('order_status', 'status_of_order')
    .show()
)

# filter

#### filter as in SQL format

In [None]:
# The filter condition is given as a SQL string.
# Note: a single '=' is used for equality, as in SQL.
completed_sql = orders.filter("order_status = 'COMPLETE'")
completed_sql.show()

#### filter as in Dataframe format

In [None]:
# The filter condition is given as a Column expression.
# Note: the Python equality operator '==' is used here.
is_complete = orders.order_status == 'COMPLETE'
orders.filter(is_complete).show()

#### multiple filters in SQL format filter

In [None]:
# Two conditions combined with OR inside a single SQL filter string.
(
    orders
    .filter("order_status = 'COMPLETE' OR order_status = 'CLOSED'")
    .show()
)

#### multiple filters in DataFrame format

In [None]:
# Combine Column conditions with the `|` operator instead of calling
# __or__ directly. Each comparison needs its own parentheses because `|`
# binds tighter than `==`.
orders.filter(
    (orders.order_status == 'COMPLETE') | (orders.order_status == 'CLOSED')
).show()

###### Examples of few multiple filters

In [None]:
# SQL-string filter: status is COMPLETE or CLOSED, and the order date
# formatted as yyyyMM equals '201308'.
orders.filter(
    "order_status in ('COMPLETE' ,'CLOSED') AND date_format(order_date,'yyyyMM') = '201308'"
).show()

In [None]:
# Column-expression version of the same filter as the SQL-string cell.
# Fix: the pattern must be 'yyyyMM' (calendar year), matching the SQL cell;
# upper-case 'Y' is the week-based-year pattern letter and does not mean
# the calendar year.
# The conditions are combined with `&` rather than calling __and__ directly.
orders.filter(
    orders.order_status.isin('CLOSED', 'COMPLETE')
    & (date_format(orders.order_date, 'yyyyMM') == '201308')
).show()

In [None]:
# filter() calls can be chained and may mix SQL strings with Column
# expressions; every condition must hold.
(
    orders
    .filter('order_customer_id >= 1000')
    .filter('order_customer_id != 1000')
    .filter(orders.order_id.between(1000, 1999))
    .show()
)

# join

#### simple inner join with one column mapping

In [None]:
# Inner join (the default join type) on a single column pair.
orders.join(
    order_items,
    orders.order_id == order_items.order_item_order_id,
).show()

#### inner join with multiple columns

In [None]:
# Inner join with several conditions combined by `&`; each condition is
# parenthesised.
# NOTE: both conditions here are the same comparison, so this cell only
# demonstrates the syntax.
orders.join(
    order_items,
    (orders.order_id == order_items.order_item_order_id)
    & (orders.order_id == order_items.order_item_order_id),
).show()

#### left join

In [None]:
# Left join: the join type is passed as the third argument.
customers.join(
    orders,
    customers.customer_id == orders.order_customer_id,
    'left',
).show()

#### right join

In [None]:
# Right join: the join type is passed as the third argument.
customers.join(
    orders,
    customers.customer_id == orders.order_customer_id,
    'right',
).show()

#### leftanti join - Join to fetch records which exists only in one table
In the case below, it fetches the customers that have no matching record in orders.

In [None]:
# Left anti join: keeps the rows of customers (the left side) that have
# no match in orders.
customers.join(
    orders,
    orders.order_customer_id == customers.customer_id,
    'leftanti',
).show()

#### crossJoin

In [None]:
# Cross join every order with the customer rows whose customer_id is 1.
orders.crossJoin(
    customers.filter(customers.customer_id == 1)
).show()

# distinct
#### distinct will come at the end after the select

In [None]:
# Select the column first, then de-duplicate.
(
    orders
    .select('order_status')
    .distinct()
    .show()
)

# countDistinct
#### countDistinct will be handy 

In [None]:
# Count the distinct order_status values in a single aggregate.
orders.select(
    countDistinct('order_status')
).show()

# orderBy/sort
#### Note: sort is just an alias to orderBy

In [None]:
# Sort by order_date ascending, then by order_status descending.
orders.orderBy(
    orders.order_date,
    orders.order_status.desc(),
).show()

Note: desc() is a Column method, so sorting in descending order requires the full column notation (dataframe.column_name) or col(); a plain string column name cannot call desc().

In [None]:
# Same sort, with a plain name for the ascending column and col() for the
# descending one.
orders.orderBy(
    'order_date',
    col('order_status').desc(),
).show()

# drop

drop() will only take just the column names and it will not take any other expressions for the column

In [None]:
# Join order_items to products, then drop three product columns by name.
(
    order_items
    .join(products, order_items.order_item_product_id == products.product_id)
    .drop('product_price', 'product_description', 'product_image')
    .show()
)

In [None]:
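#order_items.join(products, order_items.order_item_product_id==products.product_id).drop(products.product_price, products.product_description, products.product_image).show()
# Passing several Column objects (full column names) to drop() will not work; when dropping more than one column, give the names as strings.
# NOTE(review): newer PySpark releases may accept several Column objects here. Verify against your version.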