In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# Build (or reuse) a SparkSession on the 'local' master with Hive support.
spark = (
    SparkSession.builder
    .master('local')
    .appName('operDataFrames')
    .enableHiveSupport()
    .getOrCreate()
)
# Keep a handle on the session's underlying SparkContext.
sc = spark.sparkContext

In [None]:
# Load the retail_db CSV files. The first row of each file supplies the
# column names, and column types are inferred from the data.
orders = spark.read.csv(
    path='retail_db//orders.csv', header=True, inferSchema=True
)
order_items = spark.read.csv(
    path='retail_db//order_items.csv', header=True, inferSchema=True
)
customers = spark.read.csv(
    path='retail_db//customers.csv', header=True, inferSchema=True
)
products = spark.read.csv(
    path='retail_db//products.csv', header=True, inferSchema=True
)

# Built-in functions 

### Most of these functions come from `pyspark.sql.functions`; others are built-in DataFrame column methods

In [None]:
import pyspark.sql.functions as F
# Print the documentation of the pyspark.sql.functions module (bound to F).
help(F)

### It is better to use the full notation (e.g. `orders.order_date`) when referring to column names

## substring

In [None]:
# Slice order_date by position (1-based): 7 characters from position 1 as
# 'Month', and 2 characters from position 9 as 'day'.
orders.select(
    substring(orders.order_date, 1, 7).alias('Month'),
    substring(orders.order_date, 9, 2).alias('day'),
).show(n=5)

In [None]:
# Same slicing as substring(), expressed with the Column.substr() method.
orders.select(
    orders['order_date'].substr(startPos=1, length=7).alias('Month'),
    orders['order_date'].substr(startPos=9, length=2).alias('Day'),
).show(n=5)

## substring_index

In [None]:
# Keep everything in order_date that precedes the 2nd occurrence of '-'.
orders.select(
    substring_index(orders.order_date, '-', 2)
).show(n=5)

## instr

In [None]:
# 1-based position of the first occurrence of '07' in order_date
# (0 when the substring is not found).
orders.select(
    instr(orders.order_date, '07')
).show(n=5)

## split

In [None]:
# Split order_date on '-' and keep the first piece as 'year'.
orders.select(
    split(orders.order_date, '-')[0].alias('year')
).show(n=5)

## concat

In [None]:
# Join order_status and order_id with a literal ',' between them.
orders.select(
    concat(orders.order_status, lit(','), orders.order_id)
).show(n=5)

## concat_ws() 
### concat_ws() is very important for concatenating columns with a field delimiter

In [None]:
# Concatenate the three columns, using '!' as the separator between fields.
orders.select(
    concat_ws(
        '!',
        orders.order_status,
        orders.order_id,
        orders.order_customer_id,
    )
).show(n=5)

## reverse

In [None]:
# Reverse the characters of order_status.
orders.select(
    reverse(orders.order_status)
).show(n=5)

## length

In [None]:
# Character length of order_id.
orders.select(
    length(orders.order_id)
).show(n=5)

## when

In [None]:
# Label each order_id as 'EVEN' when divisible by 2, otherwise 'ODD'.
orders.select(
    orders.order_id,
    when(orders.order_id % 2 == 0, 'EVEN').otherwise('ODD'),
).show(n=5)

## repeat

In [None]:
# Repeat the order_id value 10 times back to back.
orders.select(
    repeat(orders.order_id, 10)
).show(n=5)

## rpad

In [None]:
# Right-pad order_id with '*' to a width of 15 characters.
orders.select(
    rpad(orders.order_id, 15, '*')
).show(n=5)

## lpad

In [None]:
# Left-pad order_id with '*' to a width of 15 characters.
orders.select(
    lpad(orders.order_id, 15, '*')
).show(n=5)

## lower

In [None]:
# Lower-case order_status.
orders.select(
    lower(orders['order_status'])
).show(n=5)

## upper

In [None]:
# Upper-case order_status.
orders.select(
    upper(orders['order_status'])
).show(n=5)

## initcap

In [None]:
# Capitalise the first letter of each word in order_status.
orders.select(
    initcap(orders.order_status)
).show(n=5)

## lit

In [None]:
# Project two constant columns: the number 0 and the string 'Anything'.
orders.select(
    lit(0),
    lit('Anything'),
).show(n=5)

## like

In [None]:
# SQL LIKE: order_id values that start with '44' ('%' matches any suffix).
orders.filter(
    orders['order_id'].like('44%')
).show(n=5)

In [None]:
# SQL LIKE: order_id values that end with '44' ('%' matches any prefix).
orders.filter(
    orders['order_id'].like('%44')
).show(n=5)

In [None]:
# SQL LIKE: each '_' matches exactly one character, so this keeps
# five-character order_id values whose last character is '4'.
orders.filter(
    orders['order_id'].like('____4')
).show(n=5)

## contains

In [None]:
# Keep rows whose order_id contains '444' anywhere.
orders.filter(
    orders['order_id'].contains('444')
).show(n=5)

## endswith

In [None]:
# Keep rows whose order_id ends with '444'.
orders.filter(
    orders['order_id'].endswith('444')
).show(n=5)

## startswith

In [None]:
# Keep rows whose order_id starts with '444'.
orders.filter(
    orders['order_id'].startswith('444')
).show(n=5)

## isNotNull

In [None]:
# Keep rows where order_id has a value (is not NULL).
orders.filter(
    orders['order_id'].isNotNull()
).show(n=5)

## isNull

In [None]:
# Keep rows where order_id is NULL.
orders.filter(
    orders['order_id'].isNull()
).show(n=5)

## isin

In [None]:
# Keep rows whose order_id is one of the listed ids.
# The ids are written as integers so every element of the list has the same
# type; mixing string and integer literals forces Spark to coerce the column
# and the values to a common type before comparing.
# NOTE(review): assumes order_id was inferred as an integer column — confirm.
orders.filter(orders.order_id.isin(1, 2, 3333, 798)).show(5)