In [0]:
from pyspark.sql.functions import col, explode, array_contains # For array functions
from pyspark.sql.functions import initcap, lower, upper, split # For string functions
from pyspark.sql.functions import current_date, date_add, date_sub, date_diff, date_format # For date functions
from pyspark.sql.functions import mean, sum, count, avg, max, min # For aggregate functions

### Note: The way `col()` is used below may not always be correct. Keep this in mind while practicing further.

## Data Loading

In [0]:
# Load the grocery CSV: the first row supplies the column names and the
# column types are inferred from the data.
df_grocery = (
    spark.read.format("csv")
    .options(inferSchema=True, header=True)
    .load("/Volumes/workspace/practice_data/grocery_data")
)

In [0]:
# Two small frames sharing the same (id, name) column layout.
schema = 'id STRING, name STRING'

data1 = [('1', 'kad'), ('2', 'sid'), ('3', 'kiara')]
data2 = [('4', 'justin'), ('5', 'kam')]

df1 = spark.createDataFrame(data=data1, schema=schema)
df2 = spark.createDataFrame(data=data2, schema=schema)

In [0]:
# Two frames holding the same columns in a DIFFERENT order:
# df3 is (id, name) while df4 is (name, id).
schema = 'id STRING, name STRING'
schema2 = 'name STRING, id STRING'

data3 = [('1', 'kad'), ('2', 'sid'), ('3', 'kiara')]
# Named data4 rather than data2 so the rows that df2 was built from are not
# silently overwritten when this cell runs.
data4 = [('justin', '4'), ('kam', '3')]

# Build df3 from data3; the original passed data1 and left data3 unused.
df3 = spark.createDataFrame(data3, schema)
df4 = spark.createDataFrame(data4, schema2)

# Transformations

## Union (concat)

In [0]:
# Stack the rows of df2 underneath df1 (both frames use the same schema).
(
    df1
    .union(df2)
    .display()
)

## UnionByName - Used when the column order is not the same

In [0]:
# unionByName() lines columns up by their names, so df4's (name, id) layout
# is matched correctly against df3's (id, name) layout.
(
    df3
    .unionByName(df4)
    .display()
)

# For contrast: union() lines columns up by position, so df4's first column
# (name) is appended under df3's first column (id) and vice versa.
(
    df3
    .union(df4)
    .display()
)


## String Functions
1) INITCAP()
2) UPPER()
3) LOWER()

In [0]:
# initcap(): capitalise the first letter of each word in Item_Type
(
    df_grocery
    .select(initcap(col('Item_Type')))
    .display()
)

In [0]:
# lower(): convert Item_Type to lower case
(
    df_grocery
    .select(lower(col('Item_Type')))
    .display()
)

In [0]:
# upper(): convert Item_Type to upper case and name the result 'Upper_Type'
(
    df_grocery
    .select(upper(col('Item_Type')).alias('Upper_Type'))
    .display()
)

## Date Functions
1) current_date()
2) date_add()
3) date_sub()
4) date_diff()
5) date_format()

In [0]:
# current_date(): add today's date to every row as 'curr_date'
df_grocery = df_grocery.withColumn(
    'curr_date',
    current_date(),
)
(
    df_grocery
    .select('curr_date')
    .display()
)

In [0]:
# date_add(): 'week_after' is 'curr_date' shifted forward by 7 days
df_grocery = df_grocery.withColumn(
    'week_after',
    date_add(col('curr_date'), 7),
)
(
    df_grocery
    .select('curr_date', 'week_after')
    .display()
)

In [0]:
# date_sub(): 'week_before' is 'curr_date' shifted back by 7 days
df_grocery = df_grocery.withColumn(
    'week_before',
    date_sub(col('curr_date'), 7),
)
(
    df_grocery
    .select('curr_date', 'week_before')
    .display()
)

In [0]:
# date_sub expressed with date_add: a negative day count moves the date back,
# so this recomputes the same 'week_before' column.
df_grocery = df_grocery.withColumn(
    'week_before',
    date_add(col('curr_date'), -7),
)
(
    df_grocery
    .select('curr_date', 'week_before')
    .display()
)

In [0]:
# date_diff(end, start): number of days between 'week_before' and 'week_after'
df_grocery = df_grocery.withColumn(
    'datediff',
    date_diff(col('week_after'), col('week_before')),
)

# Show the new 'datediff' column alongside its inputs; the original select
# left it out, so the value computed in this cell was never displayed.
df_grocery.select('curr_date', 'week_after', 'week_before', 'datediff').display()

In [0]:
# date_format(): render 'week_before' as an 'MM-dd-yyyy' string.
#
# The formatted text is written to a separate column so that 'week_before'
# keeps its date type. Overwriting it in place made this cell (and the
# datediff cell) unsafe to re-run, because a second pass would receive an
# 'MM-dd-yyyy' string instead of a date.
df_grocery = df_grocery.withColumn(
    'week_before_fmt',
    date_format(col('week_before'), 'MM-dd-yyyy'),
)
df_grocery.display()

## Handling nulls
1) Dropping nulls: dropna (any, all, subset)
2) Fill nulls (all, column-specific)

In [0]:
# dropna(): three ways of discarding rows that contain nulls

# how='all': drop a row only when every column in it is null
df_grocery.dropna('all').display()

# how='any': drop a row when at least one column in it is null
df_grocery.dropna('any').display()

# subset: only Outlet_Size is checked; rows where it is null are dropped
df_grocery.dropna(subset=['Outlet_Size']).display()


In [0]:
# fillna(): replace nulls instead of dropping rows

# A numeric fill value only applies to numeric columns; string columns
# (e.g. Outlet_Size) keep their nulls.
df_grocery.fillna(0).display()

# A dict maps column name -> replacement, here filling only Outlet_Size
# with 'Unknown'.
df_grocery.fillna({'Outlet_Size': 'Unknown'}).display()

## Split and Indexing

In [0]:
# split() turns Outlet_Type into an array of space-separated words;
# [0] and [1] pick the first and second word.
#
# withColumn() returns a new DataFrame and leaves df_grocery untouched, so
# each result is displayed here. The original statements discarded the
# returned frames, so the new columns were never rendered.
df_grocery.withColumn('Store_Type', split(col('Outlet_Type'), ' ')[0]).display()
df_grocery.withColumn('Type', split(col('Outlet_Type'), ' ')[1]).display()

## Explode

In [0]:
# Replace Outlet_Type with the array of its space-separated words; this
# array column is the input for explode() and array_contains().
df_exp = df_grocery.withColumn(
    'Outlet_Type',
    split(col('Outlet_Type'), ' '),
)
df_exp.display()

In [0]:
# explode(): emit one output row per element of the Outlet_Type array
(
    df_exp
    .withColumn('Outlet_Type', explode(col('Outlet_Type')))
    .display()
)

## Array_Contains

In [0]:
# array_contains(): flag the rows whose Outlet_Type array holds 'Type1'
df_exp = df_exp.withColumn(
    'Type1_flag',
    array_contains(col('Outlet_Type'), 'Type1'),
)
df_exp.display()

## GroupBy
1) Group by one column and calculate the sum of another column
2) Group by one column and calculate the avg of another column
3) Group by two columns, then calculate the sum of another column
4) Group by two columns, then calculate the sum and avg of another column

In [0]:
# Scenario 1: total Item_MRP per Item_Type
# (sum here is the pyspark.sql.functions aggregate imported at the top of
# the notebook, not the Python builtin)
(
    df_grocery
    .groupBy('Item_Type')
    .agg(sum(col('Item_MRP')))
    .display()
)

In [0]:
# Scenario 2: average Item_MRP per Item_Type
(
    df_grocery
    .groupBy('Item_Type')
    .agg(avg(col('Item_MRP')))
    .display()
)

In [0]:
# Scenario 3: group by two columns and total Item_MRP.
# The scenario list asks for a sum here; the original computed avg, which
# made this cell a duplicate of the next one.
df_grocery.groupBy('Item_Type', 'Outlet_Size').agg(sum(col('Item_MRP'))).display()

In [0]:
# Scenario 4: group by two columns and compute both the sum and the average
# of Item_MRP. The original computed only the average.
df_grocery.groupBy('Item_Type', 'Outlet_Size').agg(
    sum(col('Item_MRP')),
    avg(col('Item_MRP')),
).display()