# DATA TRANSFORMATION FOR INTERMEDIATE LEVEL

In [0]:
from pyspark.sql.functions import *

In [0]:
# Load the BigMart sales CSV from DBFS: the first row is treated as the header
# and column types are inferred from the data.
df = (
    spark.read
    .format('csv')
    .options(inferSchema=True, header=True)
    .load('dbfs:/FileStore/tables/BigMart_Sales.csv')
)

In [0]:
# Render the freshly loaded DataFrame in the notebook output.
df.display()

## DROP_DUPLICATES

### Requirement: Remove duplicate rows across the entire DataFrame

In [0]:
# No subset is given, so a row is dropped only when it matches another row in
# every column. The result is displayed only; df itself is not reassigned.
df.dropDuplicates().display()

### Requirement: Remove duplicates based on the `Item_Type` column

In [0]:
# Keep a single row per distinct Item_Type value; df itself is left untouched.
df.dropDuplicates(['Item_Type']).display()

### DISTINCT = DROP DUPLICATES WITHOUT SUBSET

In [0]:
# Show only the distinct rows of df; the result is not assigned back.
df.distinct().display()

## UNION AND UNION BY NAME

In [0]:
# Two small frames with the same (id, name) layout, used as inputs for the
# union examples.
schema1 = 'id string, name string'
data1 = [
    ('1', 'kad'),
    ('2', 'sid'),
]
df1 = spark.createDataFrame(data=data1, schema=schema1)

schema2 = 'id string, name string'
data2 = [
    ('3', 'raul'),
    ('4', 'jose'),
]
df2 = spark.createDataFrame(data=data2, schema=schema2)

In [0]:
# Append df2's rows to df1's and keep the combined frame as df3.
# union() pairs columns by position, not by name.
df3 = df1.union(df2)

In [0]:
# Render the combined frame.
df3.display()

### UNION BY NAME: handy when the column order of the two DataFrames doesn't match

In [0]:
# Combine the two frames, pairing columns by name rather than by position.
# Displayed only; nothing is assigned.
df1.unionByName(df2).display()

## STRING FUNCTIONS

### INITCAP

In [0]:
# Show Item_Type with the first letter of each word capitalised; only that
# one column is selected.
df.select(initcap('Item_Type')).display()

### LOWER

In [0]:
# Show Item_Type converted to lower case; only that one column is selected.
df.select(lower('Item_Type')).display()

### UPPER

In [0]:
# Show Item_Type converted to upper case; only that one column is selected.
df.select(upper('Item_Type')).display()

## DATE FUNCTIONS

### CURRENT_DATE

In [0]:
# Add the current date as a new column and keep it on df.
# The column shares its name with the current_date() function: in later cells
# the string 'current_date' refers to this column, not to the function.
df = df.withColumn('current_date', current_date())
df.display()

### DATE_ADD

In [0]:
# Seven days after the current_date column, stored on df as one_week_later.
df = df.withColumn('one_week_later', date_add('current_date', 7))
df.display()

### DATE_SUB

In [0]:
# Alternative spelling of the same shift, using date_add with a negative offset:
# df.withColumn('week_before', date_add('current_date', -7)).display()
# Seven days before the current_date column, stored on df as week_before.
df = df.withColumn('week_before', date_sub('current_date', 7))
df.display()

### DATEDIFF

In [0]:
# Number of days from current_date to one_week_later (datediff takes the end
# date first, then the start date). Displayed only; df is not reassigned.
df.withColumn('diff_in_days', datediff('one_week_later', 'current_date')).display()

### DATE_FORMAT

In [0]:
# Show week_before rendered with the dd-MM-yyyy pattern. The result is not
# assigned back, so df keeps its original week_before column.
df.withColumn('week_before', date_format('week_before', 'dd-MM-yyyy')).display()

## HANDLING NULLS

### STRATEGIES FOR DROPPING NULLS

In [0]:
# Other dropna strategies, kept for reference:
# df.dropna(how='all').display()   # drop a row only when every column is null
# df.dropna(how='any').display()   # drop a row when any column is null
# Active strategy: drop rows whose Outlet_Size is null (displayed only).
df.dropna(subset=['Outlet_Size']).display()

### STRATEGIES FOR FILLING NULLS

In [0]:
# Alternative, kept for reference: fill with the string 'Not Available'.
# df.fillna('Not Available').display()
# Active strategy: replace nulls in Item_Weight with 0 (displayed only; df is
# not reassigned).
df.fillna(0, subset=['Item_Weight']).display()

## SPLIT AND INDEXING

### SPLIT

In [0]:
# Split Outlet_Type on single spaces and store the result back on df, so from
# here on Outlet_Type holds the list of words rather than the original string.
# NOTE(review): because df is overwritten, re-running this cell would apply
# split() to the already-split column; confirm before re-executing.
df = df.withColumn('Outlet_Type', split('Outlet_Type', ' '))
df.display()

### INDEXING

In [0]:
# Outlet_Type is already an array at this point: the SPLIT cell reassigned df
# with the split result. Index that array directly instead of calling split()
# again, which expects a string column and fails on an array.
# Index 1 selects the second word of the original value.
df.withColumn('Outlet_Type', col('Outlet_Type')[1]).display()

## EXPLODE

In [0]:
# Produce one output row per element of the Outlet_Type array. Displayed
# only; df is not reassigned.
df.withColumn('Outlet_Type', explode('Outlet_Type')).display()

## ARRAY_CONTAINS

In [0]:
# Flag rows whose Outlet_Type array includes the element 'Type1'; the flag is
# shown as an extra Type_1 column and df is not reassigned.
df.withColumn('Type_1', array_contains('Outlet_Type', 'Type1')).display()

## GROUP BY

In [0]:
# Alternative aggregate, kept for reference: total Item_MRP per Item_Type.
# df.groupBy('Item_Type').agg(sum('Item_MRP').alias('sum')).display()
# Active aggregate: average Item_MRP per Item_Type, exposed as column 'avg'.
df.groupBy('Item_Type').agg(avg('Item_MRP').alias('avg')).display()

In [0]:
# Total Item_MRP for every (Item_Type, Outlet_Size) pair, rounded to two
# decimals. round and sum here are the pyspark.sql.functions versions pulled
# in by the wildcard import, not the Python builtins.
(
    df
    .groupBy('Item_Type', 'Outlet_Size')
    .agg(round(sum('Item_MRP'), 2).alias('TOTAL_MRP'))
    .display()
)

In [0]:
# Total and average Item_MRP for every (Item_Type, Outlet_Size) pair, each
# rounded to two decimals. Both aggregates are aliased so the output columns
# get readable names (matching the TOTAL_MRP alias used in the single-aggregate
# cell) instead of auto-generated ones such as "round(sum(Item_MRP), 2)".
# round, sum and avg are the pyspark.sql.functions versions pulled in by the
# wildcard import, not the Python builtins.
(
    df
    .groupBy('Item_Type', 'Outlet_Size')
    .agg(
        round(sum('Item_MRP'), 2).alias('TOTAL_MRP'),
        round(avg('Item_MRP'), 2).alias('AVG_MRP'),
    )
    .display()
)