# DATA TRANSFORMATION FOR BEGINNER LEVEL

In [0]:
from pyspark.sql.functions import *

In [0]:
# List the entries under the DBFS tables directory; in a notebook the
# returned listing is shown as the cell output.
dbutils.fs.ls('dbfs:/FileStore/tables')

In [0]:
# Load the BigMart sales CSV from DBFS: the first row supplies the column
# names and Spark infers each column's type from the data.
df = (
    spark.read
    .format('csv')
    .option('header', True)
    .option('inferSchema', True)
    .load('dbfs:/FileStore/tables/BigMart_Sales.csv')
)

In [0]:
# Show the column names of the loaded DataFrame.
df.columns

## SELECT

### Requirement: Show only the columns with names: Item_Identifier, Item_Weight, Item_Fat_Content

In [0]:
# Keep only the three requested columns. Passing the bare names as a list,
# df.select(['Item_Identifier', 'Item_Weight', 'Item_Fat_Content']), is equivalent.
df.select(
    *[col(name) for name in ('Item_Identifier', 'Item_Weight', 'Item_Fat_Content')]
).display()

## ALIAS

### Requirement: Change the column name from Item_Identifier to Item_ID

In [0]:
# Project Item_Identifier under the new name Item_ID; only that one column
# is selected.
df.select(df['Item_Identifier'].alias('Item_ID')).display()

## FILTER / WHERE

### Requirement: Filter the data with fat content = Regular

In [0]:
# Keep the rows whose Item_Fat_Content is exactly 'Regular'
# (where() is an alias of filter()).
df.where(df['Item_Fat_Content'] == 'Regular').display()

### Requirement: Slice the data with item type = Soft Drinks and weight < 10

In [0]:
# Soft drinks lighter than 10: two chained filters are equivalent to a
# single filter with both conditions AND-ed together.
(
    df
    .filter(col('Item_Type') == 'Soft Drinks')
    .filter(col('Item_Weight') < 10)
    .display()
)

### Requirement: Fetch the rows where the outlet location type is Tier 1 or Tier 2 and Outlet_Size is null

In [0]:
# Rows with a missing Outlet_Size whose location type is Tier 1 or Tier 2.
df.filter(
    col('Outlet_Size').isNull()
    & col('Outlet_Location_Type').isin('Tier 1', 'Tier 2')
).display()

## WITHCOLUMNRENAMED

In [0]:
# Display the DataFrame with Item_Weight renamed to Item_Wt. The result is
# not assigned, so df itself keeps the original column name.
df.withColumnRenamed('Item_Weight', 'Item_Wt').display()

## WITHCOLUMN

### Requirement: Add a new column with the help of lit()

In [0]:
# Add a constant column 'flag' holding the literal string 'new'.
# NOTE: df is rebound here, so every later cell sees the extra 'flag' column.
df = df.withColumn('flag', lit('new'))
df.display()

In [0]:
# Derived column 'multiply' = Item_Weight * Item_MRP, computed row by row.
# The result is only displayed; df is not rebound.
df.withColumn('multiply', df['Item_Weight'] * df['Item_MRP']).display()

### Requirement: Modify an existing column (Item_Fat_Content)

In [0]:
# Shorten the fat-content labels in place: 'Regular' -> 'Reg' first, then
# 'Low Fat' -> 'LF'. The inner replacement runs before the outer one, which
# matches applying the two substitutions one after the other.
df.withColumn(
    'Item_Fat_Content',
    regexp_replace(
        regexp_replace(col('Item_Fat_Content'), 'Regular', 'Reg'),
        'Low Fat',
        'LF',
    ),
).display()

## TYPE CASTING

In [0]:
# Cast Item_Weight to a string column. The target type is given by name
# ('string') so this cell does not depend on StringType being in scope: the
# visible cells only star-import pyspark.sql.functions, which is not
# guaranteed to re-export the classes from pyspark.sql.types.
# NOTE: df is rebound here, so Item_Weight stays a string in every later cell.
df = df.withColumn('Item_Weight', col('Item_Weight').cast('string'))
df.printSchema()

## SORT / ORDERBY

### Requirement: Sort the column Item_Weight in descending order

In [0]:
# Item_Weight is cast to a string in the TYPE CASTING cell, and strings sort
# lexicographically ('9.3' would rank above '10.1'). Sort on a double cast of
# the column so the descending order is numeric; the cast is only used as the
# sort key, the displayed column is unchanged.
df.sort(col('Item_Weight').cast('double').desc()).display()

### Requirement: Sort the column Item_Visibility in ascending order

In [0]:
# Order the rows by Item_Visibility, smallest value first
# (orderBy() is an alias of sort()).
df.orderBy('Item_Visibility', ascending=True).display()

### Requirement: Sort the DataFrame by Item_Weight and Item_Visibility, both in descending order

In [0]:
# Sort by Item_Weight, then Item_Visibility, both descending.
# Item_Weight is cast to a string in the TYPE CASTING cell, so it is cast to
# double for the sort key; otherwise the primary ordering would be
# lexicographic ('9.3' above '10.1') instead of numeric.
df.sort(
    col('Item_Weight').cast('double').desc(),
    col('Item_Visibility').desc(),
).display()

## LIMIT

In [0]:
# Display at most the first 10 rows of df.
df.limit(10).display()

## DROP

In [0]:
# Display df without the Item_Visibility column; df itself is not rebound.
df.drop('Item_Visibility').display()

In [0]:
# Drop several columns at once by name. Column names are accepted by every
# PySpark release, whereas passing more than one Column object to drop() is
# rejected by older releases (only a single Column was supported).
df.drop('Item_Weight', 'Item_Visibility').display()