### Data Reading JSON

In [0]:
# Load the drivers JSON file into a DataFrame.
# The JSON source always derives the schema from the records and has no header
# row, so the CSV-only 'inferSchema' and 'header' options were dropped: they
# had no effect here and only suggested behaviour that does not exist.
# 'multiline' = False tells Spark each line of the file is one JSON record.
df_json = (
    spark.read.format('json')
    .option('multiline', False)
    .load('/Volumes/pyspask-data-catalog/pyspark-schema/pyspark-stage/drivers.json')
)

In [0]:
# Render the JSON DataFrame in the notebook output.
df_json.display()

### Data Reading

In [0]:
# List the files available in the staging volume directory.
dbutils.fs.ls('/Volumes/pyspask-data-catalog/pyspark-schema/pyspark-stage/')

In [0]:
# Load the BigMart sales CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
df = (
    spark.read
    .format('csv')
    .option('inferSchema', True)
    .option('header', True)
    .load('/Volumes/pyspask-data-catalog/pyspark-schema/pyspark-stage/BigMart Sales.csv')
)

In [0]:
# Print the first rows of the DataFrame as plain text.
df.show()

In [0]:
# Render the same DataFrame with the notebook's display output.
df.display()

### Schema Definition

In [0]:
# Show the column names and types Spark inferred for the CSV.
df.printSchema()

### DDL SCHEMA

In [0]:
# DDL-style schema for the BigMart sales CSV: one "<column> <type>" entry per
# column, comma-separated, in file order. The string starts with a newline and
# has no trailing comma.
my_ddl_schema = '\n' + ',\n'.join(
    f'{column} {dtype}'
    for column, dtype in (
        ('Item_Identifier', 'string'),
        ('Item_Weight', 'string'),
        ('Item_Fat_Content', 'string'),
        ('Item_Visibility', 'double'),
        ('Item_Type', 'string'),
        ('Item_MRP', 'double'),
        ('Outlet_Identifier', 'string'),
        ('Outlet_Establishment_Year', 'int'),
        ('Outlet_Size', 'string'),
        ('Outlet_Location_Type', 'string'),
        ('Outlet_Type', 'string'),
        ('Item_Outlet_Sales', 'double'),
    )
)

In [0]:
# Re-read the CSV, this time applying the DDL schema string instead of
# letting Spark infer the column types.
df = spark.read.csv(
    '/Volumes/pyspask-data-catalog/pyspark-schema/pyspark-stage/BigMart Sales.csv',
    schema=my_ddl_schema,
    header=True,
)

In [0]:
# Inspect the data loaded with the DDL schema.
df.display()

In [0]:
# Confirm the column types now follow my_ddl_schema.
df.printSchema()

### StructType() Schema

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [0]:
# StructType schema for the BigMart sales CSV; every field is nullable.
# The types mirror my_ddl_schema so both schema definitions describe the file
# the same way: measures are doubles and the establishment year is an int,
# rather than every column being read as a string (which made numeric sorts
# lexicographic and arithmetic depend on implicit casts).
# Item_Weight is kept as a string here, exactly as in my_ddl_schema.
my_strct_schema = StructType([
    StructField('Item_Identifier', StringType(), True),
    StructField('Item_Weight', StringType(), True),
    StructField('Item_Fat_Content', StringType(), True),
    StructField('Item_Visibility', DoubleType(), True),
    StructField('Item_Type', StringType(), True),
    StructField('Item_MRP', DoubleType(), True),
    StructField('Outlet_Identifier', StringType(), True),
    StructField('Outlet_Establishment_Year', IntegerType(), True),
    StructField('Outlet_Size', StringType(), True),
    StructField('Outlet_Location_Type', StringType(), True),
    StructField('Outlet_Type', StringType(), True),
    StructField('Item_Outlet_Sales', DoubleType(), True),
])

In [0]:
# Re-read the CSV using the StructType schema object.
df = (
    spark.read
    .format('csv')
    .schema(my_strct_schema)
    .option('header', True)
    .load('/Volumes/pyspask-data-catalog/pyspark-schema/pyspark-stage/BigMart Sales.csv')
)

In [0]:
# Confirm the column types now follow my_strct_schema.
df.printSchema()

### SELECT

In [0]:
# Show the full DataFrame before selecting columns.
df.display()

In [0]:
# Project only the identifier, weight and fat-content columns.
df.select(
    [col(name) for name in ('Item_Identifier', 'Item_Weight', 'Item_Fat_Content')]
).display()

### ALIAS

In [0]:
# Select Item_Identifier and expose it under the output name Item_ID.
df.select(col("Item_Identifier").alias('Item_ID')).display()

### FILTER/WHERE

In [0]:
# Show the unfiltered DataFrame for comparison with the filters below.
df.display()

#### Scenario - 1

In [0]:
# Keep only rows whose fat content is exactly 'Regular'.
df.where(
    col('Item_Fat_Content') == 'Regular'
).display()

#### Scenario - 2

In [0]:
# Soft drinks that weigh less than 10.
# Item_Weight is loaded as a string, so it is cast to double explicitly: the
# comparison is then numeric by construction instead of depending on Spark's
# implicit string-to-number coercion, whose behaviour varies with the ANSI
# SQL setting.
df.filter(
    (col('Item_Type') == 'Soft Drinks')
    & (col('Item_Weight').cast('double') < 10)
).display()

#### Scenario - 3

In [0]:
# Rows located in Tier 1 or Tier 2 whose outlet size is missing (NULL).
df.where(
    col('Outlet_Location_Type').isin('Tier 1', 'Tier 2')
    & col("Outlet_Size").isNull()
).display()

### withColumnRenamed

In [0]:
# Display the data with Item_Weight renamed to Item_Wt; df itself is not reassigned.
df.withColumnRenamed('Item_Weight','Item_Wt').display()

### withColumn

#### Scenario - 1

In [0]:
# Add a constant column 'flag' holding the literal 'new' on every row and keep it on df.
df = df.withColumn('flag',lit('new'))

In [0]:
# Show the DataFrame including the new 'flag' column.
df.display()

In [0]:
# Derived column: Item_Weight * Item_MRP.
# Both operands are cast to double explicitly so the product is computed on
# numbers regardless of how the columns were read, rather than relying on
# implicit coercion of string columns.
df.withColumn(
    'multiply',
    col("Item_Weight").cast('double') * col("Item_MRP").cast('double'),
).display()

#### Scenario - 2

In [0]:
# Abbreviate fat-content labels: first "Regular" -> "Reg", then "Low Fat" -> "Lf".
# The inner replacement runs first, matching the order of the two steps.
df.withColumn(
    'Item_Fat_Content',
    regexp_replace(
        regexp_replace(col("Item_Fat_Content"), "Regular", "Reg"),
        "Low Fat",
        "Lf",
    ),
).display()

### Type Casting

In [0]:
# Item_Weight is loaded as a string by my_strct_schema, so casting it to
# StringType changed nothing. Cast it to double instead: the type really
# changes, and the sorts on Item_Weight further down order numerically
# rather than lexicographically.
df = df.withColumn('Item_Weight', col('Item_Weight').cast(DoubleType()))

In [0]:
# Check the type of Item_Weight after the cast.
df.printSchema()

### Sort/orderBy

#### Scenario - 1

In [0]:
# Order rows by Item_Weight, largest first.
df.orderBy(
    col("Item_Weight").desc()
).display()

#### Scenario - 2

In [0]:
# Order rows by Item_Visibility, smallest first.
df.orderBy(
    col("Item_Visibility").asc()
).display()

#### Scenario - 3

In [0]:
# Order by Item_Weight then Item_Visibility, both descending.
df.orderBy(
    'Item_Weight',
    'Item_Visibility',
    ascending=[False, False],
).display()

#### Scenario - 4

In [0]:
# Order by Item_Weight descending, breaking ties by Item_Visibility ascending.
df.orderBy(
    'Item_Weight',
    'Item_Visibility',
    ascending=[False, True],
).display()

### Limit

In [0]:
# Show at most 10 rows.
df.limit(10).display()

### DROP

#### Scenario - 1

In [0]:
# Display the data without the Item_Visibility column (single column, passed as a Column object).
df.drop(col("Item_Visibility")).display()

#### Scenario - 2

In [0]:
# Display the data without Item_Visibility and Item_Type (several columns, passed by name).
df.drop('Item_Visibility','Item_Type').display()

### DROP_DUPLICATES

#### Scenario - 1

In [0]:
# Remove rows that are duplicated across all columns.
df.dropDuplicates().display()

#### Scenario - 2

In [0]:
# Keep a single row per distinct Item_Type value.
df.dropDuplicates(
    ['Item_Type']
).display()

In [0]:
# Show only the distinct rows of the DataFrame.
df.distinct().display()