### Data Reading in Databricks


In [0]:
from pyspark.sql import SparkSession

# Get the existing Spark session, or create one, under the app name
# "Spark DataFrames".
spark = (
    SparkSession.builder
    .appName("Spark DataFrames")
    .getOrCreate()
)
# Bare last expression: the notebook renders the session object.
spark

In [0]:
# Read the addresses CSV two ways: the csv() shortcut and the generic
# format()/option()/load() chain. Both treat the first row as column names
# and let Spark infer the column types.
df = spark.read.csv(
    "/Volumes/workspace/default/data/addresses.csv",
    header=True,
    inferSchema=True,
)
# 'header' must be set explicitly on the generic reader too; without it the
# first row is loaded as data and the columns are named _c0, _c1, ...
df2 = (
    spark.read.format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load('/Volumes/workspace/default/data/addresses.csv')
)
df.show()
df2.display()
df.printSchema()


In [0]:
### Checking files
# List the entries under the data volume directory; as the last expression in
# the cell, the returned listing is shown as the cell output.
dbutils.fs.ls('/Volumes/workspace/default/data')

In [0]:
# Render the `df` DataFrame in the notebook's display output.
df.display()

### Reading JSON

In [0]:
# Read drivers.json as line-delimited JSON (multiLine=False: one JSON record
# per line). 'header' and 'inferSchema' are CSV options and are not JSON data
# source options, so they are not passed here; the JSON reader infers the
# schema itself when none is supplied.
df_json = (
    spark.read.format('json')
    .option('multiLine', False)
    .load('/Volumes/workspace/default/data/drivers.json')
)

In [0]:
# Render the `df_json` DataFrame in the notebook's display output.
df_json.display()

### Schema Definition


In [0]:
# Print the column names and types currently attached to `df`.
df.printSchema()

In [0]:
# Load the BigMart sales CSV, using the first row as column names and letting
# Spark infer the column types.
df_new = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/Volumes/workspace/default/data/BigMart Sales.csv')
)
df_new.display()

# Print the schema of the CSV frame, then the schema of the JSON frame.
df_new.printSchema()
df_json.printSchema()

In [0]:
# Explicit schema written as a DDL string: comma-separated "name type" pairs,
# supplied to the reader instead of asking Spark to infer the types.
my_ddl_schema = '''
                   Item_Identifier string,
                   Item_Weight string,
                   Item_Fat_Content string,
                   Item_Visibility double,
                   Item_Type string,
                   Item_MRP double,
                   Outlet_Identifier string,
                   Outlet_Establishment_Year integer,
                   Outlet_Size string,
                   Outlet_Location_Type string,
                   Outlet_Type string,
                   Item_Outlet_Sales double
                '''

# Read the BigMart CSV with the header row as column names and the DDL schema
# applied.
df_new2 = (
    spark.read
    .schema(my_ddl_schema)
    .option('header', True)
    .csv('/Volumes/workspace/default/data/BigMart Sales.csv')
)
df_new2.display()

# Show only Item_Weight, which the schema above declares as a string.
df_new2.select('Item_Weight').show()

In [0]:
# Print the column names and types of `df_new2`.
df_new2.printSchema()

### StructType() Schema

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [0]:
# Explicit schema built from StructType/StructField objects. Each pair below is
# (column name, Spark type class); every field is created as nullable (True).
my_strct_schema = StructType([
    StructField(column_name, spark_type(), True)
    for column_name, spark_type in (
        ('Item_Identifier', StringType),
        ('Item_Weight', StringType),
        ('Item_Fat_Content', StringType),
        ('Item_Visibility', DoubleType),
        ('Item_Type', StringType),
        ('Item_MRP', DoubleType),
        ('Outlet_Identifier', StringType),
        ('Outlet_Establishment_Year', IntegerType),
        ('Outlet_Size', StringType),
        ('Outlet_Location_Type', StringType),
        ('Outlet_Type', StringType),
        ('Item_Outlet_Sales', DoubleType),
    )
])

In [0]:
# Read the BigMart CSV through the generic reader, using the header row for
# column names and the StructType schema for column types.
# Note: this rebinds `df` to the BigMart data.
df = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .schema(my_strct_schema)
    .load('/Volumes/workspace/default/data/BigMart Sales.csv')
)

In [0]:
# Print the column names and types currently attached to `df`.
df.printSchema()

### Transformation in PySpark (SELECT)

In [0]:
# Keep three columns of df_new, referenced by their names.
df_sel = df_new.select(
    ['Item_Identifier', 'Item_Weight', 'Item_Fat_Content']
)
df_sel.display()


### COL

In [0]:
# Same three-column projection, this time passing Column objects built with
# col() rather than plain name strings.
df_sel = df_new.select(
    *(
        col(column_name)
        for column_name in ('Item_Identifier', 'Item_Weight', 'Item_Fat_Content')
    )
)
df_sel.display()

### ALIAS