### Data Reading JSON

In [0]:
# Load the drivers JSON file into df_json.
# 'header' and 'inferSchema' are CSV reader options that the JSON source does
# not use, so they are dropped here; only the JSON-relevant option remains.
df_json = (
    spark.read.format('json')
    .option('multiLine', False)
    .load('/Volumes/workspace/data/stramingjson/drivers.json')
)

### Data Reading

In [0]:
# Read the BigMart sales CSV with the 'header' and 'inferSchema' options
# switched on, so no explicit schema is supplied at this point.
df = (
    spark.read
    .format('csv')
    .option('inferSchema', 'True')
    .option('header', 'True')
    .load('/Volumes/workspace/data/straming/BigMart Sales.csv')
)

In [0]:
 df.show()

In [0]:
# Show the DataFrame that was just read from the CSV.
df.display()

In [0]:
# Print df's schema to check what the read with 'inferSchema' produced.
df.printSchema()

### DDL SCHEMA

In [0]:
# DDL-style schema string: one "name TYPE" pair per CSV column, used to read
# the file with explicit types instead of inferring them.
# NOTE(review): Item_Weight is declared STRING, yet later cells compare it with
# a number and multiply it by Item_MRP — confirm whether DOUBLE was intended.
my_ddl_schema = '''
                    Item_Identifier STRING,
                    Item_Weight STRING,
                    Item_Fat_Content STRING,
                    Item_Visibility DOUBLE,
                    Item_Type STRING, 
                    Item_MRP DOUBLE,
                    Outlet_Identifier STRING,
                    Outlet_Establishment_Year INT,
                    Outlet_Size STRING,
                    Outlet_Location_Type STRING,
                    Outlet_Type STRING,
                    Item_Outlet_Sales DOUBLE
'''

In [0]:
# Re-read the same CSV, this time applying the explicit DDL schema string
# instead of asking Spark to infer the column types.
df = (
    spark.read
    .format('csv')
    .schema(my_ddl_schema)
    .option('header', 'True')
    .load('/Volumes/workspace/data/straming/BigMart Sales.csv')
)

In [0]:
# Show the DataFrame read with the explicit schema.
df.display()

In [0]:
# Print the schema to confirm it matches the types declared in my_ddl_schema.
df.printSchema()

### SELECT

In [0]:
# Show the full DataFrame before selecting a subset of its columns.
df.display()

In [0]:
# Keep four columns, referenced by name, and show the result.
(
    df.select(
        'Item_Identifier',
        'Item_Weight',
        'Item_Fat_Content',
        'Item_Visibility',
    )
    .display()
)

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *


In [0]:
# Same kind of projection, but with Column objects built by col() instead of
# bare column-name strings.
(
    df.select(
        col('Item_Identifier'),
        col('Item_Weight'),
    )
    .display()
)

In [0]:
# Select Item_Identifier and expose it under the alias 'Item_id'.
(
    df.select(
        col('Item_Identifier').alias('Item_id'),
    )
    .display()
)

In [0]:
# Show df again; the select() cells above displayed their results without
# reassigning df.
df.display()

#### Scenario 1

In [0]:
# Keep only the rows whose Item_Fat_Content equals 'Regular'.
df.where(
    col('Item_Fat_Content') == 'Regular'
).display()

#### Scenario 2

In [0]:
# Rows that are 'Soft Drinks' AND have an Item_Weight below 10.
# Each comparison is parenthesised because & binds tighter than == and <.
df.where(
    (col('Item_Type') == 'Soft Drinks')
    & (col('Item_Weight') < 10)
).display()

#### Scenario 3

In [0]:
# Rows with a null Outlet_Size whose Outlet_Location_Type is 'Tier 1' or 'Tier 2'.
df.where(
    (col('Outlet_Size').isNull())
    & (col('Outlet_Location_Type').isin('Tier 1', 'Tier 2'))
).display()

### WithColumnRenamed

In [0]:
# Show df with Item_Weight renamed to Item_wt; df itself is not reassigned.
df.withColumnRenamed(
    existing='Item_Weight',
    new='Item_wt',
).display()

### withColumn

#### Scenario 1

In [0]:
# Add a 'flag' column holding the literal 'new' on every row, store the result
# back in df, then show it.
df = df.withColumn(
    'flag',
    lit('new'),
)
df.display()

In [0]:
# Show df with a derived 'multiply' column (Item_Weight * Item_MRP).
# The result is only displayed; df is not reassigned.
df.withColumn(
    'multiply',
    col("Item_Weight") * col("Item_MRP"),
).display()

#### Scenario 2

In [0]:
# Shorten the fat-content labels in two passes: 'Regular' -> 'Reg', then
# 'Low Fat' -> 'LF'. The result is only displayed; df is not reassigned.
(
    df
    .withColumn('Item_Fat_Content', regexp_replace(col('Item_Fat_Content'), 'Regular', 'Reg'))
    .withColumn('Item_Fat_Content', regexp_replace(col('Item_Fat_Content'), 'Low Fat', 'LF'))
    .display()
)

### TypeCasting

In [0]:
# Cast Item_Weight to StringType and store the result back in df.
# The source column is referenced with its exact name ('Item_Weight', not
# 'Item_weight') so the lookup still resolves when spark.sql.caseSensitive
# is enabled.
df = df.withColumn('Item_Weight', col('Item_Weight').cast(StringType()))

In [0]:
# Print the schema to check Item_Weight's type after the cast.
df.printSchema()

### Sort

#### Scenario 1

In [0]:
# Order the rows by Item_Weight, largest first.
df.orderBy(
    col('Item_Weight').desc()
).display()

#### Scenario 2

In [0]:
# Order the rows by Item_Visibility, smallest first.
df.orderBy(
    col('Item_Visibility').asc()
).display()

#### Scenario 3

In [0]:
# Sort on two columns, both descending: one 'ascending' flag per column, in
# the same order as the column list.
df.sort(
    ['Item_Weight', 'Item_Visibility'],
    ascending=[False, False],
).display()

#### Scenario 4


In [0]:
# Sort on two columns with mixed directions: Item_Weight ascending, then
# Item_Visibility descending.
# DataFrame.sort only reads the 'ascending' keyword; the 'descending=[0, 1]'
# used before was silently ignored, so both columns sorted ascending. The flags
# below express the same per-column intent through the supported keyword.
df.sort(
    ['Item_Weight', 'Item_Visibility'],
    ascending=[True, False],
).display()

### Limit

In [0]:
# Show at most 10 rows of df.
df.limit(num=10).display()

### Drop

#### Scenario 1

In [0]:
# Show df without the Item_Visibility column; df itself is not reassigned.
(
    df
    .drop('Item_Visibility')
    .display()
)

#### Scenario 2

In [0]:
# Drop several columns in one call and show what is left.
(
    df
    .drop(
        'Item_Visibility',
        'Item_Type',
    )
    .display()
)

### Drop Duplicates

In [0]:
# Remove rows that are duplicated across every column and show the result.
df.drop_duplicates().display()


#### Scenario 2

In [0]:
# De-duplicate on Item_Type only, so one row is kept per distinct Item_Type.
df.dropDuplicates(
    subset=['Item_Type'],
).display()

In [0]:
# Show only the distinct rows of df.
df.distinct().display()

### UNION and UNION BY NAME

In [0]:
# Build two small DataFrames that share the same (id, name) column layout.
data1 = [('1', 'kad'), ('2', 'sid')]
schema1 = 'id STRING, name STRING'

data2 = [('3', 'rahul'), ('4', 'jas')]
schema2 = 'id STRING, name STRING'

df1 = spark.createDataFrame(data=data1, schema=schema1)
df2 = spark.createDataFrame(data=data2, schema=schema2)

 

In [0]:
# Show the first sample DataFrame.
df1.display()

In [0]:
# Show the second sample DataFrame.
df2.display()

### Union

In [0]:
# Append df2's rows to df1's and show the combined result.
df1.union(df2).display()


In [0]:
# Rebuild df1 with its columns declared in the opposite order (name, then id).
data1 = [('kad', '1'), ('sai', '2')]
schema1 = 'name STRING, id STRING'

df1 = spark.createDataFrame(data=data1, schema=schema1)

In [0]:
# df1 is now declared as (name, id) while df2 is (id, name). union() pairs
# columns by position, not by name, so df2's values land under the wrong headers.
df1.union(df2).display()

### Union by name

In [0]:
# unionByName() matches columns by their names, so the differing column order
# of df1 and df2 no longer mixes up the values.
df1.unionByName(df2).display()

### String Functions

#### Initcap()


In [0]:
# Show Item_Type with initcap() applied.
df.select(
    initcap(col('Item_Type'))
).display()

In [0]:
# Show Item_Type converted to lower case.
df.select(
    lower(col('Item_Type'))
).display()

In [0]:
# Show df again; the select() cells above did not reassign it.
df.display()

In [0]:
# Show Item_Type converted to upper case.
df.select(
    upper(col('Item_Type'))
).display()

### Date Functions

#### current_date

In [0]:
# Add a 'current_date' column filled by current_date(), keep it in df, and
# show the result.
df = df.withColumn(
    'current_date',
    current_date(),
)
df.display()

#### date_add()

In [0]:
# Add 'week_after': the 'current_date' column shifted forward by 7 days.
df = df.withColumn(
    'week_after',
    date_add(col('current_date'), 7),
)
df.display()


### Date_sub()

In [0]:
# Show df with a 'week_before' column: 'current_date' minus 7 days via
# date_sub(). The result is only displayed; df is not reassigned.
df.withColumn(
    'week_before',
    date_sub(col('current_date'), 7),
).display()


In [0]:
# Add 'week_before' by calling date_add() with a negative offset (-7 days),
# and keep the new column in df this time.
df = df.withColumn(
    'week_before',
    date_add(col('current_date'), -7),
)
df.display()

### Date_Diff()

In [0]:
# Add 'datediff': the number of days from 'current_date' to 'week_after'.
# datediff() takes (end, start); with the arguments the other way round the
# column came out negative (-7) instead of 7.
# NOTE(review): confirm a positive day count is what this column should hold.
df = df.withColumn(
    'datediff',
    datediff('week_after', 'current_date'),
)
df.display()

### Date_format()

In [0]:
# Overwrite 'week_before' with its value formatted using the 'dd-MM-yyyy'
# pattern, then show the result.
df = df.withColumn(
    'week_before',
    date_format(col('week_before'), 'dd-MM-yyyy'),
)
df.display()