In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) a local Spark session that runs on every available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Create Dataframe")
    .getOrCreate()
)


In [None]:
# Echo the session object so the notebook shows it as this cell's output.
spark

In [None]:
import requests
# Remote location of the sample sales-orders JSON and the local copy Spark reads.
url = "https://github.com/PratapBodimalla/qbex-adf-student-docs/raw/main/Datasets/DataFlowTransformations/1.SelectTransformation/2_sales_orders.json"
local_path = "/tmp/2_sales_orders.json"

# Download the file.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error (404, 5xx, ...) into an exception here
# instead of silently saving the error page as if it were the dataset.
response = requests.get(url, timeout=30)
response.raise_for_status()
with open(local_path, "wb") as file:
    file.write(response.content)

In [None]:
# First attempt: load the downloaded file with the JSON reader's default
# settings (no multiline option) and preview the result.
sales_json_df = (
    spark.read
    .format("json")
    .load(local_path)
)

sales_json_df.show()

In [None]:
# Reload the same file with the multiline option enabled and preview it
# without truncating long cell values.
sales_json_df = (
    spark.read
    .format("json")
    .option("multiline", True)
    .load(local_path)
)

sales_json_df.show(truncate=False)

In [None]:
# Keep the order keys and promote the fields of the sold_to_party struct
# to top-level columns.
df1 = sales_json_df.select(
    "OrderDate",
    "SalesOrder",
    "sold_to_party.*",
)

In [None]:
# Second level of flattening: expand the nested address struct as well.
df1.select(
    "OrderDate",
    "SalesOrder",
    "CustomerName",
    "address.*",
).show()

In [None]:
# Print the schema of the multiline read to see the nested struct columns.
sales_json_df.printSchema()

In [None]:
# Level-1 flattening: keep the scalar columns and expand each top-level struct
# (sales_amount, sales_quantities, sold_to_party) into its own fields.
expanded_sls_df_l1 = sales_json_df.select(
    "OrderDate",
    "Product",
    "SalesOrder",
    "sales_amount.*",
    "sales_quantities.*",
    "sold_to_party.*",
)

In [None]:
# Preview the level-1 flattened frame.
expanded_sls_df_l1.show()

In [None]:
# Level-2 flattening: carry the already-flattened columns forward and expand
# the remaining nested address struct.
expanded_sls_df_l2 = expanded_sls_df_l1.select(
    "OrderDate",
    "Product",
    "SalesOrder",
    "Amount",
    "Currency",
    "Qty_Sold",
    "Qty_Sold_Units",
    "CustomerName",
    "address.*",
)

In [None]:
# Preview the fully flattened frame.
expanded_sls_df_l2.show()

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, MapType, DateType, IntegerType

# Explicit schema for the sales-orders JSON, assembled bottom-up: each nested
# struct is declared first and then referenced from its parent. All fields are
# nullable.

# sales_amount: integer amount plus its currency code.
sales_amount_struc_type = (
    StructType()
    .add("Amount", IntegerType(), True)
    .add("Currency", StringType(), True)
)

# sales_quantities: integer quantity plus its unit.
sales_quantities_struct_type = (
    StructType()
    .add("Qty_Sold", IntegerType(), True)
    .add("Qty_Sold_Units", StringType(), True)
)

# address: nested inside sold_to_party.
address_strut_type = (
    StructType()
    .add("City", StringType(), True)
    .add("Country", StringType(), True)
)

# sold_to_party: customer name plus the address struct above.
sold_to_party_struct_type = (
    StructType()
    .add("CustomerName", StringType(), True)
    .add("address", address_strut_type, True)
)

# Top-level record; OrderDate is typed as a date rather than a string.
sales_josn_schema = (
    StructType()
    .add("OrderDate", DateType(), True)
    .add("Product", StringType(), True)
    .add("SalesOrder", StringType(), True)
    .add("sales_amount", sales_amount_struc_type, True)
    .add("sales_quantities", sales_quantities_struct_type, True)
    .add("sold_to_party", sold_to_party_struct_type, True)
)

In [None]:
# Read the same file again, this time applying the hand-written schema
# instead of letting Spark infer one.
sales_json_schema_df = (
    spark.read
    .format("json")
    .option("multiline", True)
    .schema(sales_josn_schema)
    .load(local_path)
)

sales_json_schema_df.show()

In [None]:
# Print the schema of the frame that was read with the explicit schema.
sales_json_schema_df.printSchema()

In [None]:
# Level-1 flattening of the schema-typed frame: expand each top-level struct.
expanded_sls_df_sch_l1 = sales_json_schema_df.select(
    "OrderDate",
    "Product",
    "SalesOrder",
    "sales_amount.*",
    "sales_quantities.*",
    "sold_to_party.*",
)
expanded_sls_df_sch_l1.show()

In [None]:
# Level-2 flattening of the schema-typed frame: expand the address struct.
expanded_sls_df_sch_l2 = expanded_sls_df_sch_l1.select(
    "OrderDate",
    "Product",
    "SalesOrder",
    "Amount",
    "Currency",
    "Qty_Sold",
    "Qty_Sold_Units",
    "CustomerName",
    "address.*",
)
expanded_sls_df_sch_l2.show()

In [None]:
## Explode Arrays
# Load the multiline orders file with an inferred schema and preview it.
sales_array_df = (
    spark.read
    .format("json")
    .option("multiline", True)
    .load("./Input/order_multiline.json")
)

sales_array_df.show()

In [None]:
# Print the inferred schema of the orders file.
sales_array_df.printSchema()

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, MapType, DateType, IntegerType, FloatType
# Element type of the order_line_items array: one struct per line item.
order_line_items_struc_type = (
    StructType()
    .add("amount", FloatType(), True)
    .add("item_id", StringType(), True)
    .add("qty", FloatType(), True)
)

# Explicit schema for the orders file: contact is an array of strings and
# order_line_items is an array of the struct declared above.
sales_josn_array_schema = (
    StructType()
    .add("contact", ArrayType(StringType()), True)
    .add("customer_id", StringType(), True)
    .add("order_id", StringType(), True)
    .add("order_line_items", ArrayType(order_line_items_struc_type), True)
)

In [None]:
# Read the orders file with the explicit array schema and preview it
# without truncating the array columns.
sales_array_sch_df = (
    spark.read
    .format("json")
    .option("multiline", True)
    .schema(sales_josn_array_schema)
    .load("./Input/order_multiline.json")
)

sales_array_sch_df.show(truncate=False)

In [None]:
from pyspark.sql.functions import explode
# Explode order_line_items into the new column order_line_items_exp.
# The original array column is kept alongside it.
df3 = sales_array_sch_df.withColumn(
    "order_line_items_exp",
    explode(sales_array_sch_df["order_line_items"]),
)
df3.show()

In [None]:
from pyspark.sql.functions import explode
# Explode the contact array as well, then drop both source array columns so
# only the exploded values remain.
# The contact column is taken from df3 itself, the frame being transformed,
# not from its parent frame. This keeps the cell dependent on a single input
# and avoids relying on Spark resolving a column through another DataFrame.
df4 = (
    df3.withColumn("contacts", explode(df3["contact"]))
    .drop("contact", "order_line_items")
)
# Flatten the exploded line-item struct next to the ids and the contact value.
df4.select(
    "customer_id",
    "order_id",
    "order_line_items_exp.*",
    "contacts",
).show()

In [None]:
# Print the schema of the frame that was read with the explicit array schema.
sales_array_sch_df.printSchema()

In [None]:
from pyspark.sql.functions import explode
# Explode order_line_items into the new column order_line_items_exploded.
sales_array_sch_df_exp = sales_array_sch_df.withColumn(
    "order_line_items_exploded",
    explode(sales_array_sch_df["order_line_items"]),
)


In [None]:
# Show the ids and the still-unexploded contact array next to the flattened
# fields of each exploded line item.
sales_array_sch_df_exp.select(
    "contact",
    "customer_id",
    "order_id",
    "order_line_items_exploded.*",
).show()

In [None]:
# Replace the contact array column in place with its exploded values and
# display the result (nothing is assigned).
sales_array_sch_df_exp.withColumn(
    "contact",
    explode(sales_array_sch_df_exp["contact"]),
).show()

In [None]:
# to_json / from_json

# Load the single-line orders file with the plain-text reader so the JSON can
# be parsed explicitly in a later cell.
sales_array_txt_df = (
    spark.read
    .format("text")
    .load("./Input/order_singleline.json")
)

sales_array_txt_df.show(truncate=False)

In [None]:
# Print the schema of the frame produced by the text reader.
sales_array_txt_df.printSchema()

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, MapType, DateType, IntegerType, FloatType
# NOTE: this cell rebinds order_line_items_struc_type and
# sales_josn_array_schema, which an earlier cell already defined. The
# top-level fields are the same but are listed in a different order here.

# Element type of the order_line_items array: one struct per line item.
order_line_items_struc_type = (
    StructType()
    .add("amount", FloatType(), True)
    .add("item_id", StringType(), True)
    .add("qty", FloatType(), True)
)

# Schema handed to from_json when parsing the raw JSON text.
sales_josn_array_schema = (
    StructType()
    .add("order_id", StringType(), True)
    .add("customer_id", StringType(), True)
    .add("order_line_items", ArrayType(order_line_items_struc_type), True)
    .add("contact", ArrayType(StringType()), True)
)

In [None]:
# from_json: parse the raw JSON string column into a struct column
from pyspark.sql.functions import  from_json
# Parse the text column "value" with the explicit schema. The parsed struct is
# added as json_formatted and the raw text column is kept.
sales_array_txt_df_par = sales_array_txt_df.withColumn(
    "json_formatted",
    from_json(sales_array_txt_df["value"], sales_josn_array_schema),
)
sales_array_txt_df_par.show(truncate=False)

In [None]:
# to_json: serialize the parsed struct column back into a JSON string
from pyspark.sql.functions import to_json, from_json

# Round-trip check: turn the parsed struct back into JSON text in a new
# text_formatted column and display the result (nothing is assigned).
sales_array_txt_df_par.withColumn(
    "text_formatted",
    to_json(sales_array_txt_df_par["json_formatted"]),
).show()