In [None]:
from pyspark.sql import SparkSession

# Build (or reuse, if one already exists) a SparkSession that runs locally
# using all available cores ("local[*]").
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Create Dataframe")
    .getOrCreate()
)

In [None]:
# Schema for the sales-orders CSV, written as a comma-separated
# "<column> <type>" string; it is passed to DataFrameReader.schema() when the
# file is loaded below.
# OrderDate and ShipDate are declared as plain strings here, so they are not
# parsed into date types at read time.
sales_schema = """SalesOrder string, 
                    OrderDate string, 
                    CustomerID integer, 
                    City string, 
                    Country string, 
                    ProductID integer, 
                    Price integer, 
                    Qty_Sold integer, 
                    Qty_Sold_Units string, 
                    Amount integer, 
                    Amount_Currency string, 
                    ShipDate string,
                    OrderStatus string
                    """

# Schema for the customer master-data CSV ("<column> <type>" string).
# CustomerID is the only integer column; every other column is read as a string.
customer_schema = """
                    CustomerID integer,
                    CustomerName string,
                    CustomerBucket string,
                    CustomerRegion string,
                    LessThanFullTruckLoad string,
                    YearOnYearVolumeIncrease string
                """
# Schema for the product master CSV ("<column> <type>" string).
# ProductID is an integer; the name and category columns are strings.
product_schema = """
                    ProductID integer,
                    ProductName string,
                    ProductCategory string,
                    ProductSubCategory string
                """



In [None]:
# Load the three CSV inputs. The first line of each file is treated as a
# header row, and column names/types come from the explicit schema strings
# rather than from schema inference.
sales_df = spark.read.csv(
    "./Input/1_sales_orders.csv",
    schema=sales_schema,
    header=True,
)

customer_df = spark.read.csv(
    "./Input/2_Customer_Master_Data.csv",
    schema=customer_schema,
    header=True,
)

product_df = spark.read.csv(
    "./Input/3_product_master.csv",
    schema=product_schema,
    header=True,
)

In [None]:
# Quick sanity check: print the first 20 rows of each frame, with long cell
# values truncated (these are the defaults of DataFrame.show()).
sales_df.show(n=20, truncate=True)
customer_df.show(n=20, truncate=True)
product_df.show(n=20, truncate=True)

In [None]:
# Write each dataset as Parquet. The frame is first repartitioned down to one
# partition, so each output directory should end up with a single part file.
# "overwrite" replaces whatever already exists at the target path.
(
    sales_df.repartition(1)
    .write.mode("overwrite")
    .parquet("./Input/Parquet/Sales")
)

(
    customer_df.repartition(1)
    .write.mode("overwrite")
    .parquet("./Input/Parquet/Customer")
)

(
    product_df.repartition(1)
    .write.mode("overwrite")
    .parquet("./Input/Parquet/Product")
)

In [None]:
# Same sales data as Parquet, but repartitioned into 8 partitions before the
# write so the output is spread across multiple part files.
(
    sales_df.repartition(8)
    .write.mode("overwrite")
    .parquet("./Input/Multi_Parquet/Sales")
)


In [None]:
# Write each dataset as ORC. The frame is first repartitioned down to one
# partition, so each output directory should end up with a single part file.
# "overwrite" replaces whatever already exists at the target path.
(
    sales_df.repartition(1)
    .write.mode("overwrite")
    .orc("./Input/ORC/Sales")
)

(
    customer_df.repartition(1)
    .write.mode("overwrite")
    .orc("./Input/ORC/Customer")
)

(
    product_df.repartition(1)
    .write.mode("overwrite")
    .orc("./Input/ORC/Product")
)

In [None]:
# Same sales data as ORC, but repartitioned into 8 partitions before the
# write so the output is spread across multiple part files.
(
    sales_df.repartition(8)
    .write.mode("overwrite")
    .orc("./Input/Multi_ORC/Sales")
)


In [None]:
# Write the sales data as Parquet, partitioned on disk by the City column
# (Spark lays this out as City=<value> sub-directories under the target path).
(
    sales_df.write
    .mode("overwrite")
    .partitionBy("City")
    .parquet("./Input/city_partitioned/Sales")
)

In [None]:
# Write the sales data as CSV, partitioned on disk by City and then by
# OrderStatus (nested City=<value>/OrderStatus=<value> sub-directories).
# NOTE(review): no "header" option is set, so the CSV part files are written
# without a header row (the CSV writer's default) — confirm that is intended.
(
    sales_df.write
    .mode("overwrite")
    .partitionBy("City", "OrderStatus")
    .csv("./Input/city_order_status_partitioned/Sales")
)