In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session for this notebook.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Create Dataframe")
    .getOrCreate()
)

In [2]:
# Bare expression: the notebook renders the session object as this cell's output.
spark

In [3]:
# DDL-style schema string for the sales orders file; it is handed to .schema()
# when the CSV is loaded. OrderDate and ShipDate are declared as plain strings.
sales_schema = """SalesOrder string, 
                    OrderDate string, 
                    CustomerID integer, 
                    City string, 
                    Country string, 
                    ProductID integer, 
                    Price integer, 
                    Qty_Sold integer, 
                    Qty_Sold_Units string, 
                    Amount integer, 
                    Amount_Currency string, 
                    ShipDate string,
                    OrderStatus string
                    """

# Schema string for the customer master data file (CustomerID is the only
# integer column; everything else is read as string).
customer_schema = """
                    CustomerID integer,
                    CustomerName string,
                    CustomerBucket string,
                    CustomerRegion string,
                    LessThanFullTruckLoad string,
                    YearOnYearVolumeIncrease string
                """
# Schema string for the product master file.
product_schema = """
                    ProductID integer,
                    ProductName string,
                    ProductCategory string,
                    ProductSubCategory string
                """



In [8]:
# Load the three input CSV files. Each file has a header row, and the column
# names/types come from the explicit schema strings rather than inference.
sales_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(sales_schema)
    .load("../04 - Read Data from Files/Input/1_sales_orders.csv")
)

customer_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(customer_schema)
    .load("../04 - Read Data from Files/Input/2_Customer_Master_Data.csv")
)

product_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(product_schema)
    .load("../04 - Read Data from Files/Input/3_product_master.csv")
)

In [9]:
# Preview each loaded DataFrame in turn (sales, then customers, then products).
for frame in (sales_df, customer_df, product_df):
    frame.show()

+----------+----------+----------+---------+-------+---------+-----+--------+--------------+------+---------------+----------+-----------+
|SalesOrder| OrderDate|CustomerID|     City|Country|ProductID|Price|Qty_Sold|Qty_Sold_Units|Amount|Amount_Currency|  ShipDate|OrderStatus|
+----------+----------+----------+---------+-------+---------+-----+--------+--------------+------+---------------+----------+-----------+
|     SO101|08-01-2024|         1|Hyderabad|  India|        1| 3000|       4|        pieces| 12000|            INR|18-01-2024|       Open|
|     SO102|19-01-2024|         2|   Mumbai|  India|        2| 1500|       5|        pieces|  7500|            INR|29-01-2024|       Open|
|     SO103|01-01-2024|         3|     Pune|  India|        3| 1500|      10|        pieces| 15000|            INR|11-01-2024|     Closed|
|     SO104|30-03-2024|         2|     Pune|  India|        4|10000|       8|        pieces| 80000|            INR|09-04-2024|     Closed|
|     SO105|14-03-2024|    

In [13]:
# Write the customer data as CSV with a header row, replacing any previous
# output at the target path.
(
    customer_df.write
    .format("csv")
    .option("header", True)
    .mode("overwrite")
    .save("./files/customer.csv")
)

In [16]:
# How many partitions back sales_df as loaded (shown as the cell output).
sales_df.rdd.getNumPartitions()

1

In [17]:
# Redistribute the sales rows across 8 partitions; sales_df itself is unchanged.
sales_part_df = sales_df.repartition(numPartitions=8)

In [18]:
# Confirm the partition count of the repartitioned DataFrame.
sales_part_df.rdd.getNumPartitions()

8

In [19]:
# Write the repartitioned sales data as CSV (with header), replacing any
# existing output under ./files/sales.
(
    sales_part_df.write
    .format("csv")
    .option("header", True)
    .mode("overwrite")
    .save("./files/sales")
)

In [20]:
from pyspark.sql.functions import spark_partition_id

# Add a column holding the id of the partition each row sits in, then preview.
(
    sales_part_df
    .withColumn("part_colum", spark_partition_id())
    .show()
)

+----------+----------+----------+---------+-------+---------+-----+--------+--------------+------+---------------+----------+-----------+----------+
|SalesOrder| OrderDate|CustomerID|     City|Country|ProductID|Price|Qty_Sold|Qty_Sold_Units|Amount|Amount_Currency|  ShipDate|OrderStatus|part_colum|
+----------+----------+----------+---------+-------+---------+-----+--------+--------------+------+---------------+----------+-----------+----------+
|     SO108|03-04-2024|         4|     Pune|  India|        5| 8000|       1|        pieces|  8000|            INR|08-04-2024|       Open|         0|
|    SO1029|17-09-2023|         2|  Chennai|  India|        5| 1500|       8|        pieces| 12000|            INR|12-10-2023|       Open|         0|
|     SO105|14-03-2024|         4|Ahmedabad|  India|        5| 8000|      10|        pieces| 80000|            INR|19-03-2024|       Open|         0|
|     SO101|08-01-2024|         1|Hyderabad|  India|        1| 3000|       4|        pieces| 12000| 

In [24]:
# Collapse to one partition before writing the CSV (with header).
# mode("ignore") means nothing is written if the target already exists.
(
    sales_part_df
    .repartition(1)
    .write
    .format("csv")
    .option("header", True)
    .mode("ignore")
    .save("./files/sales_single")
)

In [28]:
# Write the sales data as Parquet, with one sub-directory per distinct City.
# No "header" option is passed: it is a CSV setting, and Parquet stores the
# schema inside the files themselves.
# mode("ignore") leaves an existing target untouched, so the write is silently
# skipped when ./files/sales_partitioned_parquet already exists.
(
    sales_part_df.write
    .format("parquet")
    .partitionBy("City")
    .mode("ignore")
    .save("./files/sales_partitioned_parquet")
)

In [27]:
from pyspark.sql.functions import spark_partition_id

# Same partition-id tagging as before, restricted to the Ahmedabad rows, to
# show which in-memory partitions those rows are spread across.
(
    sales_part_df
    .withColumn("part_colum", spark_partition_id())
    .filter("City = 'Ahmedabad'")
    .show()
)

+----------+----------+----------+---------+-------+---------+-----+--------+--------------+------+---------------+----------+-----------+----------+
|SalesOrder| OrderDate|CustomerID|     City|Country|ProductID|Price|Qty_Sold|Qty_Sold_Units|Amount|Amount_Currency|  ShipDate|OrderStatus|part_colum|
+----------+----------+----------+---------+-------+---------+-----+--------+--------------+------+---------------+----------+-----------+----------+
|     SO105|14-03-2024|         4|Ahmedabad|  India|        5| 8000|      10|        pieces| 80000|            INR|19-03-2024|       Open|         0|
|     SO109|03-05-2024|         1|Ahmedabad|  India|        6|70000|       5|        pieces|350000|            INR|18-05-2024|       Open|         1|
|    SO1026|14-07-2023|         2|Ahmedabad|  India|        1| 1500|       8|        pieces| 12000|            INR|08-08-2023|       Open|         2|
|    SO1010|01-05-2024|         4|Ahmedabad|  India|        6| 2000|       7|        pieces| 14000| 

In [None]:
# Write each DataFrame as Parquet from a single partition, replacing any
# existing output. Order matches the original: sales, customers, products.
for frame, target in (
    (sales_df, "./Input/Parquet/Sales"),
    (customer_df, "./Input/Parquet/Customer"),
    (product_df, "./Input/Parquet/Product"),
):
    frame.repartition(1).write.format("parquet").mode("overwrite").save(target)

In [None]:
# Write the sales data as Parquet from 8 partitions, replacing any existing output.
(
    sales_df
    .repartition(8)
    .write
    .format("parquet")
    .mode("overwrite")
    .save("./Input/Multi_Parquet/Sales")
)


In [None]:
# Write each DataFrame as ORC from a single partition, replacing any existing
# output. Order matches the original: sales, customers, products.
for frame, target in (
    (sales_df, "./Input/ORC/Sales"),
    (customer_df, "./Input/ORC/Customer"),
    (product_df, "./Input/ORC/Product"),
):
    frame.repartition(1).write.format("orc").mode("overwrite").save(target)

In [None]:
# Write the sales data as ORC from 8 partitions, replacing any existing output.
(
    sales_df
    .repartition(8)
    .write
    .format("orc")
    .mode("overwrite")
    .save("./Input/Multi_ORC/Sales")
)


In [None]:
# Write the sales data as Parquet, split into one sub-directory per City,
# replacing any existing output.
(
    sales_df.write
    .partitionBy("City")
    .format("parquet")
    .mode("overwrite")
    .save("./Input/city_partitioned/Sales")
)

In [None]:
# Write the sales data as CSV, partitioned first by City and then by
# OrderStatus, replacing any existing output.
# NOTE(review): no "header" option is set here, unlike the other CSV writes in
# this notebook - confirm the readers of this output expect headerless files.
(
    sales_df.write
    .partitionBy("City", "OrderStatus")
    .format("csv")
    .mode("overwrite")
    .save("./Input/city_order_status_partitioned/Sales")
)

In [None]:
# Save modes: overwrite, append, ignore, error (alias: errorifexists, the default)