In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
                    .master("local[*]") \
                    .appName("Create Dataframe") \
                    .getOrCreate()

In [None]:
spark.stop()

In [None]:
# Parquet
sales_parquet_df = spark.read.format("parquet") \
                             .load("./Input/Parquet/Sales/part-00000-b1595b31-3ad4-48b7-9f05-a98f4897b19b-c000.snappy.parquet")

In [None]:
sales_parquet_df.show()
# sales_parquet_df.printSchema()

In [None]:
# ORC
# Parquet
sales_orc_df = spark.read.format("ORC") \
                             .load("./Input/ORC/Sales/part-00000-7675c4db-367f-445a-bc43-5f03f92fefd0-c000.snappy.orc")

In [None]:
sales_orc_df.printSchema()
# sales_orc_df.show()

In [None]:
# Folder
# Parquet
sales_parquet_mdf = spark.read.format("parquet") \
                             .load("./Input/Multi_Parquet/Sales/")

In [None]:
from pyspark.sql.functions import input_file_name, current_timestamp

sales_parquet_mdf.withColumn("filename",input_file_name()) \
                 .withColumn("UpdateTimestamp", current_timestamp()).show(truncate=False)

In [16]:
sales_parquet_city_pdf = spark.read.format("parquet") \
                             .load("./Input/city_partitioned/Sales/")

In [18]:
sales_parquet_city_pdf.head(40)

[Row(SalesOrder='SO102', OrderDate='19-01-2024', CustomerID=2, Country='India', ProductID=2, Price=1500, Qty_Sold=5, Qty_Sold_Units='pieces', Amount=7500, Amount_Currency='INR', ShipDate='29-01-2024', OrderStatus='Open', City='Mumbai'),
 Row(SalesOrder='SO107', OrderDate='10-03-2024', CustomerID=1, Country='India', ProductID=7, Price=10000, Qty_Sold=2, Qty_Sold_Units='pieces', Amount=20000, Amount_Currency='INR', ShipDate='15-03-2024', OrderStatus='Hold', City='Mumbai'),
 Row(SalesOrder='SO1012', OrderDate='08-07-2024', CustomerID=3, Country='India', ProductID=1, Price=3000, Qty_Sold=5, Qty_Sold_Units='pieces', Amount=15000, Amount_Currency='INR', ShipDate='23-07-2024', OrderStatus='Open', City='Mumbai'),
 Row(SalesOrder='SO1013', OrderDate='14-08-2024', CustomerID=4, Country='India', ProductID=1, Price=10000, Qty_Sold=10, Qty_Sold_Units='pieces', Amount=100000, Amount_Currency='INR', ShipDate='29-08-2024', OrderStatus='Open', City='Mumbai'),
 Row(SalesOrder='SO1017', OrderDate='01-01-

In [None]:
# sales_parquet_mdf.printSchema()
sales_parquet_mdf.show()

In [19]:
sales_csv_city_mpdf = spark.read.format("csv") \
                             .option("header", True) \
                             .load("./Input/city_order_status_partitioned/Sales/")

In [20]:
sales_csv_city_mpdf.show()

+------+----------+---+-----+---+-----+---+------+------+---+----------+---------+-----------+
| SO105|14-03-2024|  4|India|  5| 8000| 10|pieces| 80000|INR|19-03-2024|     City|OrderStatus|
+------+----------+---+-----+---+-----+---+------+------+---+----------+---------+-----------+
| SO109|03-05-2024|  1|India|  6|70000|  5|pieces|350000|INR|18-05-2024|Ahmedabad|       Open|
|SO1014|02-09-2024|  5|India|  3|10000| 10|pieces|100000|INR|17-09-2024|Ahmedabad|       Open|
|SO1016|20-01-2023|  4|India|  7| 8000|  4|pieces| 32000|INR|09-02-2023|Ahmedabad|       Open|
|SO1026|14-07-2023|  2|India|  1| 1500|  8|pieces| 12000|INR|08-08-2023|Ahmedabad|       Open|
| SO104|30-03-2024|  2|India|  4|10000|  8|pieces| 80000|INR|09-04-2024|     Pune|     Closed|
|SO1018|26-01-2023|  4|India|  3|10000|  5|pieces| 50000|INR|15-02-2023|     Pune|     Closed|
|SO1020|24-03-2023|  1|India|  4| 5000|  5|pieces| 25000|INR|13-04-2023|     Pune|     Closed|
|SO1022|30-04-2023|  1|India|  3|70000|  5|pieces|

In [None]:
from pyspark.sql.functions import input_file_name

sales_parquet_mpdf.withColumn("inpu_file",input_file_name()).show(truncate=False)

In [None]:
from pyspark.sql.functions import input_file_name, regexp_extract

sales_parquet_mpdf.withColumn("inpu_file",input_file_name()) \
                .withColumn(
                        "City", regexp_extract(input_file_name(), r"City=([^/]+)", 1)
                    ).withColumn(
                    "OrderStatus", regexp_extract(input_file_name(), r"OrderStatus=([^/]+)", 1)
                ).show()

In [None]:
# Recursive
# .option("recursiveFileLookup",True)
MainFolder
|_ FolderA - > file1
|_ FolderA / FolderB - file 2, 3

In [21]:
sales_parquet_rpdf = spark.read.format("parquet") \
                            .option("recursiveFileLookup",True) \
                             .load("./Input/MainFolder")

In [22]:
sales_parquet_rpdf.withColumn("inpu_file",input_file_name()).show(truncate=False)

+----------+----------+----------+---------+-------+---------+-----+--------+--------------+------+---------------+----------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|SalesOrder|OrderDate |CustomerID|City     |Country|ProductID|Price|Qty_Sold|Qty_Sold_Units|Amount|Amount_Currency|ShipDate  |OrderStatus|inpu_file                                                                                                                                                                                    |
+----------+----------+----------+---------+-------+---------+-----+--------+--------------+------+---------------+----------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|SO1013    |1