Data preprocessing for the global sales data used in the Udemy course linked below:<br>
https://www.udemy.com/course/the-complete-power-bi-practical-course

In [9]:
from pyspark.sql import SparkSession

# Get (or create) the Spark session used by this notebook
spark = (
    SparkSession.builder
    .appName("GlobalSalesDataProcessing")
    .getOrCreate()
)

# Load the internet-sales fact table from CSV: the first line is treated as
# the header row and Spark is asked to infer the column types
sales_csv_path = '../input_data/udemy_ms_powerbi/factInternetSales.csv'
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(sales_csv_path)
)

# Preview the first rows
df.show()

+----------+----------+----------+---------+-----------+------------+-----------------+----------------+--------------------+--------------+-------------+---------+--------------+--------------------+--------------+-------------------+----------------+-------+-------+--------+-------+----+--------+
|ProductKey| OrderDate|   DueDate| ShipDate|CustomerKey|PromotionKey|SalesTerritoryKey|SalesOrderNumber|SalesOrderLineNumber|RevisionNumber|OrderQuantity|UnitPrice|ExtendedAmount|UnitPriceDiscountPct|DiscountAmount|ProductStandardCost|TotalProductCost|     29|      6|      98|     19|  36|     100|
+----------+----------+----------+---------+-----------+------------+-----------------+----------------+--------------------+--------------+-------------+---------+--------------+--------------------+--------------+-------------------+----------------+-------+-------+--------+-------+----+--------+
|       310|29/12/2010|10/01/2011|5/01/2011|      21768|           1|                6|         SO43

In [13]:
# Rename the 6 currency key columns prior to the unpivot.
# In the loaded frame these columns are headed by bare numbers
# ('29', '6', ...); each one gets a 'ck_'-prefixed name instead.
ck_29, ck_6, ck_98 = 'ck_29', 'ck_6', 'ck_98'
ck_19, ck_36, ck_100 = 'ck_19', 'ck_36', 'ck_100'

# (current column name, new column name) pairs, applied in this order
_currency_column_renames = [
    ('29', ck_29),
    ('6', ck_6),
    ('98', ck_98),
    ('19', ck_19),
    ('36', ck_36),
    ('100', ck_100),
]

for _current_name, _new_name in _currency_column_renames:
    df = df.withColumnRenamed(_current_name, _new_name)

df.show()

+----------+----------+----------+---------+-----------+------------+-----------------+----------------+--------------------+--------------+-------------+---------+--------------+--------------------+--------------+-------------------+----------------+-------+-------+--------+-------+-----+--------+
|ProductKey| OrderDate|   DueDate| ShipDate|CustomerKey|PromotionKey|SalesTerritoryKey|SalesOrderNumber|SalesOrderLineNumber|RevisionNumber|OrderQuantity|UnitPrice|ExtendedAmount|UnitPriceDiscountPct|DiscountAmount|ProductStandardCost|TotalProductCost|  ck_29|   ck_6|   ck_98|  ck_19|ck_36|  ck_100|
+----------+----------+----------+---------+-----------+------------+-----------------+----------------+--------------------+--------------+-------------+---------+--------------+--------------------+--------------+-------------------+----------------+-------+-------+--------+-------+-----+--------+
|       310|29/12/2010|10/01/2011|5/01/2011|      21768|           1|                6|         S

In [14]:
# Unpivot the six currency columns (ck_29 ... ck_100) into 2 columns:
# "Currency Key" (the numeric key) and "Sales Amount" (the value that was
# stored in the matching ck_<key> column). Every input row produces one
# output row per currency key.
from pyspark.sql.functions import expr, explode

# Columns carried through unchanged onto every unpivoted row.
# Defined once so both select() calls below stay in sync.
id_columns = [
    "ProductKey",
    "OrderDate",
    "DueDate",
    "ShipDate",
    "CustomerKey",
    "PromotionKey",
    "SalesTerritoryKey",
    "SalesOrderNumber",
    "SalesOrderLineNumber",
    "RevisionNumber",
    "OrderQuantity",
    "UnitPrice",
    "ExtendedAmount",
    "UnitPriceDiscountPct",
    "DiscountAmount",
    "ProductStandardCost",
    "TotalProductCost",
]

# Currency keys to unpivot; key k is read from the column named ck_<k>.
# The list order is the order of the generated rows within each input row.
currency_keys = [29, 6, 98, 19, 36, 100]

# One SQL struct(<key> as Currency_Key, ck_<key> as Sales_Amount) per key.
currency_structs = ", ".join(
    f"struct({key} as Currency_Key, ck_{key} as Sales_Amount)"
    for key in currency_keys
)

df_unpivoted = (
    df
    # Pack the structs into an array and explode it: one row per struct.
    .select(
        *id_columns,
        explode(expr(f"array({currency_structs})")).alias("Currency_Amount"),
    )
    # Flatten the exploded struct back into two top-level columns.
    .select(
        *id_columns,
        "Currency_Amount.Currency_Key",
        "Currency_Amount.Sales_Amount",
    )
    # Apply the final, space-separated column names.
    .withColumnRenamed("Currency_Key", "Currency Key")
    .withColumnRenamed("Sales_Amount", "Sales Amount")
)

df_unpivoted.show()

+----------+----------+----------+---------+-----------+------------+-----------------+----------------+--------------------+--------------+-------------+---------+--------------+--------------------+--------------+-------------------+----------------+------------+------------+
|ProductKey| OrderDate|   DueDate| ShipDate|CustomerKey|PromotionKey|SalesTerritoryKey|SalesOrderNumber|SalesOrderLineNumber|RevisionNumber|OrderQuantity|UnitPrice|ExtendedAmount|UnitPriceDiscountPct|DiscountAmount|ProductStandardCost|TotalProductCost|Currency_Key|Sales_Amount|
+----------+----------+----------+---------+-----------+------------+-----------------+----------------+--------------------+--------------+-------------+---------+--------------+--------------------+--------------+-------------------+----------------+------------+------------+
|       310|29/12/2010|10/01/2011|5/01/2011|      21768|           1|                6|         SO43697|                   1|             1|            1|  3578.27