In [None]:
import os

# NOTE(review): original order kept on purpose -- env_setup may need to be
# imported before pyspark; confirm before regrouping.
import env_setup
import pyspark.sql.functions as f

# Obtain the SparkSession from the env_setup helper, asking for a local
# session; every later cell goes through this `spark` handle.
spark = env_setup.getSession(
    local=True,
)

Created local SparkSession


In [None]:
#TODO rename sales_with_prices_df to sales_df

# 0. ENV PREPARATION
# Directory holding the workshop CSV files. It defaults to the original
# author's checkout; set PYSPARK_WORKSHOP_DATA_DIR to run the notebook on
# another machine without editing the code.
DATA_DIR = (
    os.environ.get("PYSPARK_WORKSHOP_DATA_DIR")
    or "/Users/mkromka/dev/trainings/pySpark_workshop/data"
)

# Register every CSV (first line is the header) as a temp view named after
# the file, so the same data is reachable from spark.sql() and spark.table().
for view_name in ("sales", "item_prices"):
    spark.read.option("header", "true")\
        .csv(DATA_DIR.rstrip("/") + "/" + view_name + ".csv")\
        .createOrReplaceTempView(view_name)

sales_df = spark.table("sales")
item_prices_df = spark.table("item_prices")

# NOTE(review): no schema is supplied and inferSchema is not enabled, so the
# columns are presumably all strings -- confirm in the printed schemas below.
sales_df.printSchema()
item_prices_df.printSchema()

sales_df.show()
item_prices_df.show()


In [None]:
# 1. SQL support
# Plain SQL against the "sales" temp view: item and date of every SHOP_1 sale,
# newest transaction first.
(
    spark
    .sql('select item_id,transaction_date from sales where shop_id = "SHOP_1" order by transaction_date desc')
    .show()
)
# TODO add other examples to rewrite

In [None]:
#2. Dataframe operations
# The SHOP_1 query written with the DataFrame API. The filter runs before the
# projection so it never refers to a column (shop_id) that select() has
# already dropped; the rows and columns shown are unchanged.

# Variant A: columns referenced by name through f.col
(
    sales_df
    .filter(f.col("shop_id") == "SHOP_1")
    .select("item_id", "transaction_date")
    .orderBy(f.col("transaction_date").desc())
    .show()
)

# Variant B: columns referenced as attributes of the DataFrame
(
    sales_df
    .filter(sales_df.shop_id == "SHOP_1")
    .select(sales_df.item_id, sales_df.transaction_date)
    .orderBy(sales_df.transaction_date.desc())
    .show()
)

In [None]:
# 3. JOINS
# Inner join written in SQL over the two temp views ...
(
    spark
    .sql('select * from sales join item_prices on sales.item_id = item_prices.item_id')
    .show()
)

# ... and the same inner join through the DataFrame API.
(
    sales_df
    .join(item_prices_df, on=(sales_df.item_id == item_prices_df.item_id), how="inner")
    .show()
)

# Joining on an expression keeps both item_id columns, so the copy coming
# from sales_df is dropped to leave a single one.
sales_with_unit_prices_df = (
    sales_df
    .join(item_prices_df, on=(sales_df.item_id == item_prices_df.item_id))
    .drop(sales_df.item_id)
)

sales_with_unit_prices_df.show()

# Filter out excluded items
excluded_items_df = spark.createDataFrame(
    [("ITEM_2",), ("ITEM_4",)],
    schema=['item'],
)

excluded_items_df.show()

# Option 1: left outer join, then keep only the sales that found no match
# in the exclusion list.
(
    sales_df
    .join(excluded_items_df, on=(sales_df.item_id == excluded_items_df.item), how="left_outer")
    .filter(excluded_items_df.item.isNull())
    .drop(excluded_items_df.item)
    .show()
)

# Option 2 (better): a left anti join expresses "rows without a match" directly.
(
    sales_df
    .join(excluded_items_df, on=(sales_df.item_id == excluded_items_df.item), how="left_anti")
    .show()
)

In [None]:
# 4. Adding columns
# total_sales is derived per row as qty multiplied by unit_price.
sales_with_prices_df = sales_with_unit_prices_df.withColumn(
    "total_sales",
    f.col("qty") * f.col("unit_price"),
)

sales_with_prices_df.show()


In [None]:
# 5. Simple aggregations
# Total sales per shop. Without an alias the aggregate column is referred to
# by its generated name, "sum(total_sales)".
(
    sales_with_prices_df
    .groupBy("shop_id")
    .agg(f.sum(sales_with_prices_df.total_sales))
    .orderBy(f.col("sum(total_sales)"))
    .show()
)

# An alias avoids the strange generated column name.
# Note: .orderBy(sales_with_prices_df.sales) would not work here, because
# "sales" exists only on the aggregated result, not on sales_with_prices_df.
(
    sales_with_prices_df
    .groupBy("shop_id")
    .agg(f.sum(sales_with_prices_df.total_sales).alias("sales"))
    .orderBy(f.col("sales").desc())
    .show()
)

# For each item, produce the list of shops in which it was sold.
# NOTE(review): collect_list adds one entry per sale row, so a shop may be
# repeated; collect_set would give distinct shops -- confirm which is wanted.
(
    sales_with_prices_df
    .groupBy("item_id")
    .agg(f.collect_list(f.col("shop_id")).alias("shops"))
    .show()
)

In [None]:
# 6. Date handling
# Derive the usual calendar parts from transaction_date.
(
    sales_with_prices_df
    .withColumn("year", f.year(f.col("transaction_date")))
    .withColumn("month", f.month(f.col("transaction_date")))
    .withColumn("day", f.dayofmonth(f.col("transaction_date")))
    .withColumn("day_of_year", f.dayofyear(f.col("transaction_date")))
    # Day number as a string, 1 = Monday ... 7 = Sunday, i.e. what the 'u'
    # pattern used to produce. date_format(..., 'u') is rejected by Spark 3.0+
    # (week-based pattern letters are unsupported), so the value is derived
    # from dayofweek(), which counts 1 = Sunday ... 7 = Saturday.
    .withColumn(
        "day_of_week",
        (((f.dayofweek(f.col("transaction_date")) + 5) % 7) + 1).cast("string"),
    )
    .withColumn("day_of_week_string", f.date_format(f.col("transaction_date"), 'E'))
    .withColumn("week_of_year", f.weekofyear(f.col("transaction_date")))
    .show()
)


# aggregate sales by week
# (the key is the week number only, so the same week number from different
# years lands in one group)
(
    sales_with_prices_df
    .groupBy(f.weekofyear(f.col("transaction_date")))
    .agg(f.sum(f.col("total_sales")))
    .show()
)

# Weekly aggregations not starting on Monday:
# shift every date forward by one day before taking the week number.
(
    sales_with_prices_df
    .withColumn("transaction_date_moved", f.date_add(f.col("transaction_date"), 1))
    .groupBy(f.weekofyear(f.col("transaction_date_moved")))
    .agg(f.sum(f.col("total_sales")))
    .show()
)

# Drawback: the resulting week_of_year value is not the real week number; it
# is still good enough for calculations that only need the ordering of weeks.

# Different solution where we preserve the last day of every week.
# "Sat" is treated as the day on which a week ends: stepping back one day
# first makes a Saturday transaction map to that same Saturday.
(
    sales_with_prices_df
    .withColumn("aggr_date", f.next_day(f.date_sub(f.col("transaction_date"), 1), "Sat"))
    .groupBy(f.col("aggr_date"))
    .agg(f.sum(f.col("total_sales")))
    .withColumn("day_of_week_string", f.date_format(f.col("aggr_date"), 'E'))
    .show()
)


In [None]:
# 7. Using result of one query in another
# max date globally
(
    sales_with_prices_df
    .select(f.max(f.col("transaction_date")).alias("max_date"))
    .show()
)

# how to add it to every row
# 1. using collect/first
max_date = (
    sales_with_prices_df
    .select(f.max(f.col("transaction_date")).alias("max_date"))
    .first()[0]  # first() returns the first Row; collect() returns a list of Rows
)
# equivalent: .collect()[0][0]

# Keep the DataFrame itself in the variable: show() only prints and returns
# None, so it must not be the last call of the assigned chain.
sales_with_max_global_date_df = sales_with_prices_df\
    .withColumn("global_max_date", f.lit(max_date))

sales_with_max_global_date_df.show()

# 2. using crossJoin (doesn't require invoking an action)
max_date_df = sales_with_prices_df\
    .select(f.max(f.col("transaction_date")).alias("max_date"))

sales_with_max_global_date_cross_join_df = sales_with_prices_df\
    .crossJoin(f.broadcast(max_date_df))

sales_with_max_global_date_cross_join_df.show()
# make sure the DataFrame inside the cross join has exactly one row; otherwise
# every sale is repeated once per extra row


In [None]:
# 8. Window functions
# get max transaction date for each shop

# Approach 1: aggregate per shop, then join the result back onto every sale.
max_date_by_store_df = (
    sales_with_prices_df
    .groupBy(f.col("shop_id"))
    .agg(f.max("transaction_date").alias("max_transaction_date_by_shop"))
)

# Careful: "shop_id" in the join is not a Column, just a string (a list of
# strings works too). Joining this way there is no need to drop a column.
(
    sales_with_prices_df
    .join(max_date_by_store_df, ["shop_id"])
    .show()
)

# Approach 2: use Windows
from pyspark.sql import Window

window = Window.partitionBy(f.col("shop_id"))

(
    sales_with_prices_df
    .withColumn(
        "max_transaction_date_by_shop",
        f.max(f.col("transaction_date")).over(window),
    )
    .show()
)

# Find ordinals for transactions for each item_id (the oldest transaction
# with a given item_id should be 1).
# NOTE(review): rank() gives tied transaction dates the same number; use
# row_number() if strictly unique ordinals are required -- confirm intent.
window_by_item_sorted = (
    Window
    .partitionBy(f.col("item_id"))
    .orderBy(f.col("transaction_date"))
)

(
    sales_with_prices_df
    .withColumn("item_transaction_ordinal", f.rank().over(window_by_item_sorted))
    .show()
)

# Find the average of prices from the last two transactions in a given shop,
# ordered by transaction date (frame = previous row and current row).
window_by_transaction_date = (
    Window
    .partitionBy(f.col("shop_id"))
    .orderBy(f.col("transaction_date"))
    .rowsBetween(-1, Window.currentRow)
)

(
    sales_with_prices_df
    .withColumn(
        "price_moving_average",
        f.mean(f.col("total_sales")).over(window_by_transaction_date),
    )
    .orderBy(f.col("shop_id"), f.col("transaction_date"))
    .show()
)

# Find the average of prices from the current and all previous transactions
# in a given shop, ordered by transaction date.
unbounded_window_by_transaction_date = (
    Window
    .partitionBy(f.col("shop_id"))
    .orderBy(f.col("transaction_date"))
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

(
    sales_with_prices_df
    .withColumn(
        "average_price_until_now",
        f.mean(f.col("total_sales")).over(unbounded_window_by_transaction_date),
    )
    .orderBy(f.col("shop_id"), f.col("transaction_date"))
    .show()
)


In [None]:
# 9. Complex aggregations

# Goal: one row per shop, with a single column holding the list of all its
# weekly sales together with the week and year numbers.

# Step 1: weekly sales by shop
weekly_sales_by_shop_df = (
    sales_with_prices_df
    .groupBy(
        "shop_id",
        f.weekofyear("transaction_date").alias("week"),
        f.year("transaction_date").alias("year"),
    )
    .agg(f.sum("total_sales").alias("sales"))
)

weekly_sales_by_shop_df.show()

# Attempt 1: collect every column into its own list.
shop_sales_weekly_series_df = (
    weekly_sales_by_shop_df
    .groupBy("shop_id")
    .agg(f.collect_list("week"), f.collect_list("year"), f.collect_list("sales"))
)

shop_sales_weekly_series_df.show(truncate=False)
# The solution above won't work, as the ordering in each collected column may
# be different.

# Attempt 2 (kept commented out): pass several columns to one collect_list.
# shop_sales_weekly_series_df = weekly_sales_by_shop_df\
#     .groupBy("shop_id")\
#     .agg(f.collect_list(["sales", "week"]))
# won't work, can't collect more than one column

# Attempt 3: pack the columns into a struct and collect the structs, so the
# values of one week stay together.
shop_sales_weekly_series_df = (
    weekly_sales_by_shop_df
    .groupBy("shop_id")
    .agg(f.collect_list(f.struct(["year", "week", "sales"])).alias("sales_ts"))
)

shop_sales_weekly_series_df.show(truncate=False)

# What about sorting?
# We could try to sort before the aggregation:

ordered_weekly_sales_df = weekly_sales_by_shop_df.orderBy("shop_id", "year", "week")

ordered_weekly_sales_df.show()

wrongly_sorted_series_df = (
    ordered_weekly_sales_df
    .groupBy("shop_id")
    .agg(f.collect_list(f.struct(["year", "week", "sales"])).alias("sales_ts"))
)

wrongly_sorted_series_df.show(truncate=False)
# It won't work, because collect_list may not preserve ordering!

# We need to sort the list inside every row - and to do that we need UDFs
# (User Defined Functions).


In [None]:
# 10. Defining custom UDFs
