In [1]:
import env_setup
import pyspark.sql.functions as f

# Obtain the Spark session from the project-local env_setup helper. Per the output
# recorded below, this creates a local SparkSession and registers the "sales" and
# "item_prices" views from CSV files.
spark = env_setup.getSession(local=True)

Created local SparkSession
Created "sales" view from CSV file
Created "item_prices" view from CSV file


In [2]:
# TODO: rename sales_with_prices_df to sales_df

# 0. ENV PREPARATION
# Pick up the two registered views as DataFrames.
sales_df = spark.table("sales")
item_prices_df = spark.table("item_prices")

# Print both schemas first, then both contents (sales before item prices each time).
for frame in (sales_df, item_prices_df):
    frame.printSchema()

for frame in (sales_df, item_prices_df):
    frame.show()


root
 |-- shop_id: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- qty: string (nullable = true)
 |-- transaction_date: string (nullable = true)

root
 |-- item_id: string (nullable = true)
 |-- unit_price: string (nullable = true)

+-------+-------+---+----------------+
|shop_id|item_id|qty|transaction_date|
+-------+-------+---+----------------+
| SHOP_1| ITEM_1|  2|      2018-02-01|
| SHOP_1| ITEM_2|  1|      2018-02-01|
| SHOP_1| ITEM_3|  4|      2018-02-10|
| SHOP_2| ITEM_3|  1|      2018-02-02|
| SHOP_2| ITEM_1|  1|      2018-02-11|
+-------+-------+---+----------------+

+-------+----------+
|item_id|unit_price|
+-------+----------+
| ITEM_1|     100.0|
| ITEM_2|     300.0|
| ITEM_3|      50.0|
+-------+----------+



In [3]:
# 1. SQL support
# Items sold by SHOP_1 together with their transaction dates, newest first.
(
    spark
    .sql('select item_id,transaction_date from sales where shop_id = "SHOP_1" order by transaction_date desc')
    .show()
)

# ex1. Plain SQL query: select all transactions with quantity = 1
(
    spark
    .sql('select * from sales where qty = 1')
    .show()
)

# ex2. Mean unit price over all items
(
    spark
    .sql('select mean(unit_price) from item_prices')
    .show()
)

+-------+----------------+
|item_id|transaction_date|
+-------+----------------+
| ITEM_3|      2018-02-10|
| ITEM_1|      2018-02-01|
| ITEM_2|      2018-02-01|
+-------+----------------+

+-------+-------+---+----------------+
|shop_id|item_id|qty|transaction_date|
+-------+-------+---+----------------+
| SHOP_1| ITEM_2|  1|      2018-02-01|
| SHOP_2| ITEM_3|  1|      2018-02-02|
| SHOP_2| ITEM_1|  1|      2018-02-11|
+-------+-------+---+----------------+

+-------------------------------+
|avg(CAST(unit_price AS DOUBLE))|
+-------------------------------+
|                          150.0|
+-------------------------------+



In [4]:
# 2. DataFrame operations
# The SHOP_1 query from the SQL section, with columns referenced by name through f.col ...
(
    sales_df
    .select("item_id", "transaction_date")
    .filter(f.col("shop_id") == "SHOP_1")
    .orderBy(f.col("transaction_date").desc())
    .show()
)

# ... and the same query with columns referenced as attributes of the DataFrame.
(
    sales_df
    .select(sales_df.item_id, sales_df.transaction_date)
    .filter(sales_df.shop_id == "SHOP_1")
    .orderBy(sales_df.transaction_date.desc())
    .show()
)


# ex3. The query from ex1 rewritten with DataFrame operations
sales_df.where(f.col("qty") == 1).show()

+-------+----------------+
|item_id|transaction_date|
+-------+----------------+
| ITEM_3|      2018-02-10|
| ITEM_1|      2018-02-01|
| ITEM_2|      2018-02-01|
+-------+----------------+

+-------+----------------+
|item_id|transaction_date|
+-------+----------------+
| ITEM_3|      2018-02-10|
| ITEM_1|      2018-02-01|
| ITEM_2|      2018-02-01|
+-------+----------------+

+-------+-------+---+----------------+
|shop_id|item_id|qty|transaction_date|
+-------+-------+---+----------------+
| SHOP_1| ITEM_2|  1|      2018-02-01|
| SHOP_2| ITEM_3|  1|      2018-02-02|
| SHOP_2| ITEM_1|  1|      2018-02-11|
+-------+-------+---+----------------+



In [5]:
# 3. JOINS
# Inner join of sales with prices, first as SQL ...
spark.sql('select * from sales join item_prices on sales.item_id = item_prices.item_id').show()

# ... then through the DataFrame API with an explicit join type.
(
    sales_df
    .join(item_prices_df, sales_df.item_id == item_prices_df.item_id, "inner")
    .show()
)

# Both inputs carry item_id; drop the copy coming from sales so only one remains.
sales_with_unit_prices_df = (
    sales_df
    .join(item_prices_df, sales_df.item_id == item_prices_df.item_id)
    .drop(sales_df.item_id)
)

sales_with_unit_prices_df.show()

# ex4. Filter out excluded items
excluded_items_df = spark.createDataFrame([("ITEM_2",), ("ITEM_4",)], ['item'])

excluded_items_df.show()

# Option 1: left outer join, then keep only the rows that found no excluded match.
(
    sales_df
    .join(excluded_items_df, sales_df.item_id == excluded_items_df.item, "left_outer")
    .filter(excluded_items_df.item.isNull())
    .drop(excluded_items_df.item)
    .show()
)

# Option 2 (better): an anti join expresses the same thing directly.
(
    sales_df
    .join(excluded_items_df, sales_df.item_id == excluded_items_df.item, "left_anti")
    .show()
)


+-------+-------+---+----------------+-------+----------+
|shop_id|item_id|qty|transaction_date|item_id|unit_price|
+-------+-------+---+----------------+-------+----------+
| SHOP_1| ITEM_1|  2|      2018-02-01| ITEM_1|     100.0|
| SHOP_1| ITEM_2|  1|      2018-02-01| ITEM_2|     300.0|
| SHOP_1| ITEM_3|  4|      2018-02-10| ITEM_3|      50.0|
| SHOP_2| ITEM_3|  1|      2018-02-02| ITEM_3|      50.0|
| SHOP_2| ITEM_1|  1|      2018-02-11| ITEM_1|     100.0|
+-------+-------+---+----------------+-------+----------+

+-------+-------+---+----------------+-------+----------+
|shop_id|item_id|qty|transaction_date|item_id|unit_price|
+-------+-------+---+----------------+-------+----------+
| SHOP_1| ITEM_1|  2|      2018-02-01| ITEM_1|     100.0|
| SHOP_1| ITEM_2|  1|      2018-02-01| ITEM_2|     300.0|
| SHOP_1| ITEM_3|  4|      2018-02-10| ITEM_3|      50.0|
| SHOP_2| ITEM_3|  1|      2018-02-02| ITEM_3|      50.0|
| SHOP_2| ITEM_1|  1|      2018-02-11| ITEM_1|     100.0|
+-------+----

In [6]:
# 4. Adding columns
# total_sales = quantity * unit price for every transaction.
sales_with_prices_df = sales_with_unit_prices_df.withColumn(
    "total_sales",
    f.col("qty") * f.col("unit_price")
)

sales_with_prices_df.show()

# Column derived from a condition: above 150 is "High", below 60 is "Low", the rest "Medium".
sales_with_transaction_category = sales_with_prices_df.withColumn(
    "price_category",
    f.when(f.col("total_sales") > 150, "High")
     .when(f.col("total_sales") < 60, "Low")
     .otherwise("Medium")
)

sales_with_transaction_category.show()


# ex5. We want to create two-packs of items whose combined price is lower than 360; choose those items.
# hint: use cross join and alias
# NOTE: the item_id inequality only removes an item paired with itself, so every
# qualifying pair is listed twice, as (A, B) and as (B, A).
(
    item_prices_df.alias("items1")
    .crossJoin(item_prices_df.alias("items2"))
    .withColumn("price_sum", f.col("items1.unit_price") + f.col("items2.unit_price"))
    .where((f.col("price_sum") < 360) & (f.col("items1.item_id") != f.col("items2.item_id")))
    .select("items1.item_id", "items2.item_id", "price_sum")
    .show()
)

+-------+---+----------------+-------+----------+-----------+
|shop_id|qty|transaction_date|item_id|unit_price|total_sales|
+-------+---+----------------+-------+----------+-----------+
| SHOP_1|  2|      2018-02-01| ITEM_1|     100.0|      200.0|
| SHOP_1|  1|      2018-02-01| ITEM_2|     300.0|      300.0|
| SHOP_1|  4|      2018-02-10| ITEM_3|      50.0|      200.0|
| SHOP_2|  1|      2018-02-02| ITEM_3|      50.0|       50.0|
| SHOP_2|  1|      2018-02-11| ITEM_1|     100.0|      100.0|
+-------+---+----------------+-------+----------+-----------+

+-------+---+----------------+-------+----------+-----------+--------------+
|shop_id|qty|transaction_date|item_id|unit_price|total_sales|price_category|
+-------+---+----------------+-------+----------+-----------+--------------+
| SHOP_1|  2|      2018-02-01| ITEM_1|     100.0|      200.0|          High|
| SHOP_1|  1|      2018-02-01| ITEM_2|     300.0|      300.0|          High|
| SHOP_1|  4|      2018-02-10| ITEM_3|      50.0|      2

In [7]:
# 5. Simple aggregations
# Total sales per shop; without an alias the aggregate column is named "sum(total_sales)".
(
    sales_with_prices_df
    .groupBy("shop_id")
    .agg(f.sum(sales_with_prices_df.total_sales))
    .orderBy(f.col("sum(total_sales)"))
    .show()
)

# Using an alias to avoid the strange generated column name.
# Note: .orderBy(sales_with_prices_df.sales) would not work here, because
# sales_with_prices_df itself has no "sales" column (it only exists after the aggregation).
(
    sales_with_prices_df
    .groupBy("shop_id")
    .agg(f.sum(sales_with_prices_df.total_sales).alias("sales"))
    .orderBy(f.col("sales").desc())
    .show()
)

# ex6. Produce a list of all shops where each item was sold; the new column should be named "shops"
# hint: collect_list function
(
    sales_with_prices_df
    .groupBy("item_id")
    .agg(f.collect_list(f.col("shop_id")).alias("shops"))
    .show()
)

+-------+----------------+
|shop_id|sum(total_sales)|
+-------+----------------+
| SHOP_2|           150.0|
| SHOP_1|           700.0|
+-------+----------------+

+-------+-----+
|shop_id|sales|
+-------+-----+
| SHOP_1|700.0|
| SHOP_2|150.0|
+-------+-----+

+-------+----------------+
|item_id|           shops|
+-------+----------------+
| ITEM_3|[SHOP_1, SHOP_2]|
| ITEM_2|        [SHOP_1]|
| ITEM_1|[SHOP_1, SHOP_2]|
+-------+----------------+



In [8]:
# 6. Date handling
# Derive the common calendar parts from transaction_date.
# NOTE(review): the 'u' (day-number-of-week) pattern works on the runtime this output was
# recorded with; newer Spark releases may reject it - confirm before upgrading.
(
    sales_with_prices_df
    .withColumn("year", f.year(f.col("transaction_date")))
    .withColumn("month", f.month(f.col("transaction_date")))
    .withColumn("day", f.dayofmonth(f.col("transaction_date")))
    .withColumn("day_of_year", f.dayofyear(f.col("transaction_date")))
    .withColumn("day_of_week", f.date_format(f.col("transaction_date"), 'u'))
    .withColumn("day_of_week_string", f.date_format(f.col("transaction_date"), 'E'))
    .withColumn("week_of_year", f.weekofyear(f.col("transaction_date")))
    .show()
)


# Aggregate sales by week
(
    sales_with_prices_df
    .groupBy(f.weekofyear(f.col("transaction_date")))
    .agg(f.sum(f.col("total_sales")))
    .show()
)

# ex7. Weekly sales aggregation not starting on Monday
# Shift every date forward by one day before taking the week number.
(
    sales_with_prices_df
    .withColumn("transaction_date_moved", f.date_add(f.col("transaction_date"), 1))
    .groupBy(f.weekofyear(f.col("transaction_date_moved")))
    .agg(f.sum(f.col("total_sales")))
    .show()
)

# Unfortunately the week-of-year values are now computed on dates shifted by 1 day,
# but that is not a problem for calculations that only need the ordering of weeks.

# Alternative that keeps the last day of every week as the grouping key:
# "Sat" is treated as the day on which a week ends.
(
    sales_with_prices_df
    .withColumn("aggr_date", f.next_day(f.date_sub(f.col("transaction_date"), 1), "Sat"))
    .groupBy(f.col("aggr_date"))
    .agg(f.sum(f.col("total_sales")))
    .withColumn("day_of_week_string", f.date_format(f.col("aggr_date"), 'E'))
    .show()
)


+-------+---+----------------+-------+----------+-----------+----+-----+---+-----------+-----------+------------------+------------+
|shop_id|qty|transaction_date|item_id|unit_price|total_sales|year|month|day|day_of_year|day_of_week|day_of_week_string|week_of_year|
+-------+---+----------------+-------+----------+-----------+----+-----+---+-----------+-----------+------------------+------------+
| SHOP_1|  2|      2018-02-01| ITEM_1|     100.0|      200.0|2018|    2|  1|         32|          4|               Thu|           5|
| SHOP_1|  1|      2018-02-01| ITEM_2|     300.0|      300.0|2018|    2|  1|         32|          4|               Thu|           5|
| SHOP_1|  4|      2018-02-10| ITEM_3|      50.0|      200.0|2018|    2| 10|         41|          6|               Sat|           6|
| SHOP_2|  1|      2018-02-02| ITEM_3|      50.0|       50.0|2018|    2|  2|         33|          5|               Fri|           5|
| SHOP_2|  1|      2018-02-11| ITEM_1|     100.0|      100.0|2018|   

In [9]:
# 7. Using result of one query in another
# Max transaction date globally
(
    sales_with_prices_df
    .select(f.max(f.col("transaction_date")).alias("max_date"))
    .show()
)

# How to add it to every row
# 1. using collect/first
max_date = (
    sales_with_prices_df
    .select(f.max(f.col("transaction_date")).alias("max_date"))
    .first()[0]  # first() returns the first Row; collect() returns a list of Rows
)
# equivalent: .collect()[0][0]

# Assign the DataFrame first and show it separately: show() only prints and returns
# None, so chaining it into the assignment would leave None in the variable.
sales_with_max_global_date_df = sales_with_prices_df\
    .withColumn("global_max_date", f.lit(max_date))

sales_with_max_global_date_df.show()

# ex8.
# 2. using crossJoin (doesn't require invoking an action such as collect)
max_date_df = sales_with_prices_df\
    .select(f.max(f.col("transaction_date")).alias("max_date"))

# Same as above: keep the joined DataFrame in the variable, then display it.
sales_with_max_global_date_cross_join_df = sales_with_prices_df\
    .crossJoin(f.broadcast(max_date_df))

sales_with_max_global_date_cross_join_df.show()
# Make sure the DF inside the cross join has exactly one row; otherwise every sales row is multiplied.


+----------+
|  max_date|
+----------+
|2018-02-11|
+----------+

+-------+---+----------------+-------+----------+-----------+---------------+
|shop_id|qty|transaction_date|item_id|unit_price|total_sales|global_max_date|
+-------+---+----------------+-------+----------+-----------+---------------+
| SHOP_1|  2|      2018-02-01| ITEM_1|     100.0|      200.0|     2018-02-11|
| SHOP_1|  1|      2018-02-01| ITEM_2|     300.0|      300.0|     2018-02-11|
| SHOP_1|  4|      2018-02-10| ITEM_3|      50.0|      200.0|     2018-02-11|
| SHOP_2|  1|      2018-02-02| ITEM_3|      50.0|       50.0|     2018-02-11|
| SHOP_2|  1|      2018-02-11| ITEM_1|     100.0|      100.0|     2018-02-11|
+-------+---+----------------+-------+----------+-----------+---------------+

+-------+---+----------------+-------+----------+-----------+----------+
|shop_id|qty|transaction_date|item_id|unit_price|total_sales|  max_date|
+-------+---+----------------+-------+----------+-----------+----------+
| SHOP_1|  2

In [10]:
# 8. Window functions
# Get the max transaction date for each shop

# Option 1: aggregate per shop, then join the aggregate back onto the transactions.
max_date_by_store_df = (
    sales_with_prices_df
    .groupBy(f.col("shop_id"))
    .agg(f.max("transaction_date").alias("max_transaction_date_by_shop"))
)

(
    sales_with_prices_df
    .join(max_date_by_store_df, ["shop_id"])
    .show()
)
# Careful: "shop_id" in the join is not a column, just a string (a list of strings works too).
# With this form there is no duplicated join column to drop.

# Option 2: use Windows
# Note: Windows are an experimental feature (even though they're available since Spark 1.4)
from pyspark.sql import Window

window = Window.partitionBy(f.col("shop_id"))

(
    sales_with_prices_df
    .withColumn("max_transaction_date_by_shop", f.max(f.col("transaction_date")).over(window))
    .show()
)

# Find ordinals for transactions for each item_id (the oldest transaction of an item gets 1)
window_by_item_sorted = (
    Window
    .partitionBy(f.col("item_id"))
    .orderBy(f.col("transaction_date"))
)

(
    sales_with_prices_df
    .withColumn("item_transaction_ordinal", f.rank().over(window_by_item_sorted))
    .show()
)

# Find the average of prices from the last two transactions in a given shop, ordered by transaction date
window_by_transaction_date = (
    Window
    .partitionBy(f.col("shop_id"))
    .orderBy(f.col("transaction_date"))
    .rowsBetween(-1, Window.currentRow)
)

(
    sales_with_prices_df
    .withColumn("price_moving_average", f.mean(f.col("total_sales")).over(window_by_transaction_date))
    .orderBy(f.col("shop_id"), f.col("transaction_date"))
    .show()
)

# ex9. Find the average of prices from the current and all previous transactions in a given shop,
# ordered by transaction date
unbounded_window_by_transaction_date = (
    Window
    .partitionBy(f.col("shop_id"))
    .orderBy(f.col("transaction_date"))
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

(
    sales_with_prices_df
    .withColumn("average_price_until_now", f.mean(f.col("total_sales")).over(unbounded_window_by_transaction_date))
    .orderBy(f.col("shop_id"), f.col("transaction_date"))
    .show()
)


+-------+---+----------------+-------+----------+-----------+----------------------------+
|shop_id|qty|transaction_date|item_id|unit_price|total_sales|max_transaction_date_by_shop|
+-------+---+----------------+-------+----------+-----------+----------------------------+
| SHOP_1|  2|      2018-02-01| ITEM_1|     100.0|      200.0|                  2018-02-10|
| SHOP_1|  1|      2018-02-01| ITEM_2|     300.0|      300.0|                  2018-02-10|
| SHOP_1|  4|      2018-02-10| ITEM_3|      50.0|      200.0|                  2018-02-10|
| SHOP_2|  1|      2018-02-02| ITEM_3|      50.0|       50.0|                  2018-02-11|
| SHOP_2|  1|      2018-02-11| ITEM_1|     100.0|      100.0|                  2018-02-11|
+-------+---+----------------+-------+----------+-----------+----------------------------+

+-------+---+----------------+-------+----------+-----------+----------------------------+
|shop_id|qty|transaction_date|item_id|unit_price|total_sales|max_transaction_date_by_shop

In [11]:
# 9. Complex aggregations

# Goal: one row per shop, with a single column holding the list of all its weekly
# sales together with their week and year numbers.

# Weekly sales by shop
weekly_sales_by_shop_df = (
    sales_with_prices_df
    .groupBy("shop_id", f.weekofyear("transaction_date").alias("week"), f.year("transaction_date").alias("year"))
    .agg(f.sum("total_sales").alias("sales"))
)

weekly_sales_by_shop_df.show()

# Attempt 1: collect every column into its own list.
shop_sales_weekly_series_df = (
    weekly_sales_by_shop_df
    .groupBy("shop_id")
    .agg(f.collect_list("week"), f.collect_list("year"), f.collect_list("sales"))
)

shop_sales_weekly_series_df.show(truncate=False)
# The attempt above is not reliable: the ordering inside each collected list may differ.

# Attempt 2 (left commented out): pass several columns to a single collect_list
# shop_sales_weekly_series_df = weekly_sales_by_shop_df\
#     .groupBy("shop_id")\
#     .agg(f.collect_list(["sales", "week"]))
# Won't work, collect_list can't collect more than one column.

# Attempt 3: pack the columns into a struct and collect the structs.
shop_sales_weekly_series_df = (
    weekly_sales_by_shop_df
    .groupBy("shop_id")
    .agg(f.collect_list(f.struct(["year", "week", "sales"])).alias("sales_ts"))
)

shop_sales_weekly_series_df.show(truncate=False)

# What about sorting?
# We could sort before the aggregation:

ordered_weekly_sales_df = weekly_sales_by_shop_df.orderBy("shop_id", "year", "week")

ordered_weekly_sales_df.show()

wrongly_sorted_series_df = (
    ordered_weekly_sales_df
    .groupBy("shop_id")
    .agg(f.collect_list(f.struct(["year", "week", "sales"])).alias("sales_ts"))
)

wrongly_sorted_series_df.show(truncate=False)
# This is not reliable either, because collect_list may not preserve the ordering!

# We need to sort the list within every row - and to do that we need UDFs (User Defined Functions)


+-------+----+----+-----+
|shop_id|week|year|sales|
+-------+----+----+-----+
| SHOP_2|   5|2018| 50.0|
| SHOP_1|   5|2018|500.0|
| SHOP_1|   6|2018|200.0|
| SHOP_2|   6|2018|100.0|
+-------+----+----+-----+

+-------+------------------+------------------+-------------------+
|shop_id|collect_list(week)|collect_list(year)|collect_list(sales)|
+-------+------------------+------------------+-------------------+
|SHOP_2 |[5, 6]            |[2018, 2018]      |[50.0, 100.0]      |
|SHOP_1 |[5, 6]            |[2018, 2018]      |[500.0, 200.0]     |
+-------+------------------+------------------+-------------------+

+-------+--------------------------------+
|shop_id|sales_ts                        |
+-------+--------------------------------+
|SHOP_2 |[[2018,5,50.0], [2018,6,100.0]] |
|SHOP_1 |[[2018,5,500.0], [2018,6,200.0]]|
+-------+--------------------------------+

+-------+----+----+-----+
|shop_id|week|year|sales|
+-------+----+----+-----+
| SHOP_1|   5|2018|500.0|
| SHOP_1|   6|2018|

In [26]:
# 10. Defining custom UDFs
def my_custom_function(column1):
    """Return the string form of ``column1`` prefixed with ``AFTER_UDF_``."""
    prefix = "AFTER_UDF_"
    return "".join([prefix, str(column1)])

# Wrap the plain Python function as a Spark UDF; no return type is given here.
my_custom_udf = f.udf(my_custom_function)

# Apply it to the collected time-series column, then inspect the data and the resulting schema.
df_after_udf = shop_sales_weekly_series_df.withColumn(
    "sales_ts_after_udf",
    my_custom_udf(f.col("sales_ts"))
)
df_after_udf.show()
df_after_udf.printSchema()

#we can register our UDF in catalog and use it in SQL query
from pyspark import SparkContext
from pyspark.sql import SQLContext

# Register the function through the SparkSession's own UDF registry.
# SQLContext.registerFunction is deprecated in favour of spark.udf.register, and going
# through the session avoids building an extra SQLContext just for the registration.
spark.udf.register("my_udf", my_custom_function)

# The registered name can now be used inside SQL text.
spark.sql("select my_udf(shop_id) from sales").show()

# TODO typed version of UDFs
from pyspark.sql.types import IntegerType, StringType, StructType, ArrayType, StructField


#TODO what happens when string type and we return INT and in other way

# returning multiple columns from UDF

def split_shop_id(shop_id):
    """Split a shop id such as ``"SHOP_1"`` into its name part and numeric suffix.

    Args:
        shop_id: string of the form ``<name>_<number>``, or ``None``.

    Returns:
        A ``(name, number)`` tuple with ``number`` as ``int``, or ``None`` when
        ``shop_id`` is ``None`` (so a null input row yields a null result instead
        of failing the UDF).

    Raises:
        ValueError: if there is no underscore or the suffix is not an integer.
    """
    if shop_id is None:
        return None
    # Split on the LAST underscore only, so names that themselves contain
    # underscores (e.g. "MAIN_SHOP_12") still unpack into exactly two parts.
    s, i = shop_id.rsplit("_", 1)
    return s, int(i)  # must be cast to int, otherwise an IntegerType field comes back null

# First variant: wrap the function without declaring a return type.
split_shop_id_udf = f.udf(split_shop_id)

df_udf_no_schema = shop_sales_weekly_series_df.withColumn(
    "shop_id_splits",
    split_shop_id_udf(f.col("shop_id"))
)
df_udf_no_schema.show(truncate=False)
df_udf_no_schema.printSchema()

# Second variant: declare the result as a struct with a string field "s" and an integer field "i".
schema = StructType([
    StructField("s", StringType()),
    StructField("i", IntegerType()),
])
udf_with_schema = f.udf(split_shop_id, schema)

df = df_udf_no_schema.withColumn(
    "shop_id_splits_with_schema",
    udf_with_schema(f.col("shop_id"))
)
df.show(truncate=False)
df.printSchema()

# How to get each result element as a separate column?
# Expand the struct with ".*" and drop the struct column itself.
df_split_shop_id = (
    df
    .select("*", "shop_id_splits_with_schema.*")
    .drop("shop_id_splits_with_schema")
)
df_split_shop_id.show()
df_split_shop_id.printSchema()

# The solution above invokes the UDF as many times as there are new columns created -
# it's a pySpark bug fixed in Spark 2.3
# https://issues.apache.org/jira/browse/SPARK-17728
# Workaround: wrap the UDF result in a one-element array and explode it before expanding the struct.
df_split_shop_id_correct = (
    df_udf_no_schema
    .withColumn("shop_id_splits_with_schema", f.explode(f.array(udf_with_schema(f.col("shop_id")))))
    .select("*", "shop_id_splits_with_schema.*")
    .drop("shop_id_splits_with_schema")
)
df_split_shop_id_correct.show()
df_split_shop_id_correct.printSchema()

# Compare the physical plans of the two approaches.
df_split_shop_id.explain()
df_split_shop_id_correct.explain()

#ex.10 sort each time series from previous part in descending order and compare to initial ts (tip: use sorted method)
from pyspark.sql.types import FloatType, ArrayType

def sort_ts(ts):
    """Return the weekly time series sorted from the newest to the oldest entry.

    Args:
        ts: iterable of row-like objects exposing ``year`` and ``week``
            attributes, or ``None``.

    Returns:
        A new list ordered by ``year`` descending, then ``week`` descending;
        ``None`` when ``ts`` is ``None``.
    """
    if ts is None:
        return None
    # Year must be the primary key: ordering by week first would place week 52 of an
    # earlier year ahead of week 1 of a later year.
    return sorted(ts, key=lambda row: (row.year, row.week), reverse=True)

# Imported here because the imports earlier in this cell do not bring in DoubleType.
from pyspark.sql.types import DoubleType

# The declared element type mirrors the sales_ts struct (integer year and week, double
# sales). Declaring sales as FloatType would narrow the double values to 32-bit floats.
sort_ts_udf = f.udf(sort_ts, ArrayType(StructType(
            [StructField("year", IntegerType()),
             StructField("week", IntegerType()),
             StructField("sales", DoubleType())])))

# Add the sorted series next to the original one so the two can be compared.
sorted_ts_df = wrongly_sorted_series_df.withColumn("sorted_ts", sort_ts_udf(f.col("sales_ts")))

sorted_ts_df.show()


+-------+--------------------+--------------------+
|shop_id|            sales_ts|  sales_ts_after_udf|
+-------+--------------------+--------------------+
| SHOP_2|[[2018,5,50.0], [...|AFTER_UDF_[Row(ye...|
| SHOP_1|[[2018,5,500.0], ...|AFTER_UDF_[Row(ye...|
+-------+--------------------+--------------------+

root
 |-- shop_id: string (nullable = true)
 |-- sales_ts: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- year: integer (nullable = true)
 |    |    |-- week: integer (nullable = true)
 |    |    |-- sales: double (nullable = true)
 |-- sales_ts_after_udf: string (nullable = true)

+----------------+
| my_udf(shop_id)|
+----------------+
|AFTER_UDF_SHOP_1|
|AFTER_UDF_SHOP_1|
|AFTER_UDF_SHOP_1|
|AFTER_UDF_SHOP_2|
|AFTER_UDF_SHOP_2|
+----------------+

+-------+--------------------------------+----------------------------+
|shop_id|sales_ts                        |shop_id_splits              |
+-------+--------------------------------+------

+-------+--------------------+--------------------+
|shop_id|            sales_ts|           sorted_ts|
+-------+--------------------+--------------------+
| SHOP_2|[[2018,5,50.0], [...|[[2018,6,100.0], ...|
| SHOP_1|[[2018,5,500.0], ...|[[2018,6,200.0], ...|
+-------+--------------------+--------------------+

