In [1]:
from pyspark.sql import SparkSession

In [2]:
# Reuse the active SparkSession if one exists; otherwise build one with the
# builder's default settings.
spark = SparkSession.builder.getOrCreate()

In [4]:
# header=True takes the column names from the first line of the file.
# inferSchema=True lets Spark derive column types from the data; without it
# every column (including the numeric-looking sale_amt) is loaded as a string.
df = spark.read.csv(
    "../Dataset/skewed_sales_dataset.csv",
    header=True,
    inferSchema=True,
)

In [5]:
# Peek at the first 5 rows to check that the header became the column names.
df.show(5)

+-------+----------+--------+
|user_id|product_id|sale_amt|
+-------+----------+--------+
|   u231|     p0390|     234|
|   u759|     p0310|     354|
|   u001|     p0229|     311|
|   u553|     p0103|     352|
|   u965|     p0493|     357|
+-------+----------+--------+
only showing top 5 rows


In [6]:
# Total number of rows in the sales data.
df.count()

1000000

In [19]:
from pyspark.sql.functions import col, concat, lit, rand, floor, explode, array, concat_ws

In [8]:
# Rows per user_id, largest groups first, to reveal any dominant (skewed) key.
(
    df.groupBy("user_id")
    .count()
    .sort("count", ascending=False)
    .show()
)

+-------+------+
|user_id| count|
+-------+------+
|   u001|500000|
|   u734|   576|
|   u656|   571|
|   u925|   563|
|   u937|   560|
|   u726|   558|
|   u134|   557|
|   u286|   556|
|   u795|   555|
|   u899|   551|
|   u850|   551|
|   u625|   551|
|   u377|   550|
|   u163|   549|
|   u571|   549|
|   u038|   549|
|   u515|   548|
|   u947|   548|
|   u020|   548|
|   u388|   548|
+-------+------+
only showing top 20 rows


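The data is heavily skewed: user u001 accounts for 500,000 of the 1,000,000 records (50%), while the other users shown have fewer than 600 records each.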

In [9]:
# One row per distinct user_id found in the sales data.
user_df = df.select("user_id").dropDuplicates()

In [10]:
# Spot-check a few of the distinct user ids.
user_df.show(5)

+-------+
|user_id|
+-------+
|   u897|
|   u633|
|   u483|
|   u631|
|   u403|
+-------+
only showing top 5 rows


In [11]:
# Derive a display name for each user by prefixing the id with "Salesman ".
user_df = user_df.withColumn(
    "Name",
    concat(lit("Salesman "), col("user_id")),
)

In [12]:
# Check the derived Name column (with no argument, show() prints the top 20 rows).
user_df.show()

+-------+-------------+
|user_id|         Name|
+-------+-------------+
|   u897|Salesman u897|
|   u633|Salesman u633|
|   u483|Salesman u483|
|   u631|Salesman u631|
|   u403|Salesman u403|
|   u311|Salesman u311|
|   u462|Salesman u462|
|   u357|Salesman u357|
|   u716|Salesman u716|
|   u130|Salesman u130|
|   u336|Salesman u336|
|   u500|Salesman u500|
|   u416|Salesman u416|
|   u077|Salesman u077|
|   u333|Salesman u333|
|   u533|Salesman u533|
|   u205|Salesman u205|
|   u316|Salesman u316|
|   u042|Salesman u042|
|   u455|Salesman u455|
+-------+-------------+
only showing top 20 rows


In [13]:
# Number of distinct users.
user_df.count()

999

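A broadcast join might not be feasible, depending on the data volume. (Here user_df has only 999 rows, so it could be broadcast; the concern applies when the lookup table is too large for that.) We therefore optimize the join by using salting.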

Salting

In [16]:
# Number of distinct salt values (0 .. SALT_BUCKETS - 1) assigned to rows.
SALT_BUCKETS = 5

# rand() yields a value in [0, 1) per row, so flooring rand() * SALT_BUCKETS
# gives a random integer bucket that is appended as the "salt" column.
df_large_salted = df.select(
    "*",
    floor(rand() * SALT_BUCKETS).alias("salt"),
)

In [17]:
# Preview the new salt column alongside the original columns.
df_large_salted.show(5)

+-------+----------+--------+----+
|user_id|product_id|sale_amt|salt|
+-------+----------+--------+----+
|   u231|     p0390|     234|   3|
|   u759|     p0310|     354|   2|
|   u001|     p0229|     311|   2|
|   u553|     p0103|     352|   0|
|   u965|     p0493|     357|   2|
+-------+----------+--------+----+
only showing top 5 rows


In [21]:
# Register the salted DataFrame under the name "TempView" (replacing any
# existing view of that name) so it can be queried through spark.sql.
df_large_salted.createOrReplaceTempView("TempView")

In [23]:
# Read every column and row back from the registered view and print the top 20.
spark.table("TempView").show()

+-------+----------+--------+----+
|user_id|product_id|sale_amt|salt|
+-------+----------+--------+----+
|   u231|     p0390|     234|   3|
|   u759|     p0310|     354|   2|
|   u001|     p0229|     311|   2|
|   u553|     p0103|     352|   0|
|   u965|     p0493|     357|   2|
|   u001|     p0367|     178|   4|
|   u001|     p0377|      92|   2|
|   u667|     p0339|     440|   4|
|   u001|     p0083|      84|   4|
|   u813|     p0308|      51|   0|
|   u001|     p0406|      51|   4|
|   u001|     p0383|     461|   1|
|   u001|     p0303|      66|   1|
|   u222|     p0412|     123|   4|
|   u176|     p0361|     294|   3|
|   u345|     p0241|     432|   1|
|   u001|     p0484|      18|   3|
|   u001|     p0003|      99|   4|
|   u101|     p0492|     413|   1|
|   u001|     p0345|     119|   4|
+-------+----------+--------+----+
only showing top 20 rows


In [24]:
# Stop the SparkSession now that the notebook is finished; cells run after
# this point need a new session.
spark.stop()