<h2> Imports & Configuration </h2>

In [0]:
import time
import warnings

from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

# Silence all Python warnings for the rest of the session. One blanket filter is
# enough: filterwarnings('ignore') without a message/module pattern registers
# exactly the same filter entry as simplefilter('ignore').
warnings.simplefilter('ignore')

In [0]:
from pyspark.sql.types import IntegerType
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

In [0]:
# Start (or reuse) a local Spark session; "local[4]" runs Spark in-process
# with four worker threads.
spark = (
    SparkSession.builder
    .master("local[4]")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext and limit log output to errors.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [0]:
# Optional: uncomment to force a small, fixed number of shuffle partitions.
# spark.conf.set("spark.sql.shuffle.partitions", "3")

# Turn off Adaptive Query Execution for this session.
# NOTE(review): presumably so Spark does not re-optimize plans at runtime and
# the skew examined below stays visible -- confirm intent.
spark.conf.set("spark.sql.adaptive.enabled", "false")

<h2> Simulating Uniform Dataset </h2>

In [0]:
# Single-column ("id") frame holding the integers 0 .. 999,999.
df_uniform = spark.range(0, 1_000_000)
df_uniform.show(n=3, truncate=False)

In [0]:
# Tag every row with the id of the partition it lives in, then count rows
# per partition.
uniform_partition_counts = (
    df_uniform
    .withColumn("partition", F.spark_partition_id())
    .groupBy("partition")
    .count()
)
uniform_partition_counts.orderBy("partition").show()

<h2> Skewed Dataset </h2>

In [0]:
# Three inputs, each forced into a single partition: one with 1,000,000 rows
# and two with only 10 rows each.
df0 = spark.range(0, 1_000_000).repartition(1)
df1 = spark.range(0, 10).repartition(1)
df2 = spark.range(0, 10).repartition(1)

# Stack the three frames; the per-partition count in the next cell shows how
# the rows end up distributed.
df_skew = (
    df0
    .union(df1)
    .union(df2)
)
df_skew.show(n=3, truncate=False)

In [0]:
# Same per-partition row count as for the uniform frame, now on the skewed one.
skew_partition_counts = (
    df_skew
    .withColumn("partition", F.spark_partition_id())
    .groupBy("partition")
    .count()
)
skew_partition_counts.orderBy("partition").show()

<h2> Join Skew </h2>

In [0]:
# Transactions input (Parquet, path relative to the notebook's working directory).
transactions_file = "../../data/data_skew/transactions.parquet"
df_transactions = spark.read.parquet(transactions_file)

# Customers input (Parquet, same data directory).
customer_file = "../../data/data_skew/customers.parquet"
df_customers = spark.read.parquet(customer_file)

In [0]:
# Print the transactions schema, then the first five rows without truncating values.
df_transactions.printSchema()
df_transactions.show(n=5, truncate=False)

In [0]:
# Print the customers schema, then the first five rows without truncating values.
df_customers.printSchema()
df_customers.show(n=5, truncate=False)

In [0]:
# Number of distinct transactions per customer; the five largest counts show
# whether a few cust_id values dominate the transactions table.
txn_count_per_customer = df_transactions.groupBy("cust_id").agg(
    F.countDistinct("txn_id").alias("ct")
)
txn_count_per_customer.orderBy(F.col("ct").desc()).show(n=5, truncate=False)

In [0]:
# A threshold of -1 disables automatic broadcast joins.
# NOTE(review): presumably so the join below is executed as a shuffle-based
# join on cust_id rather than a broadcast of the customers table -- confirm.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

In [0]:
# Inner join of transactions with customers on cust_id. This only defines the
# DataFrame; nothing is executed until an action is called on it.
df_txn_details = df_transactions.join(df_customers, on="cust_id", how="inner")

In [0]:
# Time the join: count() is the action that actually runs it.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() is wall-clock time and can jump if the system
# clock is adjusted while the job is running.
start_time = time.perf_counter()
df_txn_details.count()
print(f"time taken: {time.perf_counter() - start_time}")

In [0]:
# Uncomment to stop the Spark session once you are done with the notebook.
# spark.stop()