In [None]:
# Install required Python packages (only if not already installed)
!pip install pyspark pandas matplotlib seaborn

In [None]:
from skewbalancer import ValueSkewBalancer, auto_balance_skew

In [None]:
from pyspark.sql import SparkSession

# Obtain a SparkSession for this notebook: getOrCreate() returns the active
# session when one exists, otherwise it builds a new one with these settings.
spark = (
    SparkSession.builder
    .appName("SkewBalancer Test")
    .master("local[*]")  # local mode, using all available cores
    .getOrCreate()
)

In [None]:
# Replace this path with your own CSV
input_file = r""

# Fail fast with an actionable message: the placeholder above is empty, and an
# empty path would otherwise only surface as an opaque error from the reader.
if not input_file.strip():
    raise ValueError(
        "input_file is empty: set it to the path of your CSV file before running this cell."
    )

# header=True takes column names from the first row; inferSchema=True asks
# Spark to infer column types from the data instead of reading all as strings.
df = spark.read.csv(input_file, header=True, inferSchema=True)

# Show schema and preview
df.printSchema()
df.show(5)

In [None]:
# Hand the raw DataFrame to SkewBalancer's one-call entry point, which is meant
# to detect skew and then salt + repartition the data.
df_balanced = auto_balance_skew(
    df,
    output_dir="outputs",
    partitions=8,
    verbose=True,
)

In [None]:
# Preview the first five values of the salted key column
salted_preview = df_balanced.select("salted_key")
salted_preview.show(5)

# Report partition sizes for the balanced DataFrame
ValueSkewBalancer.show_partition_sizes(df_balanced, label="Salted Result")

In [None]:
# Column to aggregate on: set this to the column reported to you earlier
groupby_col: str = "type"

# Log file paths handed to log_explain (o = Original, s = Salted)
o_log_output_dir: str = r"outputs/logs/original-plan.txt"
s_log_output_dir: str = r"outputs/logs/salted-plan.txt"


def _count_by_group(frame):
    """Return the lazy ``groupBy(groupby_col).count()`` aggregation of ``frame``."""
    return frame.groupBy(groupby_col).count()


# Baseline: time the aggregation on the original DataFrame, then pass the same
# aggregation to log_explain together with its log path.
print("[Original Plan]")
ValueSkewBalancer.timeit(lambda: _count_by_group(df).show())
ValueSkewBalancer.log_explain(_count_by_group(df), o_log_output_dir)

# Same two steps on the balanced DataFrame, for comparison.
print("[Salted Plan]")
ValueSkewBalancer.timeit(lambda: _count_by_group(df_balanced).show())
ValueSkewBalancer.log_explain(_count_by_group(df_balanced), s_log_output_dir)

In [None]:
from IPython.display import Image, display

# Render the saved plots from the outputs folder, one after another
for plot_file in (
    "outputs/z_score_comparison_baths.png",
    "outputs/box_plot_baths.png",
    "outputs/histogram_baths.png",
):
    display(Image(filename=plot_file))