In [None]:
# Install required Python packages (only if not already installed)
!pip install pyspark pandas matplotlib seaborn

In [None]:
from skewbalancer import ValueSkewBalancer, auto_balance_skew

In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session that runs locally; "local[*]" is the
# master URL passed to the builder.
spark = (
    SparkSession.builder
    .appName("SkewBalancer Test")
    .master("local[*]")
    .getOrCreate()
)

In [None]:
# Point this at your own CSV file before running the cell.
input_file = r"YOUR FILE HERE"

# Read the CSV, treating the first row as the header and letting Spark
# infer the column types.
df = spark.read.options(header=True, inferSchema=True).csv(input_file)

# Inspect the inferred schema and the first five rows.
df.printSchema()
df.show(n=5)

In [None]:
# Run skewbalancer's automatic pipeline on the loaded DataFrame.
# NOTE(review): based on the original comment this detects skew and applies
# salting + repartitioning; exact behavior lives in `auto_balance_skew`.
df_balanced = auto_balance_skew(
    df,
    output_dir="outputs",
    partitions=8,
    verbose=True,
)

In [None]:
# Preview the first five values of the `salted_key` column.
df_balanced.select("salted_key").show(n=5)

# Report partition sizes for the balanced DataFrame under the given label.
ValueSkewBalancer.show_partition_sizes(
    df_balanced,
    label="Salted Result",
)

In [None]:
# Manual timing and explain: run the same group-by count on the original and
# on the salted DataFrame so the two can be compared.
GROUP_COLUMN = "Product Position"  # change to a column present in your CSV


def profile_group_count(frame, label, plan_path, column=GROUP_COLUMN):
    """Time a group-by count on ``frame`` and log its query plan.

    Args:
        frame: Spark DataFrame to aggregate.
        label: Heading printed (in square brackets) before the run.
        plan_path: Path handed to ``ValueSkewBalancer.log_explain``
            (presumably where the plan text is written — confirm).
        column: Column to group by; defaults to ``GROUP_COLUMN``.
    """
    print(f"[{label}]")
    # The query is built inside the lambda so that whatever `timeit` measures
    # covers the same work as before: build the aggregation, then show it.
    ValueSkewBalancer.timeit(lambda: frame.groupBy(column).count().show())
    ValueSkewBalancer.log_explain(frame.groupBy(column).count(), plan_path)


profile_group_count(df, "Original Plan", "outputs/original_plan.txt")

profile_group_count(df_balanced, "Salted Plan", "outputs/salted_plan.txt")

In [None]:
from pathlib import Path

from IPython.display import Image, display

# Display generated visualizations.
OUTPUT_DIR = Path("outputs")  # matches output_dir given to auto_balance_skew
PLOT_COLUMN = "Revenue"  # column whose comparison plots should be shown
PLOT_KINDS = ("z_score", "box_plot", "histogram")

for plot_kind in PLOT_KINDS:
    plot_path = OUTPUT_DIR / f"{plot_kind}_comparison_{PLOT_COLUMN}.png"
    if plot_path.is_file():
        display(Image(filename=str(plot_path)))
    else:
        # Report the missing file and keep going, so one absent plot does
        # not prevent the remaining ones from being displayed.
        print(f"[Missing plot] {plot_path}")