In [1]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.window import Window

# Build (or reuse) a Spark session against the local[*] master.
spark = (
    SparkSession.builder
    .appName("CSV Reads Optimization")
    .master("local[*]")
    .getOrCreate()
)

# Bare expression so the notebook renders the session summary.
spark

In [2]:
# Read the bookings CSV with a header row and no schema inference,
# so every column is loaded as a string.
input_path = "/opt/data/ncr_ride_bookings.csv"
df = spark.read.options(header=True, inferSchema=False).csv(input_path)

# Peek at the first row to inspect raw column names and value formatting.
df.head()

Row(Date='2024-03-23', Time='12:29:38', Booking ID='"""CNR5884300"""', Booking Status='No Driver Found', Customer ID='"""CID1982111"""', Vehicle Type='eBike', Pickup Location='Palam Vihar', Drop Location='Jhilmil', Avg VTAT='null', Avg CTAT='null', Cancelled Rides by Customer='null', Reason for cancelling by Customer='null', Cancelled Rides by Driver='null', Driver Cancellation Reason='null', Incomplete Rides='null', Incomplete Rides Reason='null', Booking Value='null', Ride Distance='null', Driver Ratings='null', Customer Rating='null', Payment Method='null')

In [4]:
# Reload the raw CSV (header row, all columns as strings) so this cell
# starts from untouched data.
input_path = "/opt/data/ncr_ride_bookings.csv"
df = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=False)
    .load(input_path)
)

def clean_col(colname: str) -> str:
    """Return a normalised column name.

    The name is stripped of surrounding whitespace, lower-cased, and every
    space or dot is replaced with an underscore.
    """
    separators = str.maketrans({" ": "_", ".": "_"})
    return colname.strip().lower().translate(separators)
    
# Normalise the header names (lower-case, underscores instead of spaces/dots).
df = df.toDF(*[clean_col(c) for c in df.columns])

# Clean every value in a single projection:
#   1. strip leading/trailing double quotes (e.g. '"""CNR5884300"""'),
#   2. trim surrounding whitespace,
#   3. turn placeholder tokens into real NULLs. The raw file stores missing
#      values as the literal string 'null'; left as-is they would be counted
#      as valid values by every statistic computed from this frame.
cleaned_cols = []
for c in df.columns:
    stripped = F.trim(F.regexp_replace(F.col(c), '^"+|"+$', ''))
    cleaned_cols.append(
        F.when(F.lower(stripped).isin("", "null"), F.lit(None).cast("string"))
        .otherwise(stripped)
        .alias(c)
    )
df = df.select(cleaned_cols)

# Cache the cleaned frame; count() is the first action that runs on it.
df.cache()
row_count = df.count()

# For every column, count its non-null values and how many of them
# survive a cast to double. Everything is computed in one aggregation.
exprs = []
for name in df.columns:
    parses_as_double = F.col(name).cast("double").isNotNull()
    exprs.extend([
        F.count(F.col(name)).alias(f"{name}_non_nulls"),
        F.sum(F.when(parses_as_double, 1).otherwise(0)).alias(f"{name}_cast_success"),
    ])

stats = df.agg(*exprs).collect()[0]

# A column is treated as numeric when it has at least one non-null value and
# more than 90% of its non-null values cast to double; otherwise categorical.
numeric_cols, categorical_cols = [], []
for name in df.columns:
    present = stats[f"{name}_non_nulls"]
    castable = stats[f"{name}_cast_success"]
    is_numeric = present > 0 and (castable / present) > 0.9
    (numeric_cols if is_numeric else categorical_cols).append(name)

# ---------------------------------------------------
# 4. Batch null & distinct stats
# ---------------------------------------------------
# One aggregation yields, per column, the number of missing values
# (NULL or blank after trimming) and an approximate distinct count.
agg_exprs = []
for name in df.columns:
    value = F.col(name)
    is_missing = value.isNull() | (F.trim(value) == "")
    agg_exprs += [
        F.sum(F.when(is_missing, 1).otherwise(0)).alias(f"{name}_nulls"),
        F.approx_count_distinct(name).alias(f"{name}_distinct"),
    ]

summary_row = df.agg(*agg_exprs).collect()[0]
nulls_distinct = summary_row.asDict()

# ---------------------------------------------------
# 5. Numeric statistics (pre-cast once)
# ---------------------------------------------------
# Project only the numeric columns, cast to double once, so every later
# statistic works on typed values.
df_numeric = df.select([F.col(c).cast("double").alias(c) for c in numeric_cols])

# min / max / mean / stddev for every numeric column, in one aggregation.
num_exprs = []
for c in numeric_cols:
    num_exprs += [
        F.min(c).alias(f"{c}_min"),
        F.max(c).alias(f"{c}_max"),
        F.mean(c).alias(f"{c}_mean"),
        F.stddev(c).alias(f"{c}_stddev")
    ]

# agg() needs at least one expression, so skip the job entirely when no
# numeric column was detected and fall back to an empty stats dict.
numeric_stats = df_numeric.agg(*num_exprs).collect()[0].asDict() if num_exprs else {}

# ---------------------------------------------------
# 6. Percentiles
# ---------------------------------------------------
# approxQuantile is called once for all numeric columns. It skips nulls per
# column, so no na.drop() is needed; a blanket na.drop() would discard every
# row that has a null in *any* numeric column and bias each column's
# percentiles towards the fully populated rows.
percentiles_dict = {}
if numeric_cols:
    quantiles_per_col = df_numeric.approxQuantile(
        numeric_cols, [0.25, 0.5, 0.75, 0.95, 0.99], 0.01
    )
    # The result is one list of quantiles per requested column, in order.
    percentiles_dict = dict(zip(numeric_cols, quantiles_per_col))

# ---------------------------------------------------
# Top categorical values (single pass using stack)
# ---------------------------------------------------
# stack() unpivots all categorical columns into (column_name, value) pairs so
# the frequency of every value in every column is counted in one groupBy.
top_values = {}
if categorical_cols:
    # Quote identifiers with backticks and escape the string literals so
    # column names containing special characters do not break the SQL.
    stack_args = ", ".join(
        "'{0}', `{1}`".format(
            c.replace("\\", "\\\\").replace("'", "\\'"),
            c.replace("`", "``"),
        )
        for c in categorical_cols
    )
    stack_expr = F.expr(
        "stack({0}, {1}) as (column_name, value)".format(
            len(categorical_cols), stack_args
        )
    )

    cat_df = df.select(stack_expr)
    freq_df = cat_df.groupBy("column_name", "value").count()

    # Rank by frequency; break ties on the value itself so the top 3 is
    # deterministic from run to run.
    w = Window.partitionBy("column_name").orderBy(
        F.desc("count"), F.col("value").asc_nulls_last()
    )
    top_values_df = (
        freq_df.withColumn("rank", F.row_number().over(w))
        .filter(F.col("rank") <= 3)
        # collect() gives no ordering guarantee on its own; sort explicitly so
        # each column's list is in rank order (most frequent value first).
        .orderBy("column_name", "rank")
    )

    for r in top_values_df.collect():
        top_values.setdefault(r["column_name"], []).append(
            (r["value"], r["count"])
        )

# ---------------------------------------------------
# Build final profiling report
# ---------------------------------------------------
# One tuple per column, combining the statistics gathered above.
report_rows = []

for col_name in df.columns:
    null_count = nulls_distinct[f"{col_name}_nulls"]
    distinct_count = nulls_distinct[f"{col_name}_distinct"]

    # Ratios are undefined for an empty frame.
    if row_count:
        null_pct = round((null_count / row_count) * 100, 2)
        # Distinct values relative to row count (cardinality, not skew).
        cardinality_ratio = round(distinct_count / row_count, 4)
    else:
        null_pct = None
        cardinality_ratio = None

    if cardinality_ratio is None:
        cardinality_level = "unknown"
    else:
        cardinality_level = (
            "low" if cardinality_ratio < 0.1
            else "mid" if cardinality_ratio < 0.5
            else "high"
        )

    is_numeric = col_name in numeric_cols
    dtype = "numeric" if is_numeric else "categorical"

    min_val = max_val = mean_val = stddev_val = None
    percentiles = outlier_risk = top_vals = None

    if is_numeric:
        min_val, max_val, mean_val, stddev_val = (
            numeric_stats.get(f"{col_name}_{stat}")
            for stat in ("min", "max", "mean", "stddev")
        )
        percentiles = percentiles_dict.get(col_name)

        # Outlier risk heuristic: HIGH when the standard deviation exceeds
        # the absolute mean; only evaluated for a positive stddev.
        has_spread = (
            mean_val is not None and stddev_val is not None and stddev_val > 0
        )
        if has_spread:
            outlier_risk = "HIGH" if stddev_val > abs(mean_val) else "LOW"
    else:
        top_vals = top_values.get(col_name)

    # Data quality flag: a high null share takes precedence over low cardinality.
    if null_pct is not None and null_pct > 30:
        quality_flag = "HIGH_NULLS"
    elif cardinality_level == "low":
        quality_flag = "LOW_VARIANCE"
    else:
        quality_flag = "OK"

    report_rows.append((
        col_name, dtype, null_count, null_pct,
        distinct_count, cardinality_ratio, cardinality_level,
        min_val, max_val, mean_val, stddev_val,
        str(percentiles), outlier_risk, str(top_vals),
        quality_flag,
    ))

# ---------------------------------------------------
# 7. Final Spark DataFrame
# ---------------------------------------------------
# The schema is spelled out instead of inferred: inference cannot determine a
# type for a field that is None in every row (e.g. min_val / outlier_risk when
# no numeric column was detected) and fails for an empty row list.
report_schema = ", ".join([
    "column_name string", "data_type string",
    "null_count bigint", "null_pct double",
    "distinct_count bigint", "cardinality_ratio double", "cardinality_level string",
    "min_val double", "max_val double", "mean_val double", "stddev_val double",
    "percentiles string", "outlier_risk string", "top_values string",
    "quality_flag string",
])

report_df = spark.createDataFrame(report_rows, schema=report_schema)

# Print up to 100 report rows without truncating wide cells.
report_df.show(100, truncate=False)


+---------------------------------+-----------+----------+--------+--------------+-----------------+-----------------+-------+-------+-----------------+------------------+----------------------------+------------+----------------------------------------------------------------------------------------------+------------+
|column_name                      |data_type  |null_count|null_pct|distinct_count|cardinality_ratio|cardinality_level|min_val|max_val|mean_val         |stddev_val        |percentiles                 |outlier_risk|top_values                                                                                    |quality_flag|
+---------------------------------+-----------+----------+--------+--------------+-----------------+-----------------+-------+-------+-----------------+------------------+----------------------------+------------+----------------------------------------------------------------------------------------------+------------+
|date                             

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 57556)
Traceback (most recent call last):
  File "/usr/lib/python3.12/socketserver.py", line 318, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python3.12/socketserver.py", line 349, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python3.12/socketserver.py", line 362, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib/python3.12/socketserver.py", line 761, in __init__
    self.handle()
  File "/opt/spark/python/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/opt/spark/python/pyspark/accumulators.py", line 267, in poll
    if self.rfile in r and func():
                           ^^^^^^
  File "/opt/spark/python/pyspark/accumulators.py", line 271, in accum_updates
    num_updates = read_int(self.rfile)
                  ^^

In [3]:
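# NOTE: this cell reuses the `df` left in the kernel by an earlier cell.
# Uncomment the two lines below to reload the raw CSV and make the cell
# self-contained.
# input_path = "/opt/data/ncr_ride_bookings.csv"
# df = spark.read.csv(input_path, header=True, inferSchema=False)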

def clean_col(colname: str) -> str:
    """Normalise a column name.

    Surrounding whitespace is removed, the text is lower-cased, and spaces
    and dots are each replaced by an underscore.
    """
    normalised = colname.strip().lower()
    for separator in (" ", "."):
        normalised = normalised.replace(separator, "_")
    return normalised

# Normalise the header names (lower-case, underscores instead of spaces/dots).
df = df.toDF(*[clean_col(c) for c in df.columns])

# ---------------------------------------------------
# 3. Clean string values (single select -> optimized)
# ---------------------------------------------------
# Per value: strip leading/trailing double quotes, trim whitespace, then turn
# placeholder tokens into real NULLs. The raw file stores missing values as
# the literal string 'null'; left as-is they would be counted as valid values
# by every statistic computed from this frame.
cleaned_cols = []
for c in df.columns:
    stripped = F.trim(F.regexp_replace(F.col(c), '^"+|"+$', ''))
    cleaned_cols.append(
        F.when(F.lower(stripped).isin("", "null"), F.lit(None).cast("string"))
        .otherwise(stripped)
        .alias(c)
    )
df = df.select(cleaned_cols)

# Cache the cleaned frame; count() is the first action that runs on it.
df.cache()
row_count = df.count()

# ---------------------------------------------------
# 4. Detect numeric vs categorical (efficient)
# ---------------------------------------------------
# A single aggregation returns, per column, the non-null count and the
# number of values that cast to double.
exprs = []
for name in df.columns:
    casts_ok = F.col(name).cast("double").isNotNull()
    exprs.append(F.count(F.col(name)).alias(f"{name}_non_nulls"))
    exprs.append(F.sum(F.when(casts_ok, 1).otherwise(0)).alias(f"{name}_cast_success"))

stats = df.agg(*exprs).collect()[0]

# Numeric = at least one non-null value and more than 90% of the non-null
# values cast to double; everything else is categorical.
numeric_cols, categorical_cols = [], []
for name in df.columns:
    present = stats[f"{name}_non_nulls"]
    castable = stats[f"{name}_cast_success"]
    bucket = (
        numeric_cols
        if present > 0 and (castable / present) > 0.9
        else categorical_cols
    )
    bucket.append(name)

# ---------------------------------------------------
# 5. Null & distinct stats (batch)
# ---------------------------------------------------
# Per column: count of missing values (NULL or blank after trimming) and an
# approximate distinct count, all in one aggregation.
agg_exprs = []
for name in df.columns:
    value = F.col(name)
    missing_flag = F.when(value.isNull() | (F.trim(value) == ""), 1).otherwise(0)
    agg_exprs.extend([
        F.sum(missing_flag).alias(f"{name}_nulls"),
        F.approx_count_distinct(name).alias(f"{name}_distinct"),
    ])

summary_row = df.agg(*agg_exprs).collect()[0]
nulls_distinct = summary_row.asDict()

# ---------------------------------------------------
# 6. Numeric statistics (pre-cast once)
# ---------------------------------------------------
# Project only the numeric columns, cast to double once, so every later
# statistic works on typed values.
df_numeric = df.select([F.col(c).cast("double").alias(c) for c in numeric_cols])

# min / max / mean / stddev for every numeric column, in one aggregation.
num_exprs = []
for c in numeric_cols:
    num_exprs += [
        F.min(c).alias(f"{c}_min"),
        F.max(c).alias(f"{c}_max"),
        F.mean(c).alias(f"{c}_mean"),
        F.stddev(c).alias(f"{c}_stddev")
    ]

# agg() needs at least one expression, so skip the job entirely when no
# numeric column was detected and fall back to an empty stats dict.
numeric_stats = df_numeric.agg(*num_exprs).collect()[0].asDict() if num_exprs else {}

# ---------------------------------------------------
# 7. Percentiles (used for skew + outliers)
# ---------------------------------------------------
# approxQuantile is called once for all numeric columns. It skips nulls per
# column, so no na.drop() is needed; a blanket na.drop() would discard every
# row that has a null in *any* numeric column and bias each column's
# percentiles towards the fully populated rows.
percentiles_dict = {}
if numeric_cols:
    quantiles_per_col = df_numeric.approxQuantile(
        numeric_cols, [0.01, 0.25, 0.5, 0.75, 0.99], 0.01
    )
    # The result is one list of quantiles per requested column, in order.
    percentiles_dict = dict(zip(numeric_cols, quantiles_per_col))

# ---------------------------------------------------
# 8. Top categorical values (optimized stack)
# ---------------------------------------------------
# stack() unpivots all categorical columns into (column_name, value) pairs so
# the frequency of every value in every column is counted in one groupBy.
top_values = {}
if categorical_cols:
    # Quote identifiers with backticks and escape the string literals so
    # column names containing special characters do not break the SQL.
    stack_args = ", ".join(
        "'{0}', `{1}`".format(
            c.replace("\\", "\\\\").replace("'", "\\'"),
            c.replace("`", "``"),
        )
        for c in categorical_cols
    )
    stack_expr = F.expr(
        "stack({0}, {1}) as (column_name, value)".format(
            len(categorical_cols), stack_args
        )
    )

    cat_df = df.select(stack_expr)
    freq_df = cat_df.groupBy("column_name", "value").count()

    # Rank by frequency; break ties on the value itself so the top 3 is
    # deterministic from run to run.
    w = Window.partitionBy("column_name").orderBy(
        F.desc("count"), F.col("value").asc_nulls_last()
    )
    top_values_df = (
        freq_df.withColumn("rank", F.row_number().over(w))
        .filter(F.col("rank") <= 3)
        # collect() gives no ordering guarantee on its own; sort explicitly so
        # the first entry of each column's list really is its most frequent
        # value, which code reading top_values[col][0] relies on.
        .orderBy("column_name", "rank")
    )

    for r in top_values_df.collect():
        top_values.setdefault(r["column_name"], []).append(
            (r["value"], r["count"])
        )

# ---------------------------------------------------
# 9. Skew logic
# ---------------------------------------------------
# Maps column name -> (score, label) for both categorical and numeric columns.
skew_info = {}

# ---- Categorical skew (hot key detection)
# Score = share of all rows held by the first entry in the column's top
# values, checked against descending thresholds.
dominance_bands = ((0.9, "HOT_KEY_RISK"), (0.7, "HIGH_SKEW"), (0.3, "SKEWED"))
for cat_col in categorical_cols:
    ranked = top_values.get(cat_col)
    if not ranked:
        continue

    top_count = ranked[0][1]
    if row_count:
        dominance_ratio = round(top_count / row_count, 4)
        skew_label = next(
            (label for floor, label in dominance_bands if dominance_ratio > floor),
            "BALANCED",
        )
    else:
        dominance_ratio, skew_label = None, "unknown"

    skew_info[cat_col] = (dominance_ratio, skew_label)

# ---- Numeric skew (distribution skew)
# Score = (p99 - p50) / (p50 - p01): upper-tail spread relative to
# lower-tail spread around the median.
for num_col in numeric_cols:
    quantiles = percentiles_dict.get(num_col)
    if not quantiles or len(quantiles) != 5:
        continue

    p01, _p25, p50, _p75, p99 = quantiles
    lower_spread = p50 - p01

    if lower_spread != 0:
        skew_score = round((p99 - p50) / lower_spread, 3)
        if skew_score > 1.5:
            skew_label = "RIGHT_SKEWED"
        elif skew_score < 0.7:
            skew_label = "LEFT_SKEWED"
        else:
            skew_label = "SYMMETRIC"
    else:
        # Zero lower spread would divide by zero; no score can be computed.
        skew_score, skew_label = None, "unknown"

    skew_info[num_col] = (skew_score, skew_label)

# ---------------------------------------------------
# 10. Final profiling report
# ---------------------------------------------------
# One tuple per column, combining the statistics gathered above.
report_rows = []

for col_name in df.columns:
    null_count = nulls_distinct[f"{col_name}_nulls"]
    distinct_count = nulls_distinct[f"{col_name}_distinct"]

    # Ratios are undefined for an empty frame.
    if row_count:
        null_pct = round((null_count / row_count) * 100, 2)
        cardinality_ratio = round(distinct_count / row_count, 4)
    else:
        null_pct = None
        cardinality_ratio = None

    if cardinality_ratio is None:
        cardinality_level = "unknown"
    else:
        cardinality_level = (
            "low" if cardinality_ratio < 0.1
            else "mid" if cardinality_ratio < 0.5
            else "high"
        )

    is_numeric = col_name in numeric_cols
    dtype = "numeric" if is_numeric else "categorical"

    min_val = max_val = mean_val = stddev_val = None
    percentiles = outlier_risk = top_vals = None

    # Columns without a skew entry get (None, None).
    skew_score, skew_label = skew_info.get(col_name, (None, None))

    if is_numeric:
        min_val, max_val, mean_val, stddev_val = (
            numeric_stats.get(f"{col_name}_{stat}")
            for stat in ("min", "max", "mean", "stddev")
        )
        percentiles = percentiles_dict.get(col_name)

        # Outlier risk heuristic: HIGH when the standard deviation exceeds
        # the absolute mean.
        if stddev_val is not None and mean_val is not None:
            outlier_risk = "HIGH" if stddev_val > abs(mean_val) else "LOW"
    else:
        top_vals = top_values.get(col_name)

    # Quality flag precedence: high null share, then severe skew, then
    # low cardinality.
    if null_pct is not None and null_pct > 30:
        quality_flag = "HIGH_NULLS"
    elif skew_label in ("HOT_KEY_RISK", "HIGH_SKEW"):
        quality_flag = "SKEW_RISK"
    elif cardinality_level == "low":
        quality_flag = "LOW_VARIANCE"
    else:
        quality_flag = "OK"

    report_rows.append((
        col_name, dtype, null_count, null_pct,
        distinct_count, cardinality_ratio, cardinality_level,
        skew_score, skew_label,
        min_val, max_val, mean_val, stddev_val,
        str(percentiles), outlier_risk, str(top_vals),
        quality_flag,
    ))

# ---------------------------------------------------
# 11. Final Spark DataFrame
# ---------------------------------------------------
# The schema is spelled out instead of inferred: inference cannot determine a
# type for a field that is None in every row (e.g. min_val / outlier_risk when
# no numeric column was detected) and fails for an empty row list.
report_schema = ", ".join([
    "column_name string", "data_type string",
    "null_count bigint", "null_pct double",
    "distinct_count bigint", "cardinality_ratio double", "cardinality_level string",
    "skew_score double", "skew_label string",
    "min_val double", "max_val double", "mean_val double", "stddev_val double",
    "percentiles string", "outlier_risk string", "top_values string",
    "quality_flag string",
])

report_df = spark.createDataFrame(report_rows, schema=report_schema)

# Print up to 100 report rows without truncating wide cells.
report_df.show(100, truncate=False)


+---------------------------------+-----------+----------+--------+--------------+-----------------+-----------------+----------+------------+-------+-------+-----------------+------------------+---------------------------+------------+----------------------------------------------------------------------------------------------+------------+
|column_name                      |data_type  |null_count|null_pct|distinct_count|cardinality_ratio|cardinality_level|skew_score|skew_label  |min_val|max_val|mean_val         |stddev_val        |percentiles                |outlier_risk|top_values                                                                                    |quality_flag|
+---------------------------------+-----------+----------+--------+--------------+-----------------+-----------------+----------+------------+-------+-------+-----------------+------------------+---------------------------+------------+--------------------------------------------------------------------------