In [2]:
from pyspark.sql import SparkSession

def _build_profiling_session():
    """Create (or reuse) the SparkSession used by this notebook.

    Each setting is applied through ``builder.config`` in the order listed;
    ``getOrCreate`` hands back an already-running session when there is one.
    """
    settings = [
        # Executor/driver configs
        ("spark.executor.memory", "2g"),
        ("spark.driver.memory", "2g"),
        ("spark.executor.cores", "2"),
        # SQL engine: shuffle width and adaptive query execution
        ("spark.sql.shuffle.partitions", "8"),
        ("spark.sql.adaptive.enabled", "true"),
        ("spark.sql.adaptive.skewJoin.enabled", "true"),
        ("spark.sql.adaptive.coalescePartitions.enabled", "true"),
        # File formats
        ("spark.sql.parquet.compression.codec", "snappy"),
        ("spark.sql.orc.impl", "native"),
        # Broadcast timeout and Arrow-based pandas conversion
        ("spark.sql.broadcastTimeout", "600"),
        ("spark.sql.execution.arrow.pyspark.enabled", "true"),
    ]

    builder = SparkSession.builder.appName("DataProfilingAndQualityPipeline")
    for key, value in settings:
        builder = builder.config(key, value)
    return builder.getOrCreate()


spark = _build_profiling_session()
spark

In [3]:
import re
from pyspark.sql import functions as F

# Source CSV; read by both the header scan and the DataFrame load later in this cell.
input_path = "/opt/data/ncr_ride_bookings.csv"
# Parquet destination. NOTE(review): not referenced by any visible cell — confirm it is used later.
output_path = "/data/processed/output.parquet"

def cleanColumnName(col_name):
    """Turn a raw header into a lower-case, underscore-separated identifier.

    Steps, in order: trim surrounding whitespace; replace each run of dots,
    whitespace or hyphens with one underscore; drop every character that is
    not an ASCII letter, digit or underscore; lower-case; remove leading and
    trailing underscores; collapse repeated underscores.

    Returns an empty string when nothing survives the filtering.
    """
    separated = re.sub(r"[.\s\-]+", "_", col_name.strip())
    word_chars_only = re.sub(r"[^0-9a-zA-Z_]", "", separated).lower()
    trimmed = re.sub(r"^_+|_+$", "", word_chars_only)
    return re.sub(r"_+", "_", trimmed)

# Let Spark's CSV parser read the header row instead of splitting the first
# line on "," by hand: a quoted header field that contains a comma would be
# split into extra names, and toDF() would then fail on the column-count
# mismatch. This also avoids a separate pass over the file for one line.
raw_df = spark.read.csv(input_path, header=True, inferSchema=True)

# Kept under the original names in case other cells refer to them.
header = raw_df.columns
cleaned_headers = [cleanColumnName(h) for h in header]

df = raw_df.toDF(*cleaned_headers)


In [3]:
from pyspark.sql import functions as F
from pyspark.sql import types as T

input_path = "/opt/data/ncr_ride_bookings.csv"

# 1. Load the CSV with the first line as header and schema inference off,
#    so every column arrives as a string.
df = (
    spark.read
        .option("header", True)
        .option("inferSchema", False)
        .csv(input_path)
)

# 2. Clean headers
def clean_col(colname: str) -> str:
    """Normalise a raw header: trim it, lower-case it, and map each space or dot to "_"."""
    separators = str.maketrans({" ": "_", ".": "_"})
    return colname.strip().lower().translate(separators)

df = df.toDF(*[clean_col(c) for c in df.columns])

# 3. Clean string values (remove triple/double quotes + whitespace)
# All columns are rewritten in a single select(): calling withColumn() twice
# per column inside a loop grows the query plan with every column, which slows
# planning on wide files. The per-column expression is unchanged: strip
# leading/trailing double quotes, then trim surrounding whitespace.
df = df.select(*[
    F.trim(F.regexp_replace(F.col(c), '^"+|"+$', '')).alias(c)
    for c in df.columns
])

# The profiling steps that follow scan this frame several times per column;
# caching avoids re-reading and re-cleaning the CSV on every one of those jobs.
df = df.cache()

row_count = df.count()

# --- Detect numeric vs categorical ---
# A column is treated as numeric when more than 90% of its non-null values
# can be cast to double; everything else is categorical.
numeric_cols, categorical_cols = [], []

if df.columns:
    # One aggregation for the whole frame instead of two count() jobs per
    # column. F.count(expr) counts non-null values, so the pair below gives
    # "non-null values" and "values that survive the cast" for each column.
    # Working on expressions also removes the scratch column "tmp", which
    # would have overwritten a real column of that name.
    # Aliases are index-based so they cannot collide with data column names.
    cast_probe = df.agg(*[
        probe
        for i, c in enumerate(df.columns)
        for probe in (
            F.count(F.col(c)).alias(f"non_null_{i}"),
            F.count(F.col(c).cast("double")).alias(f"castable_{i}"),
        )
    ]).first()

    for i, c in enumerate(df.columns):
        non_nulls = cast_probe[2 * i]
        cast_success = cast_probe[2 * i + 1]
        if non_nulls > 0 and (cast_success / non_nulls) > 0.9:
            numeric_cols.append(c)
        else:
            categorical_cols.append(c)

# --- Profiling Report ---
# One tuple per column, in the field order of `report_schema` below.
report_rows = []

for c in df.columns:
    # Nulls and blank strings both count as missing.
    null_count = df.filter(F.col(c).isNull() | (F.trim(F.col(c)) == "")).count()
    null_pct = round((null_count / row_count) * 100, 2) if row_count else None
    distinct_count = df.select(c).distinct().count()

    # Skew ratio = distinct values / rows (a cardinality ratio), bucketed
    # into low (< 0.1), mid (< 0.5) and high.
    skew_ratio = round(distinct_count / row_count, 2) if row_count else None
    if skew_ratio is None:
        skew_level = "unknown"
    elif skew_ratio < 0.1:
        skew_level = "low"
    elif skew_ratio < 0.5:
        skew_level = "mid"
    else:
        skew_level = "high"

    # Defaults
    min_val = max_val = mean_val = stddev_val = None
    percentiles = None
    outliers = None
    top_values = None
    dtype = "numeric" if c in numeric_cols else "categorical"

    if c in numeric_cols:
        as_double = F.col(c).cast("double")

        stats = df.select(
            F.min(as_double).alias("min"),
            F.max(as_double).alias("max"),
            F.mean(as_double).alias("mean"),
            F.stddev(as_double).alias("stddev")
        ).collect()[0]
        min_val, max_val, mean_val, stddev_val = stats

        # Approximate 25/50/75/95/99th percentiles (relative error 0.01),
        # computed over the values that cast successfully.
        percentiles = df.select(as_double.alias(c)) \
            .na.drop() \
            .approxQuantile(c, [0.25, 0.5, 0.75, 0.95, 0.99], 0.01)

        # Outliers = values beyond mean ± 3*stddev
        if mean_val is not None and stddev_val is not None:
            outliers = df.filter(
                (as_double > mean_val + 3 * stddev_val) |
                (as_double < mean_val - 3 * stddev_val)
            ).count()

    elif c in categorical_cols:
        # Three most frequent values with their row counts.
        top_vals = df.groupBy(c).count().orderBy(F.desc("count")).limit(3).collect()
        top_values = [(row[c], row["count"]) for row in top_vals]

    report_rows.append((
        c, dtype, null_count, null_pct, distinct_count,
        skew_ratio, skew_level, min_val, max_val, mean_val, stddev_val,
        # Keep "not computed" as a real null; str(None) would store the
        # literal text "None" in the report.
        str(percentiles) if percentiles is not None else None,
        outliers,
        str(top_values) if top_values is not None else None,
    ))

# 5. Convert to Spark DataFrame
# The schema is spelled out because inference cannot type a column whose
# values are all None (e.g. min_val/max_val when no column is numeric) and
# createDataFrame raises in that case.
report_schema = T.StructType([
    T.StructField("column_name", T.StringType(), True),
    T.StructField("data_type", T.StringType(), True),
    T.StructField("null_count", T.LongType(), True),
    T.StructField("null_pct", T.DoubleType(), True),
    T.StructField("distinct_count", T.LongType(), True),
    T.StructField("skew_ratio", T.DoubleType(), True),
    T.StructField("skew_level", T.StringType(), True),
    T.StructField("min_val", T.DoubleType(), True),
    T.StructField("max_val", T.DoubleType(), True),
    T.StructField("mean_val", T.DoubleType(), True),
    T.StructField("stddev_val", T.DoubleType(), True),
    T.StructField("percentiles", T.StringType(), True),
    T.StructField("outliers", T.LongType(), True),
    T.StructField("top_values", T.StringType(), True),
])

report_df = spark.createDataFrame(report_rows, schema=report_schema)

report_df.show(truncate=False)


+---------------------------------+-----------+----------+--------+--------------+----------+----------+-------+-------+-----------------+------------------+----------------------------+--------+----------------------------------------------------------------------------------------------+
|column_name                      |data_type  |null_count|null_pct|distinct_count|skew_ratio|skew_level|min_val|max_val|mean_val         |stddev_val        |percentiles                 |outliers|top_values                                                                                    |
+---------------------------------+-----------+----------+--------+--------------+----------+----------+-------+-------+-----------------+------------------+----------------------------+--------+----------------------------------------------------------------------------------------------+
|date                             |categorical|0         |0.0     |365           |0.0       |low       |NULL   |NULL   |NULL   