In [1]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
        .appName("DataProfilingAndQualityPipeline")
        # Executor/driver configs
        .config("spark.executor.memory", "2g")
        .config("spark.driver.memory", "2g")
        .config("spark.executor.cores", "2")
        .config("spark.sql.shuffle.partitions", "8")  
        .config("spark.sql.adaptive.enabled", "true")
        .config("spark.sql.adaptive.skewJoin.enabled", "true")
        .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
        .config("spark.sql.parquet.compression.codec", "snappy")
        .config("spark.sql.orc.impl", "native")
        .config("spark.sql.broadcastTimeout", "600")
        .config("spark.sql.execution.arrow.pyspark.enabled", "true")
        .getOrCreate()
)
spark

In [3]:
import re
from pyspark.sql import functions as F

input_path = "/opt/data/ncr_ride_bookings.csv"
output_path = "/data/processed/output.parquet"

def cleanColumnName(col_name):
    col_name = col_name.strip()

    col_name = re.sub(r"[.\s\-]+", "_", col_name)
    col_name = re.sub(r"[^0-9a-zA-Z_]", "", col_name)
    col_name = col_name.lower()
    col_name = re.sub(r"^_+|_+$", "", col_name)
    col_name = re.sub(r"_+", "_", col_name)
    
    return col_name

header = spark.sparkContext.textFile(input_path).first().split(",")
cleaned_headers = [cleanColumnName(h) for h in header]

df = spark.read.csv(input_path, header=True, inferSchema=True).toDF(*cleaned_headers)


In [4]:
total_rows = df.count()
report_rows = []

for col in df.columns:
    col_dtype = dict(df.dtypes)[col]
    
    # Null count
    null_count = df.filter(F.col(col).isNull() | (F.col(col) == "")).count()
    null_pct = round((null_count / total_rows) * 100, 2) if total_rows > 0 else 0.0
    
    # Distinct count
    distinct_count = df.select(col).distinct().count()
    
    # Skew metric
    skew_ratio = distinct_count / total_rows if total_rows > 0 else 0.0
    if skew_ratio < 0.3:
        skew_level = "HIGH"
    elif skew_ratio < 0.7:
        skew_level = "MEDIUM"
    else:
        skew_level = "LOW"
    
    # Numeric stats
    min_val, max_val, mean_val, stddev_val = (None, None, None, None)
    if col_dtype in ["int", "double", "float", "bigint", "decimal"]:
        stats = df.select(
            F.min(col).alias("min"),
            F.max(col).alias("max"),
            F.mean(col).alias("mean"),
            F.stddev(col).alias("stddev")
        ).first()
        min_val, max_val, mean_val, stddev_val = stats
    
    # Top-N (categorical) -> show top 3 most frequent values
    top_values = None
    if col_dtype == "string":
        top_vals = (
            df.groupBy(col).count()
              .orderBy(F.desc("count"))
              .limit(3)
              .toPandas()
              .to_dict(orient="records")
        )
        top_values = str(top_vals)  # store as string
    
    report_rows.append((
        col, col_dtype, null_count, null_pct,
        distinct_count, skew_ratio, skew_level,
        min_val, max_val, mean_val, stddev_val, top_values
    ))

# Convert to DataFrame
report_df = spark.createDataFrame(
    report_rows,
    ["column_name", "data_type", "null_count", "null_pct",
     "distinct_count", "skew_ratio", "skew_level",
     "min_val", "max_val", "mean_val", "stddev_val", "top_values"]
)

# -----------------------
# Step 4: Duplicate Rows Check
# -----------------------
duplicate_count = total_rows - df.dropDuplicates().count()
print(f"Duplicate Rows: {duplicate_count}")

# -----------------------
# Step 5: Save Outputs
# -----------------------
df.write.mode("overwrite").parquet(output_path)
report_df.show(truncate=False)

PySparkValueError: [CANNOT_DETERMINE_TYPE] Some of types cannot be determined after inferring.

In [None]:
# from pyspark.sql import functions as F
# from pyspark.sql import types as T

# input_path = "path/to/your/file.csv"

# # 1. Read all columns as STRING
# df = spark.read.csv(input_path, header=True, inferSchema=False)

# # 2. Clean headers (replace spaces, dots, etc.)
# def clean_col(colname: str) -> str:
#     return colname.strip().lower().replace(" ", "_").replace(".", "_")

# df = df.toDF(*[clean_col(c) for c in df.columns])

# # 3. Auto-detect numeric columns
# numeric_cols = []
# categorical_cols = []

# for c in df.columns:
#     # try casting to double
#     test_col = df.withColumn(c + "_cast", F.col(c).cast("double"))
#     non_nulls = test_col.filter(F.col(c).isNotNull()).count()
#     cast_success = test_col.filter(F.col(c + "_cast").isNotNull()).count()
    
#     if non_nulls > 0 and (cast_success / non_nulls) > 0.9:  # 90% values are numeric
#         numeric_cols.append(c)
#     else:
#         categorical_cols.append(c)

# print("Numeric Columns:", numeric_cols)
# print("Categorical Columns:", categorical_cols)

# # 4. Profiling

# ## --- Nulls ---
# null_report = df.select([
#     (F.sum(F.when(F.col(c).isNull() | (F.col(c) == ""), 1).otherwise(0))
#      .alias(c + "_nulls"))
#     for c in df.columns
# ])
# null_report.show(truncate=False)

# ## --- Numeric profiling ---
# for c in numeric_cols:
#     df.select(
#         F.min(F.col(c).cast("double")).alias(c + "_min"),
#         F.max(F.col(c).cast("double")).alias(c + "_max"),
#         F.mean(F.col(c).cast("double")).alias(c + "_mean"),
#         F.stddev(F.col(c).cast("double")).alias(c + "_stddev")
#     ).show()

# ## --- Categorical profiling ---
# for c in categorical_cols:
#     df.groupBy(c).count().orderBy(F.desc("count")).show(5, truncate=False)  # top 5 values
