In [None]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.window import Window

# Start (or reuse) a Spark session running in local mode on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("CSV Reads Optimization")
    .getOrCreate()
)

# Last expression of the cell, so the notebook displays the session object.
spark

In [None]:
# Location of the raw ride-bookings CSV.
input_path = "/opt/data/ncr_ride_bookings.csv"

# Use the first line as column names and skip schema inference, so the
# columns are not type-inferred at read time.
df = spark.read.options(header=True, inferSchema=False).csv(input_path)

# Peek at the first row to confirm the file was read.
df.head()

In [None]:
def clean_col(colname: str) -> str:
    """Normalize a raw column header into a lowercase, underscore-separated name.

    Leading and trailing whitespace is removed, the text is lowercased, and
    every space or period is replaced by an underscore. Consecutive
    separators are not collapsed, so "a. b" becomes "a__b".

    Args:
        colname: The column name as it appears in the source data.

    Returns:
        The normalized column name.
    """
    separators_to_underscore = str.maketrans({" ": "_", ".": "_"})
    return colname.strip().lower().translate(separators_to_underscore)

# Rename every column to its normalized form; column order is unchanged.
df = df.toDF(*map(clean_col, df.columns))

In [None]:
# For every column: remove runs of double quotes at the start or end of the
# value, then trim surrounding whitespace. Column names are kept as they are.
unquoted_columns = []
for name in df.columns:
    without_quotes = F.regexp_replace(F.col(name), '^"+|"+$', '')
    unquoted_columns.append(F.trim(without_quotes).alias(name))

df = df.select(unquoted_columns)

# cache() only marks the frame for caching; the count() action that follows
# is what actually runs the job.
df.cache()
row_count = df.count()


In [None]:
# Two aggregates per column, in column order:
#   <col>_non_nulls    - number of non-null values
#   <col>_cast_success - number of values that cast to double without becoming null
exprs = [
    aggregate
    for c in df.columns
    for aggregate in (
        F.count(F.col(c)).alias(f"{c}_non_nulls"),
        F.sum(
            F.when(F.col(c).cast("double").isNotNull(), 1).otherwise(0)
        ).alias(f"{c}_cast_success"),
    )
]

# All aggregates are computed in a single job; the result has exactly one row.
stats = df.agg(*exprs).collect()[0]

# Split the columns into numeric and categorical based on the profiling stats.
numeric_cols = []
categorical_cols = []

for name in df.columns:
    present = stats[f"{name}_non_nulls"]
    castable = stats[f"{name}_cast_success"]

    # Numeric means more than 90% of the non-null values cast to double.
    # A column with no non-null values is treated as categorical, which also
    # avoids dividing by zero.
    is_numeric = present > 0 and (castable / present) > 0.9

    bucket = numeric_cols if is_numeric else categorical_cols
    bucket.append(name)