In [0]:
dbutils.fs.cp("file:/data/input/bad_records.csv", "dbfs:/data/input/bad_records.csv")

Out[37]: True

In [0]:
import os

# Create the directory if it doesn't exist
os.makedirs("/data/input/", exist_ok=True)

# Sample data with a malformed row (bad_row has only one column)
data = """id,name,age
1,Alice,30
2,Bob,not_a_number
3,,25
4,Charlie,40
bad_row,extra,columns,here
5,David,
"""

# Save sample data to a file (this file will contain a malformed row)
with open("/data/input/bad_records.csv", "w") as f:
    f.write(data)

In [0]:
df = spark.read.option("header", True).option("inferSchema", True).csv("/data/input/bad_records.csv")
df.show()

+-------+-------+------------+
|     id|   name|         age|
+-------+-------+------------+
|      1|  Alice|          30|
|      2|    Bob|not_a_number|
|      3|   null|          25|
|      4|Charlie|          40|
|bad_row|   null|        null|
|      5|  David|        null|
+-------+-------+------------+



### Permissive Mode

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Define schema for reading (strict: expects 3 columns)
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True)
])


In [0]:

df1 = spark.read.option("header", True) \
    .option("mode", "PERMISSIVE") \
    .option("columnNameOfCorruptRecord", "_corrupt_record") \
    .schema(schema) \
    .csv("/data/input/bad_records.csv")

df1.show(truncate=False)



+----+-------+----+
|id  |name   |age |
+----+-------+----+
|1   |Alice  |30  |
|2   |Bob    |null|
|3   |null   |25  |
|4   |Charlie|40  |
|null|null   |null|
|5   |David  |null|
+----+-------+----+



### Dropmalformed Mode

In [0]:

df2 = spark.read.option("header", True) \
    .option("mode", "DROPMALFORMED") \
    .schema(schema) \
    .csv("/data/input/bad_records.csv")

df2.show(truncate=False)



+---+-------+----+
|id |name   |age |
+---+-------+----+
|1  |Alice  |30  |
|3  |null   |25  |
|4  |Charlie|40  |
|5  |David  |null|
+---+-------+----+



### Failfast Mode

In [0]:

df3 = spark.read.option("header", True) \
    .option("mode", "FAILFAST") \
    .schema(schema) \
    .csv("/data/input/bad_records.csv")

df3.show(truncate=False)



[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JJavaError[0m                             Traceback (most recent call last)
File [0;32m<command-6336637188382304>:6[0m
[1;32m      1[0m df3 [38;5;241m=[39m spark[38;5;241m.[39mread[38;5;241m.[39moption([38;5;124m"[39m[38;5;124mheader[39m[38;5;124m"[39m, [38;5;28;01mTrue[39;00m) \
[1;32m      2[0m     [38;5;241m.[39moption([38;5;124m"[39m[38;5;124mmode[39m[38;5;124m"[39m, [38;5;124m"[39m[38;5;124mFAILFAST[39m[38;5;124m"[39m) \
[1;32m      3[0m     [38;5;241m.[39mschema(schema) \
[1;32m      4[0m     [38;5;241m.[39mcsv([38;5;124m"[39m[38;5;124m/data/input/bad_records.csv[39m[38;5;124m"[39m)
[0;32m----> 6[0m df3[38;5;241m.[39mshow(truncate[38;5;241m=[39m[38;5;28;01mFalse[39;00m)

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     

> SUMMARY


**What to do:**
- If you want to see which row is bad, use `PERMISSIVE` mode with columnNameOfCorruptRecord.
- If you want Spark to skip bad rows, use `DROPMALFORMED` mode.
- `FAILFAST` is useful for strict validation, but it will always abort on the first malformed row. In `FAILFAST` mode, Spark aborts the job as soon as it finds such a row.