In [0]:
#Spark Session
from pyspark.sql import SparkSession
spark = (
        SparkSession
        .builder
        .appName("Reading from CSV files")
        .getOrCreate()
)

In [0]:
spark

In [0]:
# Read a CSV file into a dataframe
df = spark.read.csv("/data/input/emp.csv")

In [0]:
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)




Spark identified the metadata in the above code but not the header. So, we will use header option for that as shown below.

In [0]:
df = spark.read.option("header",True).csv("/data/input/emp.csv")

In [0]:
df.printSchema()

root
 |-- employee_id: string (nullable = true)
 |-- department_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- hire_date: string (nullable = true)



In [0]:
df.show()

+-----------+-------------+-------------+---+------+------+----------+
|employee_id|department_id|         name|age|gender|salary| hire_date|
+-----------+-------------+-------------+---+------+------+----------+
|        017|          105|  George Wang| 34|  Male| 57000|2016-03-15|
|        018|          104|    Nancy Liu| 29|Female| 50000|2017-06-01|
|        019|          103|  Steven Chen| 36|  Male| 62000|2015-08-01|
|        020|          102|    Grace Kim| 32|Female| 53000|2018-11-01|
|        007|          101|James Johnson| 42|  Male| 70000|2012-03-15|
|        008|          102|     Kate Kim| 29|Female| 51000|2019-10-01|
|        009|          103|      Tom Tan| 33|  Male| 58000|2016-06-01|
|        010|          104|     Lisa Lee| 27|Female| 47000|2018-08-01|
|        015|          106|  Michael Lee| 37|  Male| 63000|2014-09-30|
|        016|          107|  Kelly Zhang| 30|Female| 49000|2018-04-01|
|        011|          104|   David Park| 38|  Male| 65000|2015-11-01|
|     

In [0]:
# Reading with Schema
_schema = "employee_id int, department_id int, name string, age int, gender string, salary double, hire_date date"
df_schema = spark.read.schema(_schema).option("header",True).csv("/data/input/emp.csv")


In [0]:
df_schema.show()

+-----------+-------------+-------------+---+------+-------+----------+
|employee_id|department_id|         name|age|gender| salary| hire_date|
+-----------+-------------+-------------+---+------+-------+----------+
|         17|          105|  George Wang| 34|  Male|57000.0|2016-03-15|
|         18|          104|    Nancy Liu| 29|Female|50000.0|2017-06-01|
|         19|          103|  Steven Chen| 36|  Male|62000.0|2015-08-01|
|         20|          102|    Grace Kim| 32|Female|53000.0|2018-11-01|
|          7|          101|James Johnson| 42|  Male|70000.0|2012-03-15|
|          8|          102|     Kate Kim| 29|Female|51000.0|2019-10-01|
|          9|          103|      Tom Tan| 33|  Male|58000.0|2016-06-01|
|         10|          104|     Lisa Lee| 27|Female|47000.0|2018-08-01|
|         15|          106|  Michael Lee| 37|  Male|63000.0|2014-09-30|
|         16|          107|  Kelly Zhang| 30|Female|49000.0|2018-04-01|
|         11|          104|   David Park| 38|  Male|65000.0|2015

In [0]:

_schema = "employee_id int, department_id int, name string, age int, gender string, salary double, hire_date date"
df_new = spark.read.schema(_schema).option("header",True).csv("/data/input/emp_new.csv")

In [0]:
df_new.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- department_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: double (nullable = true)
 |-- hire_date: date (nullable = true)



In [0]:
df_new.show()

+-----------+-------------+-------------+---+------+-------+----------+
|employee_id|department_id|         name|age|gender| salary| hire_date|
+-----------+-------------+-------------+---+------+-------+----------+
|         17|          105|  George Wang| 34|  Male|57000.0|2016-03-15|
|         18|          104|    Nancy Liu| 29|Female|50000.0|2017-06-01|
|         19|          103|  Steven Chen| 36|  Male|62000.0|2015-08-01|
|         20|          102|    Grace Kim| 32|Female|53000.0|2018-11-01|
|          7|          101|James Johnson| 42|  Male|   null|2012-03-15|
|          8|          102|     Kate Kim| 29|Female|51000.0|2019-10-01|
|          9|          103|      Tom Tan| 33|  Male|58000.0|2016-06-01|
|         10|          104|     Lisa Lee| 27|Female|47000.0|2018-08-01|
|         15|          106|  Michael Lee| 37|  Male|63000.0|2014-09-30|
|         16|          107|  Kelly Zhang| 30|Female|49000.0|2018-04-01|
|          1|          101|     John Doe| 30|  Male|50000.0|2015

In the above result, 'James Johnson's Salary is shown as 'null' (It is 'LOW' in the csv file which is a corrupt data.) To identify that we will use permissive mode as shown below

In [0]:
# Handle BAD record - PERMISSIVE mode (default mode)
_schema = "employee_id int, department_id int, name string, age int, gender string, salary double, hire_date date, _corrupt_record string"
df_permissive = spark.read.schema(_schema).option("header",True).csv("/data/input/emp_new.csv")

In [0]:
df_permissive.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- department_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: double (nullable = true)
 |-- hire_date: date (nullable = true)
 |-- _corrupt_record: string (nullable = true)



In [0]:
df_permissive.show()

+-----------+-------------+-------------+---+------+-------+----------+--------------------+
|employee_id|department_id|         name|age|gender| salary| hire_date|     _corrupt_record|
+-----------+-------------+-------------+---+------+-------+----------+--------------------+
|         17|          105|  George Wang| 34|  Male|57000.0|2016-03-15|                null|
|         18|          104|    Nancy Liu| 29|Female|50000.0|2017-06-01|                null|
|         19|          103|  Steven Chen| 36|  Male|62000.0|2015-08-01|                null|
|         20|          102|    Grace Kim| 32|Female|53000.0|2018-11-01|                null|
|          7|          101|James Johnson| 42|  Male|   null|2012-03-15|007,101,James Joh...|
|          8|          102|     Kate Kim| 29|Female|51000.0|2019-10-01|                null|
|          9|          103|      Tom Tan| 33|  Male|58000.0|2016-06-01|                null|
|         10|          104|     Lisa Lee| 27|Female|47000.0|2018-08-01

Now, as seen above, it has shown two corrupt records, every other record showing as null is not a corrupt record.

In [0]:
df_permissive.where("_corrupt_record is not null").show()

+-----------+-------------+-------------+---+------+-------+----------+--------------------+
|employee_id|department_id|         name|age|gender| salary| hire_date|     _corrupt_record|
+-----------+-------------+-------------+---+------+-------+----------+--------------------+
|          7|          101|James Johnson| 42|  Male|   null|2012-03-15|007,101,James Joh...|
|         11|          104|   David Park| 38|  Male|65000.0|      null|011,104,David Par...|
+-----------+-------------+-------------+---+------+-------+----------+--------------------+



- It is not necessary to add a new Spark inbuilt column '_corrupt_record'. 
- Instead, Spark provides an option 'columnNameOfCurrentRecord'
- Let's incorporate this below

In [0]:
_schema = "employee_id int, department_id int, name string, age int, gender string, salary double, hire_date date, badRecord string"
df_permissive = spark.read.schema(_schema).option("header",True).option("columnNameOfCurrentRecord", 'badRecord').csv("/data/input/emp_new.csv")

In [0]:
df_permissive.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- department_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: double (nullable = true)
 |-- hire_date: date (nullable = true)
 |-- badRecord: string (nullable = true)



In [0]:
df_permissive.show()

+-----------+-------------+-------------+---+------+-------+----------+---------+
|employee_id|department_id|         name|age|gender| salary| hire_date|badRecord|
+-----------+-------------+-------------+---+------+-------+----------+---------+
|         17|          105|  George Wang| 34|  Male|57000.0|2016-03-15|     null|
|         18|          104|    Nancy Liu| 29|Female|50000.0|2017-06-01|     null|
|         19|          103|  Steven Chen| 36|  Male|62000.0|2015-08-01|     null|
|         20|          102|    Grace Kim| 32|Female|53000.0|2018-11-01|     null|
|          7|          101|James Johnson| 42|  Male|   null|2012-03-15|     null|
|          8|          102|     Kate Kim| 29|Female|51000.0|2019-10-01|     null|
|          9|          103|      Tom Tan| 33|  Male|58000.0|2016-06-01|     null|
|         10|          104|     Lisa Lee| 27|Female|47000.0|2018-08-01|     null|
|         15|          106|  Michael Lee| 37|  Male|63000.0|2014-09-30|     null|
|         16|   

In [0]:
# HANDLE BAD records - DROPMALFORMED
_schema = "employee_id int, department_id int, name string, age int, gender string, salary double, hire_date date"
df_m = spark.read.schema(_schema).option("mode", "DROPMALFORMED").option("header",True).csv("/data/input/emp_new.csv")

In [0]:
df_m.show()
# The corrupt records are dropped 

+-----------+-------------+-----------+---+------+-------+----------+
|employee_id|department_id|       name|age|gender| salary| hire_date|
+-----------+-------------+-----------+---+------+-------+----------+
|         17|          105|George Wang| 34|  Male|57000.0|2016-03-15|
|         18|          104|  Nancy Liu| 29|Female|50000.0|2017-06-01|
|         19|          103|Steven Chen| 36|  Male|62000.0|2015-08-01|
|         20|          102|  Grace Kim| 32|Female|53000.0|2018-11-01|
|          8|          102|   Kate Kim| 29|Female|51000.0|2019-10-01|
|          9|          103|    Tom Tan| 33|  Male|58000.0|2016-06-01|
|         10|          104|   Lisa Lee| 27|Female|47000.0|2018-08-01|
|         15|          106|Michael Lee| 37|  Male|63000.0|2014-09-30|
|         16|          107|Kelly Zhang| 30|Female|49000.0|2018-04-01|
|          1|          101|   John Doe| 30|  Male|50000.0|2015-01-01|
|          2|          101| Jane Smith| 25|Female|45000.0|2016-02-15|
|          3|       

In [0]:
# HANDLE BAD RECORDS - FAILFAST
_schema = "employee_id int, department_id int, name string, age int, gender string, salary double, hire_date date"
df_m = spark.read.schema(_schema).option("mode", "FAILFAST").option("header",True).csv("/data/input/emp_new.csv")

In [0]:
df_m.show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JJavaError[0m                             Traceback (most recent call last)
File [0;32m<command-4095422948652140>:1[0m
[0;32m----> 1[0m [43mdf_m[49m[38;5;241;43m.[39;49m[43mshow[49m[43m([49m[43m)[49m

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     47[0m [38;5;28;01mtry[39;00m:
[0;32m---> 48[0m     res [38;5;241m=[39m [43mfunc[49m[43m([49m[38;5;241;43m*[39;49m[43margs[49m[43m,[49m[43m [49m[38;5;241;43m*[39;49m[38;5;241;43m*[39;49m[43mkwargs[49m[43m)[49m
[1;32m     49[0m     logger[38;5;241m.[39mlog_success(
[1;32m     50[0m         module_name, class_name, function_name, time[38;5;241m.[39mperf_counter() [38;5;241m-[39m start, signature
[1;32m     51

Since, we have BAD RECORDS in emp_new.csv file, FAILFAST failed the job as soon as a BAD RECORD was hit as shown above

In [0]:
# BONUS TIp
# Writing multiple options using dictionary

options = {
    "header": True,
    "inferSchema": True,
    "mode": "DROPMALFORMED"
}
spark.read.schema(_schema).options(**options).csv("/data/input/emp.csv").show()

+-----------+-------------+-------------+---+------+-------+----------+
|employee_id|department_id|         name|age|gender| salary| hire_date|
+-----------+-------------+-------------+---+------+-------+----------+
|         17|          105|  George Wang| 34|  Male|57000.0|2016-03-15|
|         18|          104|    Nancy Liu| 29|Female|50000.0|2017-06-01|
|         19|          103|  Steven Chen| 36|  Male|62000.0|2015-08-01|
|         20|          102|    Grace Kim| 32|Female|53000.0|2018-11-01|
|          7|          101|James Johnson| 42|  Male|70000.0|2012-03-15|
|          8|          102|     Kate Kim| 29|Female|51000.0|2019-10-01|
|          9|          103|      Tom Tan| 33|  Male|58000.0|2016-06-01|
|         10|          104|     Lisa Lee| 27|Female|47000.0|2018-08-01|
|         15|          106|  Michael Lee| 37|  Male|63000.0|2014-09-30|
|         16|          107|  Kelly Zhang| 30|Female|49000.0|2018-04-01|
|         11|          104|   David Park| 38|  Male|65000.0|2015

options(**options) # this will fail with anything less of more than two *