Merge 2 dataframes with uneven columns

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Merge').getOrCreate()

In [20]:
from pyspark.sql.functions import lit

input1DF = spark.read.format('csv').option('header',True).load('sample_data\input1.csv')
input1DF.show()
input2DF = spark.read.format('csv').option('header',True).load('sample_data\input2.csv')
input2DF.show()

input1AddDF = input1DF.withColumn('Gender',lit(None))
input1AddDF.show()

result = input1AddDF.union(input2DF)
result.show()

+--------+---+
|    Name|Age|
+--------+---+
| Monisha| 23|
|  Arvind| 24|
|Rishitha| 24|
|  Anusha| 24|
| Gayatri| 25|
+--------+---+

+--------+---+------+
|    Name|Age|Gender|
+--------+---+------+
| Monisha| 23|     F|
|  Arvind| 24|     M|
|Rishitha| 24|     F|
+--------+---+------+

+--------+---+------+
|    Name|Age|Gender|
+--------+---+------+
| Monisha| 23|  null|
|  Arvind| 24|  null|
|Rishitha| 24|  null|
|  Anusha| 24|  null|
| Gayatri| 25|  null|
+--------+---+------+

+--------+---+------+
|    Name|Age|Gender|
+--------+---+------+
| Monisha| 23|  null|
|  Arvind| 24|  null|
|Rishitha| 24|  null|
|  Anusha| 24|  null|
| Gayatri| 25|  null|
| Monisha| 23|     F|
|  Arvind| 24|     M|
|Rishitha| 24|     F|
+--------+---+------+



Another method

In [21]:
from pyspark.sql.types import *

schema = StructType(
    [
    StructField('Name',StringType(),True), # 3rd option is nullable is true or not
    StructField('Age',IntegerType(),True),
    StructField('Gender',StringType(),True),
    ]
)

input1DF = spark.read.format('csv').option('header',True).schema(schema).load('sample_data\input1.csv')
input1DF.show()
input2DF = spark.read.option('header',True).csv('sample_data\input2.csv',schema=schema)
input2DF.show()

result = input1DF.union(input2DF)
result.show()



+--------+---+------+
|    Name|Age|Gender|
+--------+---+------+
| Monisha| 23|  null|
|  Arvind| 24|  null|
|Rishitha| 24|  null|
|  Anusha| 24|  null|
| Gayatri| 25|  null|
+--------+---+------+

+--------+---+------+
|    Name|Age|Gender|
+--------+---+------+
| Monisha| 23|     F|
|  Arvind| 24|     M|
|Rishitha| 24|     F|
+--------+---+------+

+--------+---+------+
|    Name|Age|Gender|
+--------+---+------+
| Monisha| 23|  null|
|  Arvind| 24|  null|
|Rishitha| 24|  null|
|  Anusha| 24|  null|
| Gayatri| 25|  null|
| Monisha| 23|     F|
|  Arvind| 24|     M|
|Rishitha| 24|     F|
+--------+---+------+



Another method

In [24]:
input1DF = spark.read.format('csv').option('header',True).load('sample_data\input1.csv')
input1DF.show()
input2DF = spark.read.format('csv').option('header',True).load('sample_data\input2.csv')
input2DF.show()

result = input1DF.join(input2DF,on=['Name','Age'],how='outer')
result.show()

result = input1DF.join(input2DF,[input1DF.Name==input2DF.Name,input1DF.Age==input2DF.Age],how='outer').select(input1DF.Name,input1DF.Age,input2DF.Gender)
result.show()


+--------+---+
|    Name|Age|
+--------+---+
| Monisha| 23|
|  Arvind| 24|
|Rishitha| 24|
|  Anusha| 24|
| Gayatri| 25|
+--------+---+

+--------+---+------+
|    Name|Age|Gender|
+--------+---+------+
| Monisha| 23|     F|
|  Arvind| 24|     M|
|Rishitha| 24|     F|
+--------+---+------+

+--------+---+------+
|    Name|Age|Gender|
+--------+---+------+
|  Anusha| 24|  null|
|  Arvind| 24|     M|
| Gayatri| 25|  null|
| Monisha| 23|     F|
|Rishitha| 24|     F|
+--------+---+------+

+--------+---+------+
|    Name|Age|Gender|
+--------+---+------+
|  Anusha| 24|  null|
|  Arvind| 24|     M|
| Gayatri| 25|  null|
| Monisha| 23|     F|
|Rishitha| 24|     F|
+--------+---+------+



Best approach - Automated

In [25]:
input1DF = spark.read.format('csv').option('header',True).load('sample_data\input1.csv')
input1DF.show()
input2DF = spark.read.format('csv').option('header',True).load('sample_data\input2.csv')
input2DF.show()

listA = set(input1DF.columns)-set(input2DF.columns)
listB = set(input2DF.columns)-set(input1DF.columns)

for i in listA:
    input2DF = input2DF.withColumn(i,lit(None))

for i in listB:
    input1DF = input1DF.withColumn(i,lit(None))
    
resut = input1DF.union(input2DF)

result.show()

+--------+---+
|    Name|Age|
+--------+---+
| Monisha| 23|
|  Arvind| 24|
|Rishitha| 24|
|  Anusha| 24|
| Gayatri| 25|
+--------+---+

+--------+---+------+
|    Name|Age|Gender|
+--------+---+------+
| Monisha| 23|     F|
|  Arvind| 24|     M|
|Rishitha| 24|     F|
+--------+---+------+

+--------+---+------+
|    Name|Age|Gender|
+--------+---+------+
|  Anusha| 24|  null|
|  Arvind| 24|     M|
| Gayatri| 25|  null|
| Monisha| 23|     F|
|Rishitha| 24|     F|
+--------+---+------+



Apply line break every 5th occurance from | delimited input.txt

In [49]:
from pyspark.sql.functions import regexp_replace,explode,split

input = spark.read.csv('sample_data\input.txt')

input.show(truncate=False)

input = input.withColumn("chk",regexp_replace("_c0","(.*?\\|){4}","$0-"))

input.show(truncate=False)

input = input.withColumn('col_explode',explode(split('chk','\|-')))

input.select(input.col_explode).show()

result = input.select(input.col_explode)

result.show()




+--------------------------------------------------------------------------------------------------------+
|_c0                                                                                                     |
+--------------------------------------------------------------------------------------------------------+
|A|BE|1|25|B|BSC|2|27|A|BE|1|25|B|BSC|2|27|A|BE|1|25|B|BSC|2|27|A|BE|1|25|B|BSC|2|27|A|BE|1|25|B|BSC|2|27|
+--------------------------------------------------------------------------------------------------------+

+--------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------+
|_c0                                                                                                     |chk                                                                                                              |
+---------------------

In [55]:
result.select(split('col_explode','\|')).show()

result.rdd.map(lambda i:i).collect()




+--------------------------+
|split(col_explode, \|, -1)|
+--------------------------+
|            [A, BE, 1, 25]|
|           [B, BSC, 2, 27]|
|            [A, BE, 1, 25]|
|           [B, BSC, 2, 27]|
|            [A, BE, 1, 25]|
|           [B, BSC, 2, 27]|
|            [A, BE, 1, 25]|
|           [B, BSC, 2, 27]|
|            [A, BE, 1, 25]|
|           [B, BSC, 2, 27]|
+--------------------------+



[Row(col_explode='A|BE|1|25'),
 Row(col_explode='B|BSC|2|27'),
 Row(col_explode='A|BE|1|25'),
 Row(col_explode='B|BSC|2|27'),
 Row(col_explode='A|BE|1|25'),
 Row(col_explode='B|BSC|2|27'),
 Row(col_explode='A|BE|1|25'),
 Row(col_explode='B|BSC|2|27'),
 Row(col_explode='A|BE|1|25'),
 Row(col_explode='B|BSC|2|27')]

In [None]:
result.rdd.map(lambda i:len(i)).collect()


In [None]:
result_rdd = result.rdd.map(lambda i:i[0].split('|'))

result = result_rdd.toDF(['Name','Qualification','S.no','Age']).show()

Read files recursively under Folders

In [17]:
from pyspark.sql.functions import input_file_name

df1 = spark.read.format('csv').option('header',True).load('sample_data/recursive').withColumn('filepath',input_file_name())

df1.show(truncate=False)

paths = [
    'sample_data/recursive',
    'sample_data/recursive/level1'
]

df1 = spark.read.format('csv').option('header',True).load(paths).withColumn('filepath',input_file_name())

df1.show(truncate=False)

df1 = spark.read.format('csv').option('header',True).option('recursiveFileLookup',True).load(paths).withColumn('filepath',input_file_name())

df1.show(truncate=False)

+--------+---+--------------------------------------------------------------+
|Name    |Age|filepath                                                      |
+--------+---+--------------------------------------------------------------+
|Monisha |23 |file:///c:/Users/varsh/pyspark/sample_data/recursive/file1.csv|
|Arvind  |24 |file:///c:/Users/varsh/pyspark/sample_data/recursive/file1.csv|
|Rishitha|24 |file:///c:/Users/varsh/pyspark/sample_data/recursive/file1.csv|
|Anusha  |24 |file:///c:/Users/varsh/pyspark/sample_data/recursive/file1.csv|
|Gayatri |25 |file:///c:/Users/varsh/pyspark/sample_data/recursive/file1.csv|
+--------+---+--------------------------------------------------------------+

+--------+---+---------------------------------------------------------------------+
|Name    |Age|filepath                                                             |
+--------+---+---------------------------------------------------------------------+
|Monisha |23 |file:///c:/Users/varsh/pyspa

Which one returns results faster ?

In [None]:
#1
df = spark.read.csv()
df = df.filter()
df = df.sort()
df.count()
#2
df = spark.read.csv()
df = df.sort()
df = df.filter()
df.count()
#3
df = spark.read.csv()
df.sort()
df = df.filter()
df = df.sort()
df.count()
#4
df = spark.read.csv()
df.sort()
df = df.sort()
df = df.filter()
df.count()

# Predicate pushdown pushes the filter to the start even if sort is given first

# option A performs better, Caching took time, this is not the right case for doing caching

Inferschema Vs Schema definition

Always define schema manually since inferschema scans the table twice

Data skew

In [19]:
bank = spark.read.format('csv').option('header',True).option('inferSchema',True).load('sample_data/BankChurners.csv')

bank.show()

bank.rdd.getNumPartitions()

+---------+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------------+--------------+-------------------+---------------------+
|CLIENTNUM|   Attrition_Flag|Customer_Age|Gender|Dependent_count|Education_Level|Marital_Status|Income_Category|Card_Category|Months_on_book|Total_Relationship_Count|Months_Inactive_12_mon|Contacts_Count_12_mon|Credit_Limit|Total_Revolving_Bal|Avg_Open_To_Buy|Total_Amt_Chng_Q4_Q1|Total_Trans_Amt|Total_Trans_Ct|Total_Ct_Chng_Q4_Q1|Avg_Utilization_Ratio|
+---------+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------

1

In [21]:
input = bank.repartition(4)
input.rdd.getNumPartitions()

4

In [None]:
from pyspark.sql.functions import spark_partition_id

input = input.withColumn('partition_id',spark_partition_id())
input.show()

countByPartition = input.groupBy(input.partition_id).count()
countByPartition.show()

# Data is evenly distributed

+---------+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------------+--------------+-------------------+---------------------+------------+
|CLIENTNUM|   Attrition_Flag|Customer_Age|Gender|Dependent_count|Education_Level|Marital_Status|Income_Category|Card_Category|Months_on_book|Total_Relationship_Count|Months_Inactive_12_mon|Contacts_Count_12_mon|Credit_Limit|Total_Revolving_Bal|Avg_Open_To_Buy|Total_Amt_Chng_Q4_Q1|Total_Trans_Amt|Total_Trans_Ct|Total_Ct_Chng_Q4_Q1|Avg_Utilization_Ratio|partition_id|
+---------+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+----

In [29]:
input = bank.repartition(200,"Card_Category")
input.rdd.getNumPartitions()

200

In [30]:
input = input.withColumn('partition_id',spark_partition_id())
input.show()

countByPartition = input.groupBy(input.partition_id).count()
countByPartition.show()

+---------+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------------+--------------+-------------------+---------------------+------------+
|CLIENTNUM|   Attrition_Flag|Customer_Age|Gender|Dependent_count|Education_Level|Marital_Status|Income_Category|Card_Category|Months_on_book|Total_Relationship_Count|Months_Inactive_12_mon|Contacts_Count_12_mon|Credit_Limit|Total_Revolving_Bal|Avg_Open_To_Buy|Total_Amt_Chng_Q4_Q1|Total_Trans_Amt|Total_Trans_Ct|Total_Ct_Chng_Q4_Q1|Avg_Utilization_Ratio|partition_id|
+---------+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+----

Calculate expriy date 

In [34]:
exp_df = spark.read.format('csv').option('header',True).option('inferSchema',True).load('sample_data/expiry.csv')

exp_df.show()
print(exp_df.schema)

+---+------------+--------+
| id|rechargeDate|validity|
+---+------------+--------+
|  1|    20200511|      20|
|  2|    20200119|      13|
|  3|    20200405|     120|
+---+------------+--------+

StructType([StructField('id', IntegerType(), True), StructField('rechargeDate', IntegerType(), True), StructField('validity', IntegerType(), True)])


In [40]:
from pyspark.sql.functions import date_add,to_date,col,expr

exp_df = exp_df.withColumn('date',to_date(col('rechargeDate').cast('string'),"yyyyMMdd"))
exp_df.show()

exp_df = exp_df.withColumn('expiryDate',date_add('date','validity'))
exp_df.show()

+---+------------+--------+----------+
| id|rechargeDate|validity|      date|
+---+------------+--------+----------+
|  1|    20200511|      20|2020-05-11|
|  2|    20200119|      13|2020-01-19|
|  3|    20200405|     120|2020-04-05|
+---+------------+--------+----------+

+---+------------+--------+----------+----------+
| id|rechargeDate|validity|      date|expiryDate|
+---+------------+--------+----------+----------+
|  1|    20200511|      20|2020-05-11|2020-05-31|
|  2|    20200119|      13|2020-01-19|2020-02-01|
|  3|    20200405|     120|2020-04-05|2020-08-03|
+---+------------+--------+----------+----------+



Merge 2 complex dataframes 

In [51]:
json1 = spark.read.json('sample_data/input1.json',multiLine=True)
json2 = spark.read.json('sample_data/input2.json',multiLine=True)


json1.printSchema()
json1.show()
json1.columns

root
 |-- Education: struct (nullable = true)
 |    |-- Age: long (nullable = true)
 |    |-- Qualification: string (nullable = true)
 |    |-- Year: long (nullable = true)
 |-- Name: string (nullable = true)

+----------------+----+
|       Education|Name|
+----------------+----+
|{28, BCOM, 2013}|Azar|
|  {24, BE, 2021}|Amol|
+----------------+----+



['Education', 'Name']

In [64]:
json1.schema

StructType([StructField('Education', StructType([StructField('Age', LongType(), True), StructField('Qualification', StringType(), True), StructField('Year', LongType(), True)]), True), StructField('Name', StringType(), True)])

In [52]:
from pyspark.sql.functions import struct, lit, col
from pyspark.sql.types import *

def flatten_struct(schema, prefix=""):
    result = []
    for elem in schema:
        if isinstance(elem.dataType,StructType):
            result+=flatten_struct(elem.dataType, prefix+elem.name+".")
        else:
            result.append(col(prefix+elem.name).alias(prefix+elem.name))
    return result

l1 = flatten_struct(json1.schema)
l2 = flatten_struct(json2.schema)

In [74]:
col1 = []
col2 = []
for i in l1:
    col1.append(str(i).replace("AS","'").split("'")[1].strip())
for i in l2:
    col2.append(str(i).replace("AS","'").split("'")[1].strip())

In [97]:
print(col1)
print(col2)
diff = set(col1)-set(col2)
print(diff)

for i in diff:
    if('.' in i):
        c,cn = i.split(".")
        s_type = json1.schema[c].dataType[cn].dataType
        print(s_type)
        s_fields = json2.schema[c].dataType.names
        print(s_fields)
        inDf = json2.withColumn(c,struct(*([col(c)[record].alias(record) for record in s_fields] + [lit(None).cast(s_type).alias(cn)])))
        s_fields = sorted(inDf.schema[c].dataType.names)
        inDf.show()
        inDf = inDf.withColumn(c,struct(*([col(c)[record].alias(record) for record in s_fields])))
        inDf.show()

['Education.Age', 'Education.Qualification', 'Education.Year', 'Name']
['Education.Qualification', 'Education.Year', 'Name']
{'Education.Age'}
LongType()
['Qualification', 'Year']
+-----------------+------+
|        Education|  Name|
+-----------------+------+
|{BSC, 2012, null}|Benita|
|{MSC, 2022, null}| Bavya|
+-----------------+------+

+-----------------+------+
|        Education|  Name|
+-----------------+------+
|{null, BSC, 2012}|Benita|
|{null, MSC, 2022}| Bavya|
+-----------------+------+



In [98]:
inDf.printSchema()

root
 |-- Education: struct (nullable = false)
 |    |-- Age: long (nullable = true)
 |    |-- Qualification: string (nullable = true)
 |    |-- Year: long (nullable = true)
 |-- Name: string (nullable = true)



In [100]:
json1.printSchema()
outputDf = json1.union(inDf)
outputDf.show()

root
 |-- Education: struct (nullable = true)
 |    |-- Age: long (nullable = true)
 |    |-- Qualification: string (nullable = true)
 |    |-- Year: long (nullable = true)
 |-- Name: string (nullable = true)

+-----------------+------+
|        Education|  Name|
+-----------------+------+
| {28, BCOM, 2013}|  Azar|
|   {24, BE, 2021}|  Amol|
|{null, BSC, 2012}|Benita|
|{null, MSC, 2022}| Bavya|
+-----------------+------+



Speculative Execution in Spark

Speculative execution in Apache Spark is a performance optimization technique that aims to handle slow or straggling tasks during a job execution. In distributed systems like Spark, tasks are run in parallel across multiple workers. However, due to various reasons (e.g., resource contention, network latency, node failures), some tasks may take much longer to complete than others. These slow tasks are known as "stragglers."

Speculative execution helps mitigate the impact of these straggler tasks by launching duplicate copies of the slow tasks on other nodes. The first copy that finishes is considered the result, and the others are discarded. This helps reduce the overall job completion time.

In [None]:
spark.conf('spark.speculation',True) # Enables or disables speculative execution (default is false).
spark.conf('spark.speculation.interval',200) # The frequency at which Spark checks for straggler tasks.
spark.conf('spark.speculation.multiplier',5) # threshold = median_task_duration * spark.speculation.multiplier
# Where:

# median_task_duration is the time it takes for most tasks to finish in the same stage.

# spark.speculation.multiplier is the factor that defines how much longer a task can take before it is considered a straggler.

# If you have spark.speculation.multiplier = 1.5 and the median time for tasks in a stage is 10 seconds, the threshold for considering a task as a straggler would be:

# threshold = 10 * 1.5 = 15 seconds
# In this case, if any task takes longer than 15 seconds to complete, Spark will consider it a candidate for speculative execution and will launch a duplicate copy of the task on another executor

spark.conf('spark.speculation.quantile',0.75) # The quantile of tasks that are considered for speculative execution (e.g., if you set this to 0.75, Spark will speculate on the slowest 25% of tasks)

How to handle corrupt/bad records

Modes in spark.read()
1. PERMISSIVE
2. FAILFAST
3. DROPMALFORMED

In [102]:
df = spark.read.csv('sample_data/badRecords.csv',header=True,inferSchema=True)
df.show()

+-------+--------+----------+
| emp_no|emp_name|department|
+-------+--------+----------+
|      1| Murugan|        IT|
|invalid| invalid|      null|
|      2|  Kannan|   Finance|
|      3|   Mohan|      null|
|      4|   Pavan|        HR|
+-------+--------+----------+



In [103]:
df = spark.read.option('mode','FAILFAST').csv('sample_data/badRecords.csv',header=True,inferSchema=True)
df.show()

Py4JJavaError: An error occurred while calling o659.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 123.0 failed 1 times, most recent failure: Lost task 0.0 in stage 123.0 (TID 432) (localhost executor driver): org.apache.spark.SparkException: Malformed records are detected in record parsing. Parse Mode: FAILFAST. To process malformed records as null result, try setting the option 'mode' as 'PERMISSIVE'.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.malformedRecordsDetectedInRecordParsingError(QueryExecutionErrors.scala:1417)
	at org.apache.spark.sql.catalyst.util.FailureSafeParser.parse(FailureSafeParser.scala:68)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser$.$anonfun$parseIterator$2(UnivocityParser.scala:421)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:116)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:364)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:890)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:890)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.sql.catalyst.util.BadRecordException: java.lang.RuntimeException: Malformed CSV record
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.org$apache$spark$sql$catalyst$csv$UnivocityParser$$convert(UnivocityParser.scala:330)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.$anonfun$parse$2(UnivocityParser.scala:275)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser$.$anonfun$parseIterator$1(UnivocityParser.scala:417)
	at org.apache.spark.sql.catalyst.util.FailureSafeParser.parse(FailureSafeParser.scala:60)
	... 20 more
Caused by: java.lang.RuntimeException: Malformed CSV record
	at org.apache.spark.sql.errors.QueryExecutionErrors$.malformedCSVRecordError(QueryExecutionErrors.scala:1222)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.org$apache$spark$sql$catalyst$csv$UnivocityParser$$convert(UnivocityParser.scala:298)
	... 23 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2228)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2249)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2268)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:506)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:459)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:48)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3868)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2863)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:3858)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:510)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3856)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3856)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2863)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3084)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:288)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:327)
	at sun.reflect.GeneratedMethodAccessor49.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.SparkException: Malformed records are detected in record parsing. Parse Mode: FAILFAST. To process malformed records as null result, try setting the option 'mode' as 'PERMISSIVE'.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.malformedRecordsDetectedInRecordParsingError(QueryExecutionErrors.scala:1417)
	at org.apache.spark.sql.catalyst.util.FailureSafeParser.parse(FailureSafeParser.scala:68)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser$.$anonfun$parseIterator$2(UnivocityParser.scala:421)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:116)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:364)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:890)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:890)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	... 1 more
Caused by: org.apache.spark.sql.catalyst.util.BadRecordException: java.lang.RuntimeException: Malformed CSV record
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.org$apache$spark$sql$catalyst$csv$UnivocityParser$$convert(UnivocityParser.scala:330)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.$anonfun$parse$2(UnivocityParser.scala:275)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser$.$anonfun$parseIterator$1(UnivocityParser.scala:417)
	at org.apache.spark.sql.catalyst.util.FailureSafeParser.parse(FailureSafeParser.scala:60)
	... 20 more
Caused by: java.lang.RuntimeException: Malformed CSV record
	at org.apache.spark.sql.errors.QueryExecutionErrors$.malformedCSVRecordError(QueryExecutionErrors.scala:1222)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.org$apache$spark$sql$catalyst$csv$UnivocityParser$$convert(UnivocityParser.scala:298)
	... 23 more


In [None]:
df = spark.read.option('mode','DROPMALFORMED').csv('sample_data/badRecords.csv',header=True,inferSchema=True)
df.show()

+------+--------+----------+
|emp_no|emp_name|department|
+------+--------+----------+
|     1| Murugan|        IT|
|     2|  Kannan|   Finance|
|     4|   Pavan|        HR|
+------+--------+----------+

root
 |-- emp_no: string (nullable = true)
 |-- emp_name: string (nullable = true)
 |-- department: string (nullable = true)



In [None]:
schema = StructType(
    [
        StructField('emp_no',IntegerType()),
        StructField('emp_name',StringType(),True),
        StructField('department',StringType(),True)
    ]
)
df = spark.read.option('mode','DROPMALFORMED').schema(schema=schema).csv('sample_data/badRecords.csv',header=True)
df.show()

+------+--------+----------+
|emp_no|emp_name|department|
+------+--------+----------+
|     1| Murugan|        IT|
|     2|  Kannan|   Finance|
|     3|   Mohan|      null|
|     4|   Pavan|        HR|
+------+--------+----------+



In [110]:
schema = StructType(
    [
        StructField('emp_no',IntegerType()),
        StructField('emp_name',StringType(),True),
        StructField('department',StringType(),True)
    ]
)
df = spark.read.schema(schema=schema).csv('sample_data/badRecords.csv',header=True)
df.show()

+------+--------+----------+
|emp_no|emp_name|department|
+------+--------+----------+
|     1| Murugan|        IT|
|  null| invalid|      null|
|     2|  Kannan|   Finance|
|     3|   Mohan|      null|
|     4|   Pavan|        HR|
+------+--------+----------+



Multi Delimiter