In [0]:
# Malformed records are rows that do not match the expected schema or structure, which can cause parsing errors.

# 1. Using the mode option
# The mode option specifies how malformed records are handled. The available modes are:

# PERMISSIVE (default): Sets malformed fields to null and can keep the raw record in a separate corrupt-record column.
# DROPMALFORMED: Drops malformed records.
# FAILFAST: Throws an exception when it encounters a malformed record.
# Example:

# Initialize SparkSession.
# SparkSession is imported here so the cell runs on a fresh kernel.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("HandleMalformedRecords").getOrCreate()

# Read CSV file with the mode option: DROPMALFORMED discards every row that
# cannot be parsed, so only well-formed rows end up in df.
df = spark.read.csv("path/to/csvfile.csv",
                    header=True,
                    mode="DROPMALFORMED",
                    inferSchema=True)
                    

In [0]:
# PERMISSIVE Mode with Corrupt Column

#When using the PERMISSIVE mode, you can specify an extra column to store malformed records by using the columnNameOfCorruptRecord option.

# Read CSV file in PERMISSIVE mode with a corrupt-record column.
#
# Spark keeps the raw text of a malformed row only when the user-defined schema
# declares a string field named after columnNameOfCorruptRecord. With an
# inferred schema that field does not exist and the corrupt records are dropped
# during parsing, so the schema is spelled out explicitly here.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# NOTE(review): the data columns below are assumed (id, name, age, email);
# adjust them to the real layout of the file being read.
schema_with_corrupt = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("email", StringType(), True),
    # Receives the raw text of any row that fails to parse.
    StructField("corrupt_record", StringType(), True),
])

df = spark.read.csv("path/to/csvfile.csv",
                    schema=schema_with_corrupt,
                    header=True,
                    mode="PERMISSIVE",
                    columnNameOfCorruptRecord="corrupt_record")

df.show(truncate=False)
# In this example, malformed records are stored in the corrupt_record column, allowing you to inspect and handle them later.

In [0]:
# Handling delimiter issues.

## Initialize SparkSession
# Imports are kept at the top of the cell so it runs on a fresh kernel.
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

spark = SparkSession.builder.appName("HandleDelimiter").getOrCreate()

# Define schema (if needed)
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("email", StringType(), True)
])

# Read CSV file with custom delimiter and quote character.
# DataFrameReader.csv() names the delimiter keyword `sep`; passing `delimiter=`
# raises TypeError. inferSchema is omitted because an explicit schema is given.
df = spark.read.csv("path/to/csvfile.csv",
                    schema=schema,
                    header=True,
                    sep=";",       # Specify the custom delimiter
                    quote='"',     # Specify the quote character
                    escape="\\")   # Specify the escape character


In [0]:
# Handling multiline record issues.
# Using the multiLine option

# The multiLine option allows Spark to handle records that span multiple lines. When enabled, Spark reads the entire file as a single input and then parses it accordingly.

# Example: initialize SparkSession.
# SparkSession is imported here so the cell runs on a fresh kernel.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("HandleMultilineRecords").getOrCreate()

# Read CSV file with the multiLine option so that quoted fields containing
# newlines are parsed as part of a single record.
df = spark.read.csv("path/to/csvfile.csv",
                    header=True,
                    multiLine=True,  # Enable multiline record handling
                    inferSchema=True)

# Combining multiLine with other options
# When dealing with multiline records, it's often necessary to combine the multiLine option with other options like quote, escape, delimiter, and header to ensure proper parsing.

# Handling large files efficiently
# Reading large CSV files with multiline records can be resource-intensive. To optimize performance, consider tuning Spark configurations and partitioning the data.

## Initialize SparkSession with tuned configurations
# SparkSession is imported here so the cell runs on a fresh kernel.
from pyspark.sql import SparkSession

# The builder chain is wrapped in parentheses so each option can carry its own
# comment; text after a backslash line continuation is a syntax error.
spark = (
    SparkSession.builder
    .appName("HandleMultilineRecords")
    # Tuned to optimize reading large files: maximum bytes packed into one
    # partition. 128MB matches Spark's documented default, so adjust as needed.
    .config("spark.sql.files.maxPartitionBytes", "128MB")
    # Tuned to optimize reading large files: estimated cost of opening a file,
    # in bytes. 4MB matches Spark's documented default, so adjust as needed.
    .config("spark.sql.files.openCostInBytes", "4MB")
    .getOrCreate()
)

# Read CSV file with multiline option
df = spark.read.csv("path/to/csvfile.csv",
                    header=True,
                    multiLine=True,
                    inferSchema=True)

df.show(truncate=False)


# Example CSV file (path/to/csvfile.csv):
#
# id,name,description,email
# 1,John Doe,"John is a developer
# who works on various projects.",john@example.com
# 2,Jane Smith,"Jane is a data scientist.
# She specializes in machine learning.",jane@example.com

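# In this example:
#
# - The CSV file contains multiline records in the description field.
# - The multiLine=True option allows Spark to read the entire record correctly, even though it spans multiple lines.
# - The quote character (here the default ") keeps the embedded newlines inside the description field; set custom delimiter and quote options when the file uses different characters so the data is parsed accurately.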

