In [0]:
# Handling whitespace when reading CSV files
#
# Leading and trailing whitespace in CSV fields can lead to inaccuracies and
# misinterpretations once the data is loaded into a Spark DataFrame.
#
# Using the ignoreLeadingWhiteSpace and ignoreTrailingWhiteSpace options
#
# These two reader options trim leading and trailing whitespace from fields
# while the file is being read.

# SparkSession has to be imported before it is used below; without this the
# cell raises NameError on a fresh kernel.
from pyspark.sql import SparkSession

# Initialize SparkSession (getOrCreate reuses an already-running session).
spark = SparkSession.builder.appName("HandleWhitespace").getOrCreate()

# Read CSV file with whitespace handling options
df = spark.read.csv(
    "path/to/csvfile.csv",
    header=True,
    ignoreLeadingWhiteSpace=True,   # Ignore leading whitespace
    ignoreTrailingWhiteSpace=True,  # Ignore trailing whitespace
    inferSchema=True,
)

                    



In [0]:
# Using the trim function on DataFrame columns
#
# You can manually trim whitespace from specific columns using the trim
# function from pyspark.sql.functions.

# trim has to be imported before it is used below; without this the cell
# raises NameError on a fresh kernel.
from pyspark.sql.functions import trim

# Read CSV file without whitespace handling options
df = spark.read.csv("path/to/csvfile.csv", header=True, inferSchema=True)

# Manually trim whitespace from specific columns.
# In this example, the name and email columns are manually trimmed of leading
# and trailing whitespace; every other column is left as read.
df_trimmed = (
    df.withColumn("name", trim(df["name"]))
      .withColumn("email", trim(df["email"]))
)

# Handling whitespace in column names

# Read CSV file with header
df = spark.read.csv("path/to/csvfile.csv", header=True, inferSchema=True)

# Remove leading/trailing whitespace from column names.
# The loop variable is called column_name rather than col so it is not
# confused with pyspark.sql.functions.col.
df = df.toDF(*[column_name.strip() for column_name in df.columns])

df.show()

In [0]:
# Using the options method for multiple configurations
#
# You can set multiple reader configurations in a single call with the
# options method for a cleaner approach.
#
# Example: read CSV file with multiple configurations using options
df = spark.read.options(
    header=True,
    ignoreLeadingWhiteSpace=True,
    ignoreTrailingWhiteSpace=True,
    inferSchema=True,
).csv("path/to/csvfile.csv")

df.show()
# This example uses the options method to set multiple configurations at once.