In [0]:

from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType


# Header names, one per tuple position in the rows below
columns = ['Fruits', 'Animals', 'Colour', 'ID_Num']

# Demo rows: the string cells carry deliberate leading/trailing spaces and
# every column holds at least one None, so both trimming and NULL handling
# have something to act on.
data = [
    ('  apple  ', '  dog  ', None, 123),
    ('  banana  ', None, '  blue  ', None),
    (None, '  mouse  ', '  green  ', 789),
    ('  grape  ', '  horse  ', '  yellow  ', 456),
    ('  melon  ', '  cat  ', '  orange  ', None),
]

# Build the DataFrame from the rows; only column names are supplied here
df = spark.createDataFrame(data=data, schema=columns)

# Show the untouched input so it can be compared with the cleaned result
print("Original DataFrame:")
df.show()



Original DataFrame:
+----------+---------+----------+------+
|    Fruits|  Animals|    Colour|ID_Num|
+----------+---------+----------+------+
|   apple  |    dog  |      null|   123|
|  banana  |     null|    blue  |  null|
|      null|  mouse  |   green  |   789|
|   grape  |  horse  |  yellow  |   456|
|   melon  |    cat  |  orange  |  null|
+----------+---------+----------+------+



In [0]:
# Define the trimming function
def Replace(value, default='default_value'):
    """Normalise a single cell value to a whitespace-trimmed string.

    Args:
        value: The cell value; may be None, a str, or any other object.
        default: Value returned, unchanged, in place of a missing (None)
            ``value``. Defaults to ``'default_value'``.

    Returns:
        ``default`` when ``value`` is None; otherwise the text of ``value``
        (converted with ``str()`` if it is not already a string) with
        leading and trailing whitespace removed.
    """
    if value is None:
        return default
    # Strings are trimmed as-is; anything else is stringified first.
    text = value if isinstance(value, str) else str(value)
    return text.strip()

# Register the UDF; its declared return type is StringType
trim_udf = udf(Replace, StringType())

# Apply the UDF to every column in a single select() instead of a chain of
# withColumn() calls: each withColumn() adds another projection to the query
# plan, which the Spark docs warn can produce very large plans when done in a
# loop. alias() keeps the original column names, and the order is unchanged.
df = df.select(
    [trim_udf(col(col_name)).alias(col_name) for col_name in df.columns]
)

# Display trimmed DataFrame
print("\nTrimmed DataFrame:")
df.show()


Trimmed DataFrame:
+-------------+-------------+-------------+-------------+
|       Fruits|      Animals|       Colour|       ID_Num|
+-------------+-------------+-------------+-------------+
|        apple|          dog|default_value|          123|
|       banana|default_value|         blue|default_value|
|default_value|        mouse|        green|          789|
|        grape|        horse|       yellow|          456|
|        melon|          cat|       orange|default_value|
+-------------+-------------+-------------+-------------+



In [0]:
# Render df (at this point the trimmed DataFrame) as a table.
# NOTE(review): display() is not imported in this notebook; presumably it is
# the notebook runtime's built-in — confirm.
display(df)

Fruits,Animals,Colour,ID_Num
apple,dog,default_value,123
banana,default_value,blue,default_value
default_value,mouse,green,789
grape,horse,yellow,456
melon,cat,orange,default_value


In [0]:
# Explanation:
# Import PySpark libraries: Import necessary functions and types from pyspark.sql.
#Sample data: Define sample data that includes various data types and None values, and define column names.
#Create DataFrame: Create a PySpark DataFrame using the sample data.
#Display original DataFrame: Use the show() method to display the original DataFrame.
# Define the trimming function: Create a function Replace that trims whitespace from string values, replaces None with a default value ('default_value'), and converts non-string values to strings and trims them.
#Register the UDF: Register the trimming function as a UDF using udf and specify the return type as StringType.
# Apply the UDF to each column: Iterate over df.columns and apply the UDF to every column, replacing each column with its cleaned version under the same name.
#Display trimmed DataFrame: Use the show() method to display the trimmed DataFrame.