In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Step 1: Obtain a Spark session for this example (getOrCreate reuses an
# already-active session instead of building a second one)
spark = (
    SparkSession.builder
    .appName("PySpark UDF Example")
    .getOrCreate()
)

# Step 2: Distribute the sample rows as an RDD, then attach column names
# to turn it into a DataFrame
data = [
    ("1", "john jones"),
    ("2", "tracey smith"),
    ("3", " amy sanders"),  # leading space is deliberate sample noise
]
columns = ["Seqno", "Name"]
rdd = spark.sparkContext.parallelize(data)
df = rdd.toDF(schema=columns)

# Preview the rows before any transformation is applied
df.show()

+-----+------------+
|Seqno|        Name|
+-----+------------+
|    1|  john jones|
|    2|tracey smith|
|    3| amy sanders|
+-----+------------+



In [4]:
def capitalize_first_letter(text):
    """Return ``text`` with every whitespace-separated word capitalized.

    Leading/trailing whitespace is removed and any run of whitespace between
    words collapses to a single space. Each word goes through
    ``str.capitalize``, so its first character is upper-cased and the
    remaining characters are lower-cased.

    Args:
        text: The string to normalize. Falsy values (``None`` or ``""``)
            are handed back untouched.

    Returns:
        The normalized string, or ``text`` itself when it is falsy.
    """
    # Guard clause: nothing to do for None / empty input
    if not text:
        return text

    # split() with no argument splits on any whitespace run, which is what
    # collapses repeated spaces once the words are re-joined below
    words = text.strip().split()
    capitalized = [word.capitalize() for word in words]
    return " ".join(capitalized)
# Wrap the plain Python function as a Spark UDF that yields a string column
capitalize_udf = udf(capitalize_first_letter, returnType=StringType())

# Apply the UDF: withColumn on the existing "Name" column replaces it in the
# returned DataFrame; `df` itself is left unchanged
df_with_capitalized_names = df.withColumn(
    colName="Name",
    col=capitalize_udf(df["Name"]),
)

# Show the rows after capitalization
df_with_capitalized_names.show()

# Shut the Spark session down now that the example is finished
spark.stop()


+-----+------------+
|Seqno|        Name|
+-----+------------+
|    1|  John Jones|
|    2|Tracey Smith|
|    3| Amy Sanders|
+-----+------------+

