- UDFs can be less efficient than built-in functions because data must be serialized and deserialized between the JVM and the Python worker. For better performance, prefer built-in functions (for example, `pyspark.sql.functions.upper` for upper-casing a string column) or pandas UDFs when possible.

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

In [0]:
def to_upper_case(text):
    """Return ``text`` converted to upper case, passing ``None`` through.

    Args:
        text: The string to convert, or ``None``.

    Returns:
        The upper-cased string, or ``None`` when ``text`` is ``None``.
    """
    # A NULL column value reaches a Python UDF as None; calling .upper() on
    # it would raise AttributeError and fail the whole Spark job.
    if text is None:
        return None
    return text.upper()


# Wrap the plain Python function as a Spark UDF that returns a string column.
upper_case_udf = udf(f=to_upper_case, returnType=StringType())

In [0]:
# Sample rows: one single-field tuple per person.
data = [("Alice",), ("Bob",), ("Charlie",)]
df = spark.createDataFrame(data, schema=["name"])

# Add an "upper_name" column by applying the UDF to "name", then print it.
name_column = df["name"]
df = df.withColumn("upper_name", upper_case_udf(name_column))
df.show()

+-------+----------+
|   name|upper_name|
+-------+----------+
|  Alice|     ALICE|
|    Bob|       BOB|
|Charlie|   CHARLIE|
+-------+----------+

