In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col

# Build the session for this notebook, or reuse one that already exists
# (getOrCreate). The master URL targets a standalone cluster master, and
# executor memory is set to 512 MB.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("write-udfs")
    .config("spark.executor.memory", "512m")
    .getOrCreate()
)

# Raise the log threshold so only ERROR-level messages reach the cell output.
spark.sparkContext.setLogLevel("ERROR")

In [2]:
# Load the Nobel prizes JSON file. multiLine lets the reader parse a JSON
# document that spans several lines rather than one record per line.
df = spark.read.json("../data/nobel_prizes.json", multiLine=True)

                                                                                

In [5]:
# Produce one row per laureate: explode the `laureates` array so each element
# becomes its own row, then lift the struct fields to top-level columns.
#
# The null filter runs *before* the select because it reads its fields through
# the `laureates` struct, and the select does not keep that column. Filtering
# afterwards only works through Spark's implicit resolution of columns that are
# missing from the projection.
df_flattened = (
    df
    .withColumn("laureates", explode(col("laureates")))
    .filter(
        col("laureates.firstname").isNotNull()
        & col("laureates.surname").isNotNull()
    )
    .select(
        col("category"),
        col("year"),
        col("overallMotivation"),
        col("laureates.id"),
        col("laureates.firstname"),
        col("laureates.surname"),
        col("laureates.share"),
        col("laureates.motivation"),
    )
)

In [6]:
def concat(first_name, last_name):
    """Join a first name and a last name with a single space.

    Args:
        first_name: Given name as a string, or None.
        last_name: Family name as a string, or None.

    Returns:
        The string "<first_name> <last_name>", or None when either part is
        None.
    """
    # Spark hands SQL NULL to a Python UDF as None; `None + " "` would raise
    # TypeError and fail the whole job, so propagate the null instead.
    if first_name is None or last_name is None:
        return None
    return first_name + " " + last_name

In [7]:
from pyspark.sql.functions import udf
# Wrap the plain Python function as a Spark UDF. No return type is passed, so
# udf() uses its default return type (StringType).
concat_udf = udf(f=concat)

In [8]:
from pyspark.sql.types import StringType
# Rebind concat_udf, this time spelling out the return type explicitly.
concat_udf = udf(concat, returnType=StringType())

In [9]:
# Add a `full_name` column computed through the Python UDF from the
# `firstname` and `surname` columns.
df_flattened = df_flattened.withColumn(
    "full_name",
    concat_udf(col("firstname"), col("surname")),
)

In [10]:
# Trigger execution and print up to 20 rows, truncating long string values.
df_flattened.show(n=20, truncate=True)

[Stage 1:>                                                          (0 + 1) / 1]

+----------+----+--------------------+----+----------+-----------+-----+--------------------+-----------------+
|  category|year|   overallMotivation|  id| firstname|    surname|share|          motivation|        full_name|
+----------+----+--------------------+----+----------+-----------+-----+--------------------+-----------------+
| chemistry|2022|                null|1015|   Carolyn|   Bertozzi|    3|"for the developm...| Carolyn Bertozzi|
| chemistry|2022|                null|1016|    Morten|     Meldal|    3|"for the developm...|    Morten Meldal|
| chemistry|2022|                null| 743|     Barry|  Sharpless|    3|"for the developm...|  Barry Sharpless|
| economics|2022|                null|1021|       Ben|   Bernanke|    3|"for research on ...|     Ben Bernanke|
| economics|2022|                null|1022|   Douglas|    Diamond|    3|"for research on ...|  Douglas Diamond|
| economics|2022|                null|1023|    Philip|     Dybvig|    3|"for research on ...|    Philip 

                                                                                

### Using UDFs in Spark SQL

In [11]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

# Define the Python function that backs the SQL UDF
def square_udf(x):
    """Return ``x`` squared, or None when ``x`` is None.

    Args:
        x: A number, or None.

    Returns:
        ``x ** 2``, or None if ``x`` is None.
    """
    # Spark passes SQL NULL as None; `None ** 2` would raise TypeError and
    # abort the query, so propagate the null instead.
    if x is None:
        return None
    return x ** 2

# Register the Python function under the SQL name "square", declaring an
# integer return type.
spark.udf.register("square", square_udf, IntegerType())

# Build a small single-column DataFrame. It gets its own name so that the `df`
# loaded from the JSON file earlier in the notebook is not overwritten, which
# keeps the earlier cells re-runnable.
numbers_df = spark.createDataFrame([(1,), (2,), (3,), (4,), (5,)], ["num"])

# Expose the DataFrame to Spark SQL as the temp view "numbers", then call the
# registered UDF from a SQL query.
numbers_df.createOrReplaceTempView("numbers")
result = spark.sql("SELECT num, square(num) AS square_num FROM numbers")

# Show the result
result.show()

                                                                                

+---+----------+
|num|square_num|
+---+----------+
|  1|         1|
|  2|         4|
|  3|         9|
|  4|        16|
|  5|        25|
+---+----------+



In [12]:
# Stop the Spark session created at the top of the notebook.
spark.stop()