In [43]:
# [+] SparkSession setup: local master, application name 'udf'
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master('local')
    .appName('udf')
    .getOrCreate()
)

In [44]:
# Sample data: arbitrary (name, age) pairs, built from two parallel sequences
name_and_age = list(zip(
    ('정진하', '남궁광훈', '박춘자', '홍현숙', '봉희경', '탁시욱', '조태윤'),
    (24, 31, 65, 45, 35, 54, 25),
))

In [45]:
# [+] Schema definition: column names only (no explicit types are given here)
schema = ["name", "age"]

In [46]:
# [+] Build the DataFrame from the sample rows and the column-name schema
df = spark.createDataFrame(name_and_age, schema=schema)

In [47]:
# Display the DataFrame rows as a text table
df.show()

+--------+---+
|    name|age|
+--------+---+
|  정진하| 24|
|남궁광훈| 31|
|  박춘자| 65|
|  홍현숙| 45|
|  봉희경| 35|
|  탁시욱| 54|
|  조태윤| 25|
+--------+---+



In [48]:
# [+] Print the DataFrame schema
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



In [49]:
# [+] Create a temporary view named 'nameage' so the DataFrame can be queried with SQL
df.createOrReplaceTempView('nameage')

In [50]:
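# Import udf, the helper for annotation (decorator) style UDF definition.
# Note: the UDF in the cells below is registered with spark.udf.register instead.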
from pyspark.sql.functions import udf

In [51]:
# UDF: map an age to a coarse age-group label
def age_category(age):
    """Classify an age as 'young', 'adult', or 'senior'.

    Args:
        age: Numeric age, or None. The source column is nullable, and a
            NULL value reaches a Python UDF as None.

    Returns:
        'young' when age < 35, 'adult' when 35 <= age <= 59, 'senior'
        otherwise. Returns None when age is None so that NULL inputs
        stay NULL instead of raising.
    """
    # Comparing None with an int raises TypeError in Python 3, which would
    # fail the whole query on the first NULL age; propagate the NULL instead.
    if age is None:
        return None
    if age < 35:
        return 'young'
    # The lower bound (35) was already ruled out by the check above,
    # so only the upper bound needs testing here.
    if age <= 59:
        return 'adult'
    return 'senior'

In [61]:
# Register the Python function as a SQL-callable UDF named "age_category".
# No return type is passed here, so PySpark's default applies
# (string per the PySpark docs — confirm for the installed version).
spark.udf.register("age_category", age_category)

<function __main__.age_category(age)>

In [62]:
# Run a SQL query that applies the registered UDF to every row of the view
spark.sql(
    "SELECT name, age_category(age) AS age_category FROM nameage"
).show()

+--------+------------+
|    name|age_category|
+--------+------------+
|  정진하|       young|
|남궁광훈|       young|
|  박춘자|      senior|
|  홍현숙|       adult|
|  봉희경|       adult|
|  탁시욱|       adult|
|  조태윤|       young|
+--------+------------+

