In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
# Start (or reuse) the Spark session used by the UDF examples.
spark = (
    SparkSession.builder
    .appName('udf')
    .getOrCreate()
)

In [3]:
# Sample (user_id, score) rows for the classification examples.
data = [("U1", 85), ("U2", 72), ("U3", 40)]

# Only the column names are supplied; Spark infers the column types from the tuples.
df = spark.createDataFrame(data, schema=["user_id", "score"])

Approach 1: Python UDF

In [12]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType
def classify(score):
    """Map a numeric score to a performance tier.

    Args:
        score: Numeric score, or None when the source value is null.

    Returns:
        "High" for scores of 80 or more, "Medium" for scores from 50 up to
        (but not including) 80, "Low" for anything smaller, and None when
        ``score`` is None.
    """
    # Spark passes SQL NULLs to Python UDFs as None; comparing None with an
    # int raises TypeError in Python 3, so propagate the null instead of
    # failing the whole job on one missing value.
    if score is None:
        return None
    if score >= 80:
        return "High"
    if score >= 50:
        return "Medium"
    return "Low"
# Wrap the Python function as a Spark UDF that produces a string column.
performance_udf = udf(classify, returnType=StringType())

# Add (or overwrite) the "Performance" column; `df` is rebound so that
# later cells can sort on it.
df = df.withColumn("Performance", performance_udf("score"))
df.show()

+-------+-----+-----------+
|user_id|score|Performance|
+-------+-----+-----------+
|     U1|   85|       High|
|     U2|   72|     Medium|
|     U3|   40|        Low|
+-------+-----+-----------+



Approach 2: built-in PySpark column functions (when / otherwise)

In [13]:
from pyspark.sql.functions import when, col
# Same tiers built from native column expressions: the first `when` branch
# whose condition holds wins, and `otherwise` supplies the fallback label.
performance_expr = (
    when(col("score") >= 80, "High")
    .when(col("score") >= 50, "Medium")
    .otherwise("Low")
)
df.withColumn("Performance", performance_expr).show()

+-------+-----+-----------+
|user_id|score|Performance|
+-------+-----+-----------+
|     U1|   85|       High|
|     U2|   72|     Medium|
|     U3|   40|        Low|
+-------+-----+-----------+



In [14]:
# Print every plan stage (parsed, analyzed, optimized, physical) for the
# UDF-based column; the UDF call shows up as a BatchEvalPython step.
df.withColumn("Performance", performance_udf(col("score"))).explain(extended=True)

== Parsed Logical Plan ==
'Project [unresolvedstarwithcolumns(Performance, classify('score)#80, None)]
+- Project [user_id#0, score#1L, classify(score#1L)#56 AS Performance#57]
   +- LogicalRDD [user_id#0, score#1L], false

== Analyzed Logical Plan ==
user_id: string, score: bigint, Performance: string
Project [user_id#0, score#1L, classify(score#1L)#80 AS Performance#81]
+- Project [user_id#0, score#1L, classify(score#1L)#56 AS Performance#57]
   +- LogicalRDD [user_id#0, score#1L], false

== Optimized Logical Plan ==
Project [user_id#0, score#1L, pythonUDF0#82 AS Performance#81]
+- BatchEvalPython [classify(score#1L)#80], [pythonUDF0#82]
   +- LogicalRDD [user_id#0, score#1L], false

== Physical Plan ==
*(2) Project [user_id#0, score#1L, pythonUDF0#82 AS Performance#81]
+- BatchEvalPython [classify(score#1L)#80], [pythonUDF0#82]
   +- *(1) Scan ExistingRDD[user_id#0,score#1L]



In [8]:
# Sort by score, lowest first (ascending is the default direction).
df.orderBy("score", ascending=True).show()

+-------+-----+
|user_id|score|
+-------+-----+
|     U3|   40|
|     U2|   72|
|     U1|   85|
+-------+-----+



In [9]:
# Sort by score, highest first.
df.orderBy("score", ascending=False).show()

+-------+-----+
|user_id|score|
+-------+-----+
|     U1|   85|
|     U2|   72|
|     U3|   40|
+-------+-----+



In [15]:
# Sort by tier label, then by score (highest first) within each tier.
# The label is a string, so tiers sort alphabetically: High, Low, Medium.
# The column is spelled exactly as it was created ("Performance") so the
# lookup does not depend on Spark's case-insensitive name resolution.
df.orderBy("Performance", df.score.desc()).show()

+-------+-----+-----------+
|user_id|score|Performance|
+-------+-----+-----------+
|     U1|   85|       High|
|     U3|   40|        Low|
|     U2|   72|     Medium|
+-------+-----+-----------+



Set operations

In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
# getOrCreate() hands back the already-running session when one exists;
# otherwise it starts a new one with this application name.
spark = (
    SparkSession.builder
    .appName('set_operations')
    .getOrCreate()
)

In [17]:
# First set of (user_id, course) rows for the set-operation examples.
data_a = [("U1", "Python"), ("U2", "Java"), ("U3", "Spark")]
df_a = spark.createDataFrame(data_a, schema=["user_id", "course"])

In [18]:
# Second set of (user_id, course) rows; shares the U2 and U3 rows with data_a.
data_b = [("U2", "Java"), ("U3", "Spark"), ("U4", "Python")]
df_b = spark.createDataFrame(data_b, schema=["user_id", "course"])

In [19]:
# union() pairs columns by position and keeps duplicate rows
# (SQL UNION ALL semantics), so the shared U2/U3 rows appear twice.
combined = df_a.union(df_b)
combined.show()

+-------+------+
|user_id|course|
+-------+------+
|     U1|Python|
|     U2|  Java|
|     U3| Spark|
|     U2|  Java|
|     U3| Spark|
|     U4|Python|
+-------+------+



In [20]:
# Keep only the rows that occur in both frames.
common = df_a.intersect(df_b)
common.show()

+-------+------+
|user_id|course|
+-------+------+
|     U3| Spark|
|     U2|  Java|
+-------+------+

