In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session that the later cells rely on.
spark = (
    SparkSession.builder
    .appName("select_vs_withColumn")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/26 12:49:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


- Sample DataFrame

In [3]:
# Column names only; no explicit types are supplied here.
employee_schema = ["id", "FirstName", "LastName", "Salary"]

# One tuple per employee, ordered to match employee_schema.
employee_data = [
    (111, "Stephen", "King", 2000),
    (222, "Philipp", "Larkin", 8000),
    (333, "John", "Smith", 6000),
]

# Base frame that the column-adding examples start from.
df = spark.createDataFrame(employee_data, employee_schema)
df.show(truncate=False)

                                                                                

+---+---------+--------+------+
|id |FirstName|LastName|Salary|
+---+---------+--------+------+
|111|Stephen  |King    |2000  |
|222|Philipp  |Larkin  |8000  |
|333|John     |Smith   |6000  |
+---+---------+--------+------+



- Add new columns with multiple chained `withColumn` calls

In [4]:
from pyspark.sql.functions import col, concat, lit, current_timestamp

In [5]:
# Column expressions, named up front so the chain below reads as a list of steps.
full_name_expr = concat(col("FirstName"), lit(" "), col("LastName"))
bonus_percent_expr = lit(10)
# NOTE(review): this multiplies Salary by the raw percent value (2000 * 10 -> 20000);
# it does not add a 10% bonus on top of the salary — confirm that is intended.
total_salary_expr = col("Salary") * col("BonusPercent")

# One withColumn call per new column, chained on the base frame.
dfWithColumn = (
    df
    .withColumn("Name", full_name_expr)
    .withColumn("BonusPercent", bonus_percent_expr)
    .withColumn("TotalSalary", total_salary_expr)
    .withColumn("DateCreated", current_timestamp())
)

dfWithColumn.show(truncate=False)

+---+---------+--------+------+--------------+------------+-----------+-------------------------+
|id |FirstName|LastName|Salary|Name          |BonusPercent|TotalSalary|DateCreated              |
+---+---------+--------+------+--------------+------------+-----------+-------------------------+
|111|Stephen  |King    |2000  |Stephen King  |10          |20000      |2024-02-26 12:57:15.43089|
|222|Philipp  |Larkin  |8000  |Philipp Larkin|10          |80000      |2024-02-26 12:57:15.43089|
|333|John     |Smith   |6000  |John Smith    |10          |60000      |2024-02-26 12:57:15.43089|
+---+---------+--------+------+--------------+------------+-----------+-------------------------+



- Add new columns with multiple intermediate DataFrames (one `withColumn` call per assignment)

In [6]:
# One withColumn per statement, with every intermediate DataFrame bound to its own name.
with_name = df.withColumn("Name", concat(col("FirstName"), lit(" "), col("LastName")))
with_bonus = with_name.withColumn("BonusPercent", lit(10))
# NOTE(review): Salary is multiplied by the raw percent value (10), not increased by 10% — confirm.
with_total = with_bonus.withColumn("TotalSalary", col("Salary") * col("BonusPercent"))
dfWithCol = with_total.withColumn("DateCreated", current_timestamp())

dfWithCol.show(truncate=False)

+---+---------+--------+------+--------------+------------+-----------+--------------------------+
|id |FirstName|LastName|Salary|Name          |BonusPercent|TotalSalary|DateCreated               |
+---+---------+--------+------+--------------+------------+-----------+--------------------------+
|111|Stephen  |King    |2000  |Stephen King  |10          |20000      |2024-02-26 13:00:23.526588|
|222|Philipp  |Larkin  |8000  |Philipp Larkin|10          |80000      |2024-02-26 13:00:23.526588|
|333|John     |Smith   |6000  |John Smith    |10          |60000      |2024-02-26 13:00:23.526588|
+---+---------+--------+------+--------------+------------+-----------+--------------------------+



#### Select vs withColumn

In [9]:
# Expressions shared by both variants below.
name_expr = concat(col("FirstName"), lit(" "), col("LastName"))
bonus_percent_lit = lit(10)

# --- select(): every new column is requested in a single call ---
select_columns = [
    "*",
    name_expr.alias("Name"),
    bonus_percent_lit.alias("BonusPercent"),
    # The BonusPercent alias is defined in this same select, so the literal is
    # used directly instead of referencing it by column name.
    (col("Salary") * bonus_percent_lit).alias("TotalSalary"),
    current_timestamp().alias("DateCreated"),
]
dfSelect = df.select(*select_columns)
dfSelect.show(truncate=False)

# --- withColumn(): the same columns, one call per column ---
dfWithColumn = (
    df
    .withColumn("Name", name_expr)
    .withColumn("BonusPercent", bonus_percent_lit)
    .withColumn("TotalSalary", col("Salary") * col("BonusPercent"))
    .withColumn("DateCreated", current_timestamp())
)

dfWithColumn.show(truncate=False)

+---+---------+--------+------+--------------+------------+-----------+--------------------------+
|id |FirstName|LastName|Salary|Name          |BonusPercent|TotalSalary|DateCreated               |
+---+---------+--------+------+--------------+------------+-----------+--------------------------+
|111|Stephen  |King    |2000  |Stephen King  |10          |20000      |2024-02-26 13:08:17.706758|
|222|Philipp  |Larkin  |8000  |Philipp Larkin|10          |80000      |2024-02-26 13:08:17.706758|
|333|John     |Smith   |6000  |John Smith    |10          |60000      |2024-02-26 13:08:17.706758|
+---+---------+--------+------+--------------+------------+-----------+--------------------------+

+---+---------+--------+------+--------------+------------+-----------+--------------------------+
|id |FirstName|LastName|Salary|Name          |BonusPercent|TotalSalary|DateCreated               |
+---+---------+--------+------+--------------+------------+-----------+--------------------------+
|111|Step

<h5>Notes</h5><br>
<h4>withColumn()</h4><br>
- This method introduces a projection internally. Therefore, calling it multiple times, for instance, via loops in order to add multiple columns can generate big plans which can cause performance issues and even StackOverflowException. To avoid this, use select() with the multiple columns at once.