# PySpark withColumn() Usage with Examples

PySpark withColumn() is a transformation function of DataFrame which is used to change the value, convert the datatype of an existing column, create a new column, and many more.

In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("spark_withColumn").getOrCreate()

In [0]:
data = [('James','','Smith','1991-04-01','M',3000),
  ('Michael','Rose','','2000-05-19','M',4000),
  ('Robert','','Williams','1978-09-05','M',4000),
  ('Maria','Anne','Jones','1967-12-01','F',4000),
  ('Jen','Mary','Brown','1980-02-17','F',3500)
]

columns = ["firstname","middlename","lastname","dob","gender","salary"]

df = spark.createDataFrame(data,columns)
df.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|  3500|
+---------+----------+--------+----------+------+------+



### Change DataType using PySpark withColumn()

In [0]:
from pyspark.sql.functions import col
df.withColumn("salary",col("salary").cast("Integer")).show()
df.printSchema()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|  3500|
+---------+----------+--------+----------+------+------+

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



###Update The Value of an Existing Column

In [0]:
df.withColumn("salary",col("salary")+1000).show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  4000|
|  Michael|      Rose|        |2000-05-19|     M|  5000|
|   Robert|          |Williams|1978-09-05|     M|  5000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  5000|
|      Jen|      Mary|   Brown|1980-02-17|     F|  4500|
+---------+----------+--------+----------+------+------+



###Create a Column from an Existing

In [0]:
df2 = df.withColumn("bonus",col("salary")/10)
df2.show()
df2.printSchema()

+---------+----------+--------+----------+------+------+-----+
|firstname|middlename|lastname|       dob|gender|salary|bonus|
+---------+----------+--------+----------+------+------+-----+
|    James|          |   Smith|1991-04-01|     M|  3000|300.0|
|  Michael|      Rose|        |2000-05-19|     M|  4000|400.0|
|   Robert|          |Williams|1978-09-05|     M|  4000|400.0|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|400.0|
|      Jen|      Mary|   Brown|1980-02-17|     F|  3500|350.0|
+---------+----------+--------+----------+------+------+-----+

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- bonus: double (nullable = true)



###Add a New Column using withColumn()

In [0]:
from pyspark.sql.functions import lit
df3 = df2.withColumn("city",lit("pune"))
df3.show()

+---------+----------+--------+----------+------+------+-----+----+
|firstname|middlename|lastname|       dob|gender|salary|bonus|city|
+---------+----------+--------+----------+------+------+-----+----+
|    James|          |   Smith|1991-04-01|     M|  3000|300.0|pune|
|  Michael|      Rose|        |2000-05-19|     M|  4000|400.0|pune|
|   Robert|          |Williams|1978-09-05|     M|  4000|400.0|pune|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|400.0|pune|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1| -0.1|pune|
+---------+----------+--------+----------+------+------+-----+----+



### Rename Column Name

In [0]:
df3.withColumnRenamed("gender","Sex").show()

+---------+----------+--------+----------+---+------+-----+----+
|firstname|middlename|lastname|       dob|Sex|salary|bonus|city|
+---------+----------+--------+----------+---+------+-----+----+
|    James|          |   Smith|1991-04-01|  M|  3000|300.0|pune|
|  Michael|      Rose|        |2000-05-19|  M|  4000|400.0|pune|
|   Robert|          |Williams|1978-09-05|  M|  4000|400.0|pune|
|    Maria|      Anne|   Jones|1967-12-01|  F|  4000|400.0|pune|
|      Jen|      Mary|   Brown|1980-02-17|  F|    -1| -0.1|pune|
+---------+----------+--------+----------+---+------+-----+----+



###  Drop Column From PySpark DataFrame

In [0]:
df3.drop("bonus").show()

+---------+----------+--------+----------+------+------+----+
|firstname|middlename|lastname|       dob|gender|salary|city|
+---------+----------+--------+----------+------+------+----+
|    James|          |   Smith|1991-04-01|     M|  3000|pune|
|  Michael|      Rose|        |2000-05-19|     M|  4000|pune|
|   Robert|          |Williams|1978-09-05|     M|  4000|pune|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|pune|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|pune|
+---------+----------+--------+----------+------+------+----+



In [0]:
df3.show()

+---------+----------+--------+----------+------+------+-----+----+
|firstname|middlename|lastname|       dob|gender|salary|bonus|city|
+---------+----------+--------+----------+------+------+-----+----+
|    James|          |   Smith|1991-04-01|     M|  3000|300.0|pune|
|  Michael|      Rose|        |2000-05-19|     M|  4000|400.0|pune|
|   Robert|          |Williams|1978-09-05|     M|  4000|400.0|pune|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|400.0|pune|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1| -0.1|pune|
+---------+----------+--------+----------+------+------+-----+----+



In [0]:
from pyspark.sql.functions import when
df3.withColumn("sex",when(df["gender"] == 'M', "male").when(df["gender"] == 'F', "female").otherwise("Not Vailable")).show()

+---------+----------+--------+----------+------+------+-----+----+------+
|firstname|middlename|lastname|       dob|gender|salary|bonus|city|   sex|
+---------+----------+--------+----------+------+------+-----+----+------+
|    James|          |   Smith|1991-04-01|     M|  3000|300.0|pune|  male|
|  Michael|      Rose|        |2000-05-19|     M|  4000|400.0|pune|  male|
|   Robert|          |Williams|1978-09-05|     M|  4000|400.0|pune|  male|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|400.0|pune|female|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1| -0.1|pune|female|
+---------+----------+--------+----------+------+------+-----+----+------+

