# withColumn() + withColumnRenamed()

In [16]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit

# Get the active SparkSession, or create one with the application name "example-04".
spark = SparkSession.builder.appName("example-04").getOrCreate()

In [13]:
# Sample rows in the order: firstname, middlename, lastname, dob, gender, salary.
# Missing name parts are empty strings; dob is kept as a 'YYYY-MM-DD' string.
data = [
    ("James", "", "Smith", "1991-04-01", "M", 3000),
    ("Michael", "Rose", "", "2000-05-19", "M", 4000),
    ("Robert", "", "Williams", "1978-09-05", "M", 4000),
    ("Maria", "Anne", "Jones", "1967-12-01", "F", 4000),
    ("Jen", "Mary", "Brown", "1980-02-17", "F", -1),
]

# Only column names are supplied here; no explicit column types are given.
columns = ["firstname", "middlename", "lastname", "dob", "gender", "salary"]

# Base DataFrame used by every example cell below.
df = spark.createDataFrame(data, schema=columns)

In [14]:
# Change a column's data type with withColumn(): cast "salary" to Integer.
# The result is only displayed; `df` is not reassigned.
(
    df
    .withColumn("salary", col("salary").cast("Integer"))
    .show()
)

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



In [15]:
# Update the values of an existing column: show "salary" multiplied by 10.
# Passing the existing column name keeps the same set of columns in the output.
# The result is only displayed; `df` is not reassigned.
(
    df
    .withColumn("salary", col("salary") * 10)
    .show()
)

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M| 30000|
|  Michael|      Rose|        |2000-05-19|     M| 40000|
|   Robert|          |Williams|1978-09-05|     M| 40000|
|    Maria|      Anne|   Jones|1967-12-01|     F| 40000|
|      Jen|      Mary|   Brown|1980-02-17|     F|   -10|
+---------+----------+--------+----------+------+------+



In [17]:
# Add a NEW column "new_salary" holding the square of "salary"; the existing
# "salary" column is left untouched because a different column name is used.
# The ** operator is applied to the Column directly, so this cell does not depend
# on which `pow` (Python builtin vs. pyspark.sql.functions.pow) is in scope.
# The result is only displayed, not assigned, so `df` does not gain the column.
df.withColumn("new_salary", col("salary") ** lit(2)).show()

+---------+----------+--------+----------+------+------+----------+
|firstname|middlename|lastname|       dob|gender|salary|new_salary|
+---------+----------+--------+----------+------+------+----------+
|    James|          |   Smith|1991-04-01|     M|  3000| 9000000.0|
|  Michael|      Rose|        |2000-05-19|     M|  4000|     1.6E7|
|   Robert|          |Williams|1978-09-05|     M|  4000|     1.6E7|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|     1.6E7|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|       1.0|
+---------+----------+--------+----------+------+------+----------+



In [19]:
# Rename a column: display the data with "gender" renamed to "sex".
# truncate=False prints full cell values instead of shortening long ones.
(
    df
    .withColumnRenamed("gender", "sex")
    .show(truncate=False)
)

+---------+----------+--------+----------+---+------+
|firstname|middlename|lastname|dob       |sex|salary|
+---------+----------+--------+----------+---+------+
|James    |          |Smith   |1991-04-01|M  |3000  |
|Michael  |Rose      |        |2000-05-19|M  |4000  |
|Robert   |          |Williams|1978-09-05|M  |4000  |
|Maria    |Anne      |Jones   |1967-12-01|F  |4000  |
|Jen      |Mary      |Brown   |1980-02-17|F  |-1    |
+---------+----------+--------+----------+---+------+



In [20]:
# Drop a column from a PySpark DataFrame.
# `df` itself has no "new_salary" column (earlier withColumn() results were only
# shown, never assigned), and drop() silently ignores unknown column names, so
# the column is added in the same chain to make the drop actually remove something.
(
    df
    .withColumn("new_salary", col("salary") ** lit(2))
    .drop("new_salary")
    .show()
)

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+

