In [2]:
from pyspark.sql import SparkSession

# Column names, in the same order as the fields of each record below.
columns = ["firstname", "middlename", "lastname", "dob", "gender", "salary"]

# Sample person records. Missing name parts are empty strings and dob is an
# ISO-formatted date kept as plain text.
# NOTE(review): the salary of -1 on the last record looks like a placeholder
# value -- confirm before using it in arithmetic.
data = [
    ("James", "", "Smith", "1991-04-01", "M", 3000),
    ("Michael", "Rose", "", "2000-05-19", "M", 4000),
    ("Robert", "", "Williams", "1978-09-05", "M", 4000),
    ("Maria", "Anne", "Jones", "1967-12-01", "F", 4000),
    ("Jen", "Mary", "Brown", "1980-02-17", "F", -1),
]

# Reuse an existing session if one is running, otherwise start a new one.
spark = SparkSession.builder.appName("Spark_EX").getOrCreate()

# Only column names are supplied, so Spark infers the column types from the
# Python values (the printed schema shows string columns and a long salary).
df = spark.createDataFrame(data=data, schema=columns)

df.printSchema()
df.show()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



### Change DataType using PySpark withColumn()

In [6]:
# The wildcard import pulls the exported names of pyspark.sql.functions into
# the notebook namespace; the cells shown here use `col` and `when` from it.
from pyspark.sql.functions import *

# Cast `salary` to integer on a derived DataFrame and print that schema only;
# `df` itself is not reassigned by this cell.
(
    df
    .withColumn("salary", col("salary").cast("integer"))
    .printSchema()
)

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



### Update The Value of an Existing Column
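- Syntax<br>
`DataFrame.withColumn(colName, col)` returns a new DataFrame: if `colName` already exists, that column is replaced; otherwise a new column is added.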

In [9]:

# "Sal" is not an existing column name, so this adds a new column holding
# salary + 100 next to the untouched `salary` column. The result is only
# displayed; `df` is not reassigned.
(
    df
    .withColumn("Sal", col("salary") + 100)
    .show()
)

+---------+----------+--------+----------+------+------+----+
|firstname|middlename|lastname|       dob|gender|salary| Sal|
+---------+----------+--------+----------+------+------+----+
|    James|          |   Smith|1991-04-01|     M|  3000|3100|
|  Michael|      Rose|        |2000-05-19|     M|  4000|4100|
|   Robert|          |Williams|1978-09-05|     M|  4000|4100|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|4100|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|  99|
+---------+----------+--------+----------+------+------+----+



In [11]:
# Reusing the existing column name "salary" replaces that column's values
# (salary + 300) in the derived DataFrame instead of appending a new column.
df2 = df.withColumn("salary", df["salary"] + 300)
df2.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3300|
|  Michael|      Rose|        |2000-05-19|     M|  4300|
|   Robert|          |Williams|1978-09-05|     M|  4300|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4300|
|      Jen|      Mary|   Brown|1980-02-17|     F|   299|
+---------+----------+--------+----------+------+------+



In [19]:
# Expand the one-letter gender codes into full labels. Values other than
# "M" or "F" fall through to `otherwise` and keep their original value.
df3 = df.withColumn(
    "gender",
    when(df["gender"] == "M", "Male")
    .when(df["gender"] == "F", "Female")
    .otherwise(df["gender"]),
)
df3.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|  Male|  3000|
|  Michael|      Rose|        |2000-05-19|  Male|  4000|
|   Robert|          |Williams|1978-09-05|  Male|  4000|
|    Maria|      Anne|   Jones|1967-12-01|Female|  4000|
|      Jen|      Mary|   Brown|1980-02-17|Female|    -1|
+---------+----------+--------+----------+------+------+

