[PySpark withColumn()](https://sparkbyexamples.com/pyspark/pyspark-withcolumn/)

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
        .master("local[5]") \
        .appName("Pyspark_DataFrame_Columns") \
        .getOrCreate()

spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/20 10:30:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Creating a DataFrame

In [3]:
# Column names, in the same order as the fields of each tuple in `data`.
columns = ['firstname', 'middlename', 'lastname', 'dob', 'gender', 'salary']

# Sample rows: (firstname, middlename, lastname, dob, gender, salary).
data = [
    ('James', '', 'Smith', '1991-04-01', 'M', 3000),
    ('Michael', 'Rose', '', '2000-05-19', 'M', 4000),
    ('Robert', '', 'Williams', '1978-09-05', 'M', 4000),
    ('Maria', 'Anne', 'Jones', '1967-12-01', 'F', 4000),
    ('Jen', 'Mary', 'Brown', '1980-02-17', 'F', -1),
]

# Only column names are supplied, so Spark infers each column's type from the data.
df = spark.createDataFrame(data, columns)

In [4]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [5]:
df.show() # Displays up to the first 20 rows of the DataFrame

                                                                                

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



Changing the data type of a column
1. withColumn()

In [6]:
from pyspark.sql.functions import col

# Passing an existing column name to withColumn() replaces that column;
# here salary is cast from long to integer.
df2 = df.withColumn(colName="salary", col=col("salary").cast("Integer"))
df2.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



2. Update the existing column

In [7]:
# Multiply salary by 100, replacing the existing column.
# Built from `df` (with the same integer cast) instead of `df2 = df2...`,
# so re-running this cell gives the same result rather than scaling salary
# by another factor of 100 each time.
df2 = df.withColumn(colName="salary", col=col("salary").cast("Integer") * 100)
df2.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|300000|
|  Michael|      Rose|        |2000-05-19|     M|400000|
|   Robert|          |Williams|1978-09-05|     M|400000|
|    Maria|      Anne|   Jones|1967-12-01|     F|400000|
|      Jen|      Mary|   Brown|1980-02-17|     F|  -100|
+---------+----------+--------+----------+------+------+



3. Create a column from an existing column

In [8]:
# Derive a new column from an existing one. Note: despite its name,
# "CopiedSalary" holds salary + 1 rather than an exact copy of salary.
df3 = df2.withColumn("CopiedSalary", col("salary") + 1)
df3.show()

+---------+----------+--------+----------+------+------+------------+
|firstname|middlename|lastname|       dob|gender|salary|CopiedSalary|
+---------+----------+--------+----------+------+------+------------+
|    James|          |   Smith|1991-04-01|     M|300000|      300001|
|  Michael|      Rose|        |2000-05-19|     M|400000|      400001|
|   Robert|          |Williams|1978-09-05|     M|400000|      400001|
|    Maria|      Anne|   Jones|1967-12-01|     F|400000|      400001|
|      Jen|      Mary|   Brown|1980-02-17|     F|  -100|         -99|
+---------+----------+--------+----------+------+------+------------+



4. Adding a new Column

In [9]:
from pyspark.sql.functions import lit

# lit() wraps a Python constant as a Column, so every row gets the same value.
# withColumn() returns a new DataFrame, so several calls can be chained.
df4 = df2.withColumn(colName="Country", col=lit("USA"))
df4.show()

+---------+----------+--------+----------+------+------+-------+
|firstname|middlename|lastname|       dob|gender|salary|Country|
+---------+----------+--------+----------+------+------+-------+
|    James|          |   Smith|1991-04-01|     M|300000|    USA|
|  Michael|      Rose|        |2000-05-19|     M|400000|    USA|
|   Robert|          |Williams|1978-09-05|     M|400000|    USA|
|    Maria|      Anne|   Jones|1967-12-01|     F|400000|    USA|
|      Jen|      Mary|   Brown|1980-02-17|     F|  -100|    USA|
+---------+----------+--------+----------+------+------+-------+



5. Renaming a column

In [10]:
# Rename the "dob" column to "DateOfBirth"; all other columns are unchanged.
df4 = df4.withColumnRenamed(existing="dob", new="DateOfBirth")
df4.show(truncate=True)

+---------+----------+--------+-----------+------+------+-------+
|firstname|middlename|lastname|DateOfBirth|gender|salary|Country|
+---------+----------+--------+-----------+------+------+-------+
|    James|          |   Smith| 1991-04-01|     M|300000|    USA|
|  Michael|      Rose|        | 2000-05-19|     M|400000|    USA|
|   Robert|          |Williams| 1978-09-05|     M|400000|    USA|
|    Maria|      Anne|   Jones| 1967-12-01|     F|400000|    USA|
|      Jen|      Mary|   Brown| 1980-02-17|     F|  -100|    USA|
+---------+----------+--------+-----------+------+------+-------+



6. Drop Column

In [11]:
# Remove the derived "CopiedSalary" column that was added to df3 earlier.
df3 = df3.drop("CopiedSalary")
df3.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|300000|
|  Michael|      Rose|        |2000-05-19|     M|400000|
|   Robert|          |Williams|1978-09-05|     M|400000|
|    Maria|      Anne|   Jones|1967-12-01|     F|400000|
|      Jen|      Mary|   Brown|1980-02-17|     F|  -100|
+---------+----------+--------+----------+------+------+



In [12]:
# Shut down the SparkSession now that the examples are finished.
spark.stop()