ADDING, UPDATING, RENAMING AND DROPPING COLUMNS IN A PYSPARK DATAFRAME

In [28]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,lit

In [23]:
# Sample records, one tuple per person, in the field order
# (firstname, middlename, lastname, dob, gender, salary).
data = [
    ("James", "", "Smith", "1991-04-01", "M", 3000),
    ("Michael", "Rose", "", "2000-05-19", "M", 4000),
    ("Robert", "", "Williams", "1978-09-05", "M", 4000),
    ("Maria", "Anne", "Jones", "1967-12-01", "F", 4000),
    ("Jen", "Mary", "Brown", "1980-02-17", "F", -1),
]


In [24]:
# Column names, listed in the same order as the fields of each tuple in `data`.
columns = [
    "firstname",
    "middlename",
    "lastname",
    "dob",
    "gender",
    "salary",
]

# Reuse the active SparkSession if one exists, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("AddingColumns.com")
    .getOrCreate()
)

# Only column names are given as the schema, so Spark infers the column types
# from the values in `data`.
df3 = spark.createDataFrame(data, schema=columns)

To change the data type of a column

In [25]:
# Passing the name of an existing column to withColumn replaces that column in
# the returned DataFrame; here it is replaced by its value cast to "Integer".
df3.withColumn(
    "salary",
    col("salary").cast("Integer"),
).show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



To update the value of an existing column

In [26]:
# Reusing the existing name "salary" replaces that column in the returned
# DataFrame. The result is only displayed, not assigned, so df3 itself keeps
# its original salary values.
df3.withColumn(
    "salary",
    col("salary") * 100,
).show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|300000|
|  Michael|      Rose|        |2000-05-19|     M|400000|
|   Robert|          |Williams|1978-09-05|     M|400000|
|    Maria|      Anne|   Jones|1967-12-01|     F|400000|
|      Jen|      Mary|   Brown|1980-02-17|     F|  -100|
+---------+----------+--------+----------+------+------+



To create a column from an existing column

In [27]:
# "newcolumn" does not exist in df3, so withColumn appends it; its value is the
# salary multiplied by -1.
df3.withColumn(
    "newcolumn",
    col("salary") * -1,
).show()

+---------+----------+--------+----------+------+------+---------+
|firstname|middlename|lastname|       dob|gender|salary|newcolumn|
+---------+----------+--------+----------+------+------+---------+
|    James|          |   Smith|1991-04-01|     M|  3000|    -3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|    -4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|    -4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|    -4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|        1|
+---------+----------+--------+----------+------+------+---------+



To add a new column using the lit function
The lit function creates a column holding the same constant value in every row.

In [29]:
# lit("USA") yields a constant column, so every row gets country = "USA".
df3.withColumn(
    "country",
    lit("USA"),
).show()

+---------+----------+--------+----------+------+------+-------+
|firstname|middlename|lastname|       dob|gender|salary|country|
+---------+----------+--------+----------+------+------+-------+
|    James|          |   Smith|1991-04-01|     M|  3000|    USA|
|  Michael|      Rose|        |2000-05-19|     M|  4000|    USA|
|   Robert|          |Williams|1978-09-05|     M|  4000|    USA|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|    USA|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|    USA|
+---------+----------+--------+----------+------+------+-------+



In [30]:
# withColumn calls can be chained; each call adds one more constant column to
# the DataFrame returned by the previous call.
(
    df3
    .withColumn("country", lit("USA"))
    .withColumn("anotherColumn", lit("anothervalue"))
    .show()
)

+---------+----------+--------+----------+------+------+-------+-------------+
|firstname|middlename|lastname|       dob|gender|salary|country|anotherColumn|
+---------+----------+--------+----------+------+------+-------+-------------+
|    James|          |   Smith|1991-04-01|     M|  3000|    USA| anothervalue|
|  Michael|      Rose|        |2000-05-19|     M|  4000|    USA| anothervalue|
|   Robert|          |Williams|1978-09-05|     M|  4000|    USA| anothervalue|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|    USA| anothervalue|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|    USA| anothervalue|
+---------+----------+--------+----------+------+------+-------+-------------+



In [31]:
# Rename a column: "gender" is shown as "sex" in the returned DataFrame, with
# its values unchanged.
df3.withColumnRenamed(existing="gender", new="sex").show()

+---------+----------+--------+----------+---+------+
|firstname|middlename|lastname|       dob|sex|salary|
+---------+----------+--------+----------+---+------+
|    James|          |   Smith|1991-04-01|  M|  3000|
|  Michael|      Rose|        |2000-05-19|  M|  4000|
|   Robert|          |Williams|1978-09-05|  M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|  F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|  F|    -1|
+---------+----------+--------+----------+---+------+



To drop a column from a PySpark DataFrame

In [35]:
# drop() returns a new DataFrame without the named column; the result is only
# displayed here, not assigned.
(
    df3
    .drop("salary")
    .show()
)


+---------+----------+--------+----------+------+
|firstname|middlename|lastname|       dob|gender|
+---------+----------+--------+----------+------+
|    James|          |   Smith|1991-04-01|     M|
|  Michael|      Rose|        |2000-05-19|     M|
|   Robert|          |Williams|1978-09-05|     M|
|    Maria|      Anne|   Jones|1967-12-01|     F|
|      Jen|      Mary|   Brown|1980-02-17|     F|
+---------+----------+--------+----------+------+

