In [1]:
from pyspark.sql import SparkSession

# Session used by every cell below.
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

# Sample rows; each tuple lines up positionally with `columns`.
data = [
    ('James', '', 'Smith', '1991-04-01', 'M', 3000),
    ('Michael', 'Rose', '', '2000-05-19', 'M', 4000),
    ('Robert', '', 'Williams', '1978-09-05', 'M', 4000),
    ('Maria', 'Anne', 'Jones', '1967-12-01', 'F', 4000),
    ('Jen', 'Mary', 'Brown', '1980-02-17', 'F', -1),
]
columns = ["firstname", "middlename", "lastname", "dob", "gender", "salary"]

# Only column names are supplied as the schema, so `dob` stays a string
# and `salary` comes out as a long (see the printSchema() output below).
df = spark.createDataFrame(data=data, schema=columns)

In [2]:
# Print each column's name, data type and nullability.
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [3]:
# Render the rows of df as a text table.
df.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



## 1. Change DataType using PySpark withColumn()

In [7]:
# The wildcard import is what brings col() into scope here; later cells also
# use concat_ws() and current_date() from it.
from pyspark.sql.functions import *

# Using the existing column name "salary" replaces that column in the result;
# the result is only printed, not assigned, so `df` keeps its original schema.
(
    df
    .withColumn("salary", col("salary").cast("String"))
    .printSchema()
)

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)



## 2. Create a Column from an Existing Column

In [8]:
# Derive a new column from `salary` by flipping its sign.
(
    df
    .withColumn("CopiedColumn", col("salary") * -1)
    .show()
)

+---------+----------+--------+----------+------+------+------------+
|firstname|middlename|lastname|       dob|gender|salary|CopiedColumn|
+---------+----------+--------+----------+------+------+------------+
|    James|          |   Smith|1991-04-01|     M|  3000|       -3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|       -4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|       -4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|       -4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|           1|
+---------+----------+--------+----------+------+------+------------+



## 3. Add a Column Based on Other Columns of the DataFrame

In [10]:
# Join first and last name with a single space. An empty-string lastname
# still gets the separator, hence the trailing space in "Michael " below.
(
    df
    .withColumn("name", concat_ws(" ", "firstname", "lastname"))
    .show()
)

+---------+----------+--------+----------+------+------+---------------+
|firstname|middlename|lastname|       dob|gender|salary|           name|
+---------+----------+--------+----------+------+------+---------------+
|    James|          |   Smith|1991-04-01|     M|  3000|    James Smith|
|  Michael|      Rose|        |2000-05-19|     M|  4000|       Michael |
|   Robert|          |Williams|1978-09-05|     M|  4000|Robert Williams|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|    Maria Jones|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|      Jen Brown|
+---------+----------+--------+----------+------+------+---------------+



## 4. Add New Column with Constant Value

In [11]:
# lit() turns a plain value into a constant column, so every row gets 0.3.
from pyspark.sql.functions import lit

(
    df
    .withColumn("bonus_percent", lit(0.3))
    .show()
)


+---------+----------+--------+----------+------+------+-------------+
|firstname|middlename|lastname|       dob|gender|salary|bonus_percent|
+---------+----------+--------+----------+------+------+-------------+
|    James|          |   Smith|1991-04-01|     M|  3000|          0.3|
|  Michael|      Rose|        |2000-05-19|     M|  4000|          0.3|
|   Robert|          |Williams|1978-09-05|     M|  4000|          0.3|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|          0.3|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|          0.3|
+---------+----------+--------+----------+------+------+-------------+



In [12]:
# Add a new column whose value is NULL in every row.
(
    df
    .withColumn("DEFAULT_COL", lit(None))
    .show()
)

+---------+----------+--------+----------+------+------+-----------+
|firstname|middlename|lastname|       dob|gender|salary|DEFAULT_COL|
+---------+----------+--------+----------+------+------+-----------+
|    James|          |   Smith|1991-04-01|     M|  3000|       NULL|
|  Michael|      Rose|        |2000-05-19|     M|  4000|       NULL|
|   Robert|          |Williams|1978-09-05|     M|  4000|       NULL|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|       NULL|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|       NULL|
+---------+----------+--------+----------+------+------+-----------+



## 5. Add Column Value Based on Condition

In [13]:
# Derive a "grade" column with a when/otherwise chain:
#   salary < 4000           -> "A"
#   4000 <= salary <= 5000  -> "B"
#   anything else           -> "C"
# The first matching branch wins, so the -1 salary row is graded "A".
from pyspark.sql.functions import when

df.withColumn(
    "grade",
    when((df.salary < 4000), lit("A"))
    .when((df.salary >= 4000) & (df.salary <= 5000), lit("B"))
    .otherwise(lit("C")),
).show()

+---------+----------+--------+----------+------+------+-----+
|firstname|middlename|lastname|       dob|gender|salary|grade|
+---------+----------+--------+----------+------+------+-----+
|    James|          |   Smith|1991-04-01|     M|  3000|    A|
|  Michael|      Rose|        |2000-05-19|     M|  4000|    B|
|   Robert|          |Williams|1978-09-05|     M|  4000|    B|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|    B|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|    A|
+---------+----------+--------+----------+------+------+-----+



In [14]:
# List the column names of df; none of the withColumn() calls above were
# assigned back to df, so only the original six columns appear.
df.columns

['firstname', 'middlename', 'lastname', 'dob', 'gender', 'salary']

## 6. Add a Column Only When It Does Not Already Exist in the DataFrame

In [16]:
# Add the "dummy" column only when the DataFrame does not already have it.
# df1 is bound on both paths: previously it was assigned only inside the
# `if`, so when "dummy" already existed the show() below either raised
# NameError or displayed a stale df1 left over from an earlier run.
if "dummy" in df.columns:
    df1 = df
else:
    df1 = df.withColumn("dummy", lit(None))
df1.show()

+---------+----------+--------+----------+------+------+-----+
|firstname|middlename|lastname|       dob|gender|salary|dummy|
+---------+----------+--------+----------+------+------+-----+
|    James|          |   Smith|1991-04-01|     M|  3000| NULL|
|  Michael|      Rose|        |2000-05-19|     M|  4000| NULL|
|   Robert|          |Williams|1978-09-05|     M|  4000| NULL|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000| NULL|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1| NULL|
+---------+----------+--------+----------+------+------+-----+



## 7. Add Column to DataFrame using select()

In [19]:
# select() can add a column too: list the columns to keep, then append the
# constant column under an alias.
df.select(
    "firstname",
    "salary",
    lit(0.3).alias("bonas"),
).show()

+---------+------+-----+
|firstname|salary|bonas|
+---------+------+-----+
|    James|  3000|  0.3|
|  Michael|  4000|  0.3|
|   Robert|  4000|  0.3|
|    Maria|  4000|  0.3|
|      Jen|    -1|  0.3|
+---------+------+-----+



In [22]:
# Same pattern with a computed column: append the current date as "Date".
df.select(
    "firstname",
    "salary",
    current_date().alias("Date"),
).show()


+---------+------+----------+
|firstname|salary|      Date|
+---------+------+----------+
|    James|  3000|2024-05-16|
|  Michael|  4000|2024-05-16|
|   Robert|  4000|2024-05-16|
|    Maria|  4000|2024-05-16|
|      Jen|    -1|2024-05-16|
+---------+------+----------+



In [1]:
# Complete example: rebuilds the sample DataFrame and runs through the
# withColumn() variations, plus renaming and dropping a column.
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

# Sample rows; each tuple lines up positionally with `columns`.
data = [
    ('James', '', 'Smith', '1991-04-01', 'M', 3000),
    ('Michael', 'Rose', '', '2000-05-19', 'M', 4000),
    ('Robert', '', 'Williams', '1978-09-05', 'M', 4000),
    ('Maria', 'Anne', 'Jones', '1967-12-01', 'F', 4000),
    ('Jen', 'Mary', 'Brown', '1980-02-17', 'F', -1),
]
columns = ["firstname", "middlename", "lastname", "dob", "gender", "salary"]

df = spark.createDataFrame(data=data, schema=columns)
df.printSchema()
df.show(truncate=False)

# Change the data type of an existing column (long -> integer).
df2 = df.withColumn("salary", col("salary").cast("Integer"))
df2.printSchema()
df2.show(truncate=False)

# Overwrite an existing column with a value computed from itself.
df3 = df.withColumn("salary", col("salary") * 100)
df3.printSchema()
df3.show(truncate=False)

# Create a new column from an existing one.
df4 = df.withColumn("CopiedColumn", col("salary") * -1)
df4.printSchema()

# Add a constant-valued column.
df5 = df.withColumn("Country", lit("USA"))
df5.printSchema()

# Chain withColumn() calls to add several columns.
df6 = (
    df
    .withColumn("Country", lit("USA"))
    .withColumn("anotherColumn", lit("anotherValue"))
)
df6.printSchema()

# Rename a column.
df.withColumnRenamed("gender", "sex").show(truncate=False)

# Drop the column that was added to df4 above.
df4.drop("CopiedColumn").show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|dob       |gender|salary|
+---------+----------+--------+----------+------+------+
|James    |          |Smith   |1991-04-01|M     |3000  |
|Michael  |Rose      |        |2000-05-19|M     |4000  |
|Robert   |          |Williams|1978-09-05|M     |4000  |
|Maria    |Anne      |Jones   |1967-12-01|F     |4000  |
|Jen      |Mary      |Brown   |1980-02-17|F     |-1    |
+---------+----------+--------+----------+------+------+

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+------

## 8. Add Column to DataFrame using SQL Expression