<a href="https://colab.research.google.com/github/SanjayJanardhan-89/ApacheSparkHandsOn/blob/main/ComplexDatatypes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup Pyspark


In [1]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
#Check this site for the latest download link https://www.apache.org/dyn/closer.lua/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!wget -q https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!tar xf spark-3.2.1-bin-hadoop3.2.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j

import os
import sys
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop3.2"


import findspark
findspark.init()
findspark.find()

import pyspark
from pyspark.sql import SparkSession

spark= SparkSession \
       .builder \
       .appName("OurSparkApp") \
       .getOrCreate()

spark

Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,804 kB]
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:10 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,065 kB]
Get:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease [24.3 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,566 kB]
Hit:13 https://ppa.launchpad

# Play with Arrays


## Length of the array

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import size

spark = SparkSession.builder.getOrCreate()

data = [
    (1, [1, 2, 3]),
    (2, [4, 5]),
    (3, [])
]
df = spark.createDataFrame(data, ["id", "numbers"])

df.show(truncate=False)

+---+---------+
|id |numbers  |
+---+---------+
|1  |[1, 2, 3]|
|2  |[4, 5]   |
|3  |[]       |
+---+---------+



In [None]:
from pyspark.sql.functions import size, col

df_with_size = df.withColumn("numbers_size", size("numbers"))
df_with_size = df_with_size.withColumn("numbers_size_2", size(col("numbers")))

df_with_size.show()

+---+---------+------------+--------------+
| id|  numbers|numbers_size|numbers_size_2|
+---+---------+------------+--------------+
|  1|[1, 2, 3]|           3|             3|
|  2|   [4, 5]|           2|             2|
|  3|       []|           0|             0|
+---+---------+------------+--------------+



## Create array using Spark SQL

In [None]:
df_sql_array = spark.sql("SELECT array('KGF 1', 'KGF 2', 'Autograph', 'Kicha','Hucha') as movies")
df_sql_array.printSchema()
df_sql_array.show(truncate=False)

root
 |-- movies: array (nullable = false)
 |    |-- element: string (containsNull = false)

+---------------------------------------+
|movies                                 |
+---------------------------------------+
|[KGF 1, KGF 2, Autograph, Kicha, Hucha]|
+---------------------------------------+



In [None]:
from pyspark.sql.types import StructType,StructField, StringType, ArrayType,MapType

data = [
          [
            ["KGF 1", "KGF 2", "Autograph", "Kicha","Hucha"],
            # ["Hello", "Hi"]
          ]
        ]

# Schema
schema = StructType([
             StructField('movies', ArrayType(StringType()), True),
     ])

# Create DataFrame
df_array = spark.createDataFrame(data = data, schema = schema)
df_array.printSchema()
df_array.show(truncate=False) # shows all columns

root
 |-- movies: array (nullable = true)
 |    |-- element: string (containsNull = true)

+---------------------------------------+
|movies                                 |
+---------------------------------------+
|[KGF 1, KGF 2, Autograph, Kicha, Hucha]|
+---------------------------------------+



## Merge simple arrays


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat, array

spark = SparkSession.builder.getOrCreate()

data = [
    (1, [1, 2], [1,3, 4]),
    (2, [5, 6], [7, 8]),
]
df = spark.createDataFrame(data, ["id", "arr1", "arr2"])

# concat arrays
df2 = df.withColumn("merged", concat("arr1", "arr2"))

df2.show(truncate=False)


+---+------+---------+---------------+
|id |arr1  |arr2     |merged         |
+---+------+---------+---------------+
|1  |[1, 2]|[1, 3, 4]|[1, 2, 1, 3, 4]|
|2  |[5, 6]|[7, 8]   |[5, 6, 7, 8]   |
+---+------+---------+---------------+



In [8]:
##Concat cant not handle nested array

from pyspark.sql.functions import flatten, concat

df = spark.createDataFrame([
    (1, [[1, 2], [3, 4]]),
    (2, [[5, 6]])
], ["id", "nested_array"])

df.select("id","nested_array",concat("nested_array")).show(truncate=False)


+---+----------------+--------------------+
|id |nested_array    |concat(nested_array)|
+---+----------------+--------------------+
|1  |[[1, 2], [3, 4]]|[[1, 2], [3, 4]]    |
|2  |[[5, 6]]        |[[5, 6]]            |
+---+----------------+--------------------+



## Flatten Nested Array

In [2]:
from pyspark.sql.functions import flatten

df = spark.createDataFrame([
    (1, [[1, 2], [3, 4]]),
    (2, [[5, 6]])
], ["id", "nested_array"])

df.select(flatten("nested_array")).show(truncate=False)


+---------------------+
|flatten(nested_array)|
+---------------------+
|[1, 2, 3, 4]         |
|[5, 6]               |
+---------------------+



In [5]:
from pyspark.sql.functions import flatten

# Sample data: each row includes a product ID and nested arrays of review key phrases
data = [
    (1, [["great battery life", "sleek design"], ["heavy", "expensive"], ["sleek design"]]),
    (2, [["easy to install", "value for money"], ["requires maintenance"]])
]

# Create DataFrame
df = spark.createDataFrame(data, ["product_id", "reviews"])

# Show the original DataFrame
df.show(truncate=False)

# Flatten the nested array of reviews into a single array per product
flattened_df = df.withColumn("flattened_reviews", flatten(df["reviews"]))

# Show the DataFrame with the flattened reviews
flattened_df.show(truncate=False)

+----------+------------------------------------------------------------------------+
|product_id|reviews                                                                 |
+----------+------------------------------------------------------------------------+
|1         |[[great battery life, sleek design], [heavy, expensive], [sleek design]]|
|2         |[[easy to install, value for money], [requires maintenance]]            |
+----------+------------------------------------------------------------------------+

+----------+------------------------------------------------------------------------+------------------------------------------------------------------+
|product_id|reviews                                                                 |flattened_reviews                                                 |
+----------+------------------------------------------------------------------------+------------------------------------------------------------------+
|1         |[[great batt

In [13]:
from pyspark.sql.functions import flatten
# Sample DataFrame creation
recs_df = spark.createDataFrame([
    (1, [["prodA", "prodB"], ["prodC"]]),
    (2, [["prodD"], ["prodE", "prodF"]])
], ["user_id", "recommendations"])

# without flattening
recs_df.show(truncate=False)

flattened_recs = recs_df.select("user_id", flatten("recommendations").alias("all_recs"))
flattened_recs.show(truncate=False)

+-------+-------------------------+
|user_id|recommendations          |
+-------+-------------------------+
|1      |[[prodA, prodB], [prodC]]|
|2      |[[prodD], [prodE, prodF]]|
+-------+-------------------------+

+-------+---------------------+
|user_id|all_recs             |
+-------+---------------------+
|1      |[prodA, prodB, prodC]|
|2      |[prodD, prodE, prodF]|
+-------+---------------------+



## Explode

In [18]:
from pyspark.sql.functions import explode
df_exploded = recs_df.withColumn("all_recs", explode(flatten("recommendations")))
df_exploded.show(truncate=False)

+-------+-------------------------+--------+
|user_id|recommendations          |all_recs|
+-------+-------------------------+--------+
|1      |[[prodA, prodB], [prodC]]|prodA   |
|1      |[[prodA, prodB], [prodC]]|prodB   |
|1      |[[prodA, prodB], [prodC]]|prodC   |
|2      |[[prodD], [prodE, prodF]]|prodD   |
|2      |[[prodD], [prodE, prodF]]|prodE   |
|2      |[[prodD], [prodE, prodF]]|prodF   |
+-------+-------------------------+--------+



In [20]:
from pyspark.sql.functions import explode
df_exploded = flattened_recs.withColumn("all_recs", explode("all_recs"))
df_exploded.show()

+-------+--------+
|user_id|all_recs|
+-------+--------+
|      1|   prodA|
|      1|   prodB|
|      1|   prodC|
|      2|   prodD|
|      2|   prodE|
|      2|   prodF|
+-------+--------+



In [24]:
from pyspark.sql.functions import explode_outer

# Handling nulls and empties
nullable_df = spark.createDataFrame([
    (1, ["apple", "banana"]),
    (2, []),
    (3, None)
], ["id", "fruits"])

# Applying explode_outer
nullable_exploded = nullable_df.select("id", explode("fruits").alias("fruit"))
nullable_exploded.show()


nullable_df.select("id", explode_outer("fruits").alias("fruit")).show()

+---+------+
| id| fruit|
+---+------+
|  1| apple|
|  1|banana|
+---+------+

+---+------+
| id| fruit|
+---+------+
|  1| apple|
|  1|banana|
|  2|  NULL|
|  3|  NULL|
+---+------+



## Merge arrays - Using Array Union


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import array_union

spark = SparkSession.builder.getOrCreate()

data = [
    (1, [1, 2, 3], [3, 4, 5]),
    (2, [5, 6], [6, 7, 8])
]
df = spark.createDataFrame(data, ["id", "arr1", "arr2"])

df2 = df.withColumn("union_array", array_union("arr1", "arr2"))

df2.show(truncate=False)

+---+---------+---------+---------------+
|id |arr1     |arr2     |union_array    |
+---+---------+---------+---------------+
|1  |[1, 2, 3]|[3, 4, 5]|[1, 2, 3, 4, 5]|
|2  |[5, 6]   |[6, 7, 8]|[5, 6, 7, 8]   |
+---+---------+---------+---------------+



# Play with Map


**Creating Map type**

In [None]:
df_sql_map = spark.sql("SELECT map('Building','500 CR', 'Commercal',100) as income")
df_sql_map.printSchema()
df_sql_map.show(truncate=False)

root
 |-- income: map (nullable = false)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = false)

+--------------------------------------+
|income                                |
+--------------------------------------+
|{Building -> 500 CR, Commercal -> 100}|
+--------------------------------------+



In [None]:
from pyspark.sql import Row

df = spark.createDataFrame([
                              Row({"Bank":"100 CR", "Business":"50 CR", "Land":"150 CR"}),
                              Row({"Others":"300 CR"}),
                              Row({"Building":"500 CR", "Commercal":100}),
                              Row({"Building":100 , "Commercal":100}),
                            ]
                            , ["Assests"])
df.show(truncate=False)

+---------------------------------------------------+
|Assests                                            |
+---------------------------------------------------+
|{Bank -> 100 CR, Land -> 150 CR, Business -> 50 CR}|
|{Others -> 300 CR}                                 |
|{Building -> 500 CR, Commercal -> 100}             |
|{Building -> 100, Commercal -> 100}                |
+---------------------------------------------------+



In [None]:
from pyspark.sql.types import StructType,StructField, StringType, ArrayType,MapType


data = [

          ({"Bank":"100 CR", "Business":"50 CR", "Land":"150 CR"}),
          ({"Others":"300 CR"}),
          # ({"Building":"500 CR", "Commercal":100}),
          # ({"Building":"500 CR", "Commercal":100}),

      ]


# Schema
schema = StructType([
               StructField('properties', MapType(StringType(),StringType()), True)
        ])

# Create DataFrame
df_map = spark.createDataFrame(data = data, schema = schema)
df_map.printSchema()
df_map.show(truncate=False) # shows all columns

root
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+----------+
|properties|
+----------+
|NULL      |
|NULL      |
+----------+



In [None]:
|spark.sql("SELECT struct(1, 2, 3) as ex_struct")

DataFrame[ex_struct: struct<col1:int,col2:int,col3:int>]

## Merge struct

In [None]:
from pyspark.sql.functions import map_concat

# Sample Data
data = [
    (1, {"a": 1, "b": 2}, {"c": 3, "d": 4}),
    (2, {"x": 10}, {"y": 20, "z": 30}),
    (3, {}, {"p": 100, "q": 200}),
    (4, {"k": 5}, {})
]

df = spark.createDataFrame(data, ["id", "map1", "map2"])

print("before the merge")
df.show(truncate=False)

# Merge the two map columns
df_merged = df.withColumn("merged_map", map_concat("map1", "map2"))

print("after the merge")
df_merged.show(truncate=False)


before the merge
+---+----------------+--------------------+
|id |map1            |map2                |
+---+----------------+--------------------+
|1  |{a -> 1, b -> 2}|{d -> 4, c -> 3}    |
|2  |{x -> 10}       |{y -> 20, z -> 30}  |
|3  |{}              |{p -> 100, q -> 200}|
|4  |{k -> 5}        |{}                  |
+---+----------------+--------------------+

after the merge
+---+----------------+--------------------+--------------------------------+
|id |map1            |map2                |merged_map                      |
+---+----------------+--------------------+--------------------------------+
|1  |{a -> 1, b -> 2}|{d -> 4, c -> 3}    |{a -> 1, b -> 2, d -> 4, c -> 3}|
|2  |{x -> 10}       |{y -> 20, z -> 30}  |{x -> 10, y -> 20, z -> 30}     |
|3  |{}              |{p -> 100, q -> 200}|{p -> 100, q -> 200}            |
|4  |{k -> 5}        |{}                  |{k -> 5}                        |
+---+----------------+--------------------+--------------------------------

# Play with Struct


In [None]:
df_struct = spark.sql("SELECT struct(1, 2, '3') as ex_struct")
df_struct.show()

+---------+
|ex_struct|
+---------+
|{1, 2, 3}|
+---------+



In [None]:
df_struct.select("ex_struct.col3", "ex_struct").show()

+----+---------+
|col3|ex_struct|
+----+---------+
|   3|{1, 2, 3}|
+----+---------+



In [None]:
df_struct.select("ex_struct.*").show(truncate=False)

+----+----+----+
|col1|col2|col3|
+----+----+----+
|1   |2   |3   |
+----+----+----+



In [None]:
df_map = spark.sql("SELECT map(1.0, '2', 3.0, '4') as ex_map")
df_map.show()

+--------------------+
|              ex_map|
+--------------------+
|{1.0 -> 2, 3.0 -> 4}|
+--------------------+



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, ArrayType,MapType

# https://sparkbyexamples.com/pyspark/pyspark-maptype-dict-examples/

spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

# Data
data = [
        (("James",None,"Smith"),"OH","M"),
        (("Anna","Rose",""),"NY","F"),
        (("Julia","","Williams"),"OH","F"),
        (("Maria","Anne","Jones"),"NY","M"),
        (("Jen","Mary","Brown"),"NY","M"),
        (("Mike","Mary","Williams"),"OH","M")
        ]

# Schema
schema = StructType([
    StructField('name', StructType([
         StructField('firstname', StringType(), True),
         StructField('middlename', StringType(), True),
         StructField('lastname', StringType(), True)
         ])),
     StructField('state', StringType(), True),
     StructField('gender', StringType(), True)
     ])

# Create DataFrame
df2 = spark.createDataFrame(data = data, schema = schema)
df2.printSchema()
df2.show(truncate=False) # shows all columns

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)

+----------------------+-----+------+
|name                  |state|gender|
+----------------------+-----+------+
|{James, NULL, Smith}  |OH   |M     |
|{Anna, Rose, }        |NY   |F     |
|{Julia, , Williams}   |OH   |F     |
|{Maria, Anne, Jones}  |NY   |M     |
|{Jen, Mary, Brown}    |NY   |M     |
|{Mike, Mary, Williams}|OH   |M     |
+----------------------+-----+------+



In [None]:
df2.select("name").show()

+--------------------+
|                name|
+--------------------+
|{James, NULL, Smith}|
|      {Anna, Rose, }|
| {Julia, , Williams}|
|{Maria, Anne, Jones}|
|  {Jen, Mary, Brown}|
|{Mike, Mary, Will...|
+--------------------+



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, ArrayType,MapType
from pyspark.sql.functions import element_at

data = [
        ([("Yash","K",None),("Yash2","K2",None)],"BL","M",["KGF 1", "KGF 2"],{"Bank":"100 CR", "Business":"50 CR", "Land":"150 CR"}),
        ([("Sudeep","Kicha","S")],"DL","M",["Autograph", "Kicha","Hucha"],{"Others":"300 CR"}),
        ([("Puneeth",None,"Raj")],"MB","M",[], {"Building":"500 CR", "Commercal":100}),
        ([("Darshan",None,None)],"MB","M",[], {"Building":"500 CR", "Commercal":100}),

        ]

# Schema
schema = StructType([
    StructField(    'name'
                  , ArrayType(
                        StructType([
                            StructField('firstname', StringType(), True),
                            StructField('middlename', StringType(), True),
                            StructField('lastname', StringType(), True)
                        ])
                    ), True),
     StructField('state', StringType(), True),
     StructField('gender', StringType(), True),
     StructField('movies', ArrayType(StringType()), True),
     StructField('properties', MapType(StringType(),StringType()), True)
     ])

# Create DataFrame
df2 = spark.createDataFrame(data = data, schema = schema)
df2.printSchema()
df2.show(truncate=False) # shows all columns

root
 |-- name: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- firstname: string (nullable = true)
 |    |    |-- middlename: string (nullable = true)
 |    |    |-- lastname: string (nullable = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- movies: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+------------------------------------+-----+------+-------------------------+---------------------------------------------------+
|name                                |state|gender|movies                   |properties                                         |
+------------------------------------+-----+------+-------------------------+---------------------------------------------------+
|[{Yash, K, NULL}, {Yash2, K2, NULL}]|BL   |M     |[KGF 1, KGF 2]           |{Bank -> 100 CR, La

In [None]:
 (df2
  .select("properties", "name", "name.firstname")
  .withColumn("M-Building", col("properties").getItem("Building"))
  .withColumn("M-Commercal ", col("properties").getItem("Commercal"))
  .withColumn("M-Others ", col("properties").getItem("Others"))
  .withColumn("S-FName", col("name.firstname"))
  ).show()

+--------------------+--------------------+---------+----------+------------+---------+-------+
|          properties|                name|firstname|M-Building|M-Commercal |M-Others |S-FName|
+--------------------+--------------------+---------+----------+------------+---------+-------+
|{Bank -> 100 CR, ...|     {Yash, K, NULL}|     Yash|      NULL|        NULL|     NULL|   Yash|
|  {Others -> 300 CR}|  {Sudeep, Kicha, S}|   Sudeep|      NULL|        NULL|   300 CR| Sudeep|
|{Building -> 500 ...|{Puneeth, NULL, Raj}|  Puneeth|    500 CR|         100|     NULL|Puneeth|
|{Building -> 500 ...|{Darshan, NULL, N...|  Darshan|    500 CR|         100|     NULL|Darshan|
+--------------------+--------------------+---------+----------+------------+---------+-------+



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, ArrayType,MapType
from pyspark.sql.functions import element_at

data = [
        (("Yash","K",None),"BL","M",["KGF 1", "KGF 2"],{"Bank":"100 CR", "Business":"50 CR", "Land":"150 CR"}),
        (("Sudeep","Kicha","S"),"DL","M",["Autograph", "Kicha","Hucha"],{"Others":"300 CR"}),
        (("Puneeth",None,"Raj"),"MB","M",[], {"Building":"500 CR", "Commercal":100}),
        (("Darshan",None,None),"MB","M",[], {"Building":"500 CR", "Commercal":100}),

        ]

# Schema
schema = StructType([
    StructField('name', StructType([
         StructField('firstname', StringType(), True),
         StructField('middlename', StringType(), True),
         StructField('lastname', StringType(), True)
         ])),
     StructField('state', StringType(), True),
     StructField('gender', StringType(), True),
     StructField('movies', ArrayType(StringType()), True),
     StructField('properties', MapType(StringType(),StringType()), True)
     ])

# Create DataFrame
df2 = spark.createDataFrame(data = data, schema = schema)
df2.printSchema()
df2.show(truncate=False) # shows all columns

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- movies: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+---------------------+-----+------+-------------------------+---------------------------------------------------+
|name                 |state|gender|movies                   |properties                                         |
+---------------------+-----+------+-------------------------+---------------------------------------------------+
|{Yash, K, NULL}      |BL   |M     |[KGF 1, KGF 2]           |{Bank -> 100 CR, Land -> 150 CR, Business -> 50 CR}|
|{Sudeep, Kicha, S}   |DL   |M     |[Autograph, Kicha, Hucha]|{Others -> 300 CR}        

In [None]:
from pyspark.sql.types import StructField, StructType, StringType, LongType

myManualSchema = StructType([
  StructField("DEST_COUNTRY_NAME", StringType(), True),
  StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
  StructField("count", LongType(), False, metadata={"hello":"world"})
])
df = spark.read.format("json").schema(myManualSchema)\
  .load("sample_data/2015-summary.json")

df.show()

AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/content/sample_data/2015-summary.json.

In [None]:
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [None]:
spark.read.format("json").load("sample_data/2015-summary.json").schema

StructType([StructField('DEST_COUNTRY_NAME', StringType(), True), StructField('ORIGIN_COUNTRY_NAME', StringType(), True), StructField('count', LongType(), True)])

# Other Examples



## Using expressions

In [None]:
from pyspark.sql import Row
from pyspark.sql.functions import col, expr

df = spark.createDataFrame([
  Row(name="san", salary=1500),
  Row(name="ana", salary=2000),
  Row(name="shu", salary=1000)
])

print("approach 1:")
df.withColumn("bonus", df.salary * 0.1).show()

print("approach 2:")
df.withColumn("bonus", expr("salary * 0.1")).show()

print("approach 3:")
df.select(col("name"), col("salary"), (col("salary") * 0.1).alias("bonus")).show()

approach 1:
+----+------+-----+
|name|salary|bonus|
+----+------+-----+
| san|  1500|150.0|
| ana|  2000|200.0|
| shu|  1000|100.0|
+----+------+-----+

approach 2:
+----+------+-----+
|name|salary|bonus|
+----+------+-----+
| san|  1500|150.0|
| ana|  2000|200.0|
| shu|  1000|100.0|
+----+------+-----+

approach 3:
+----+------+-----+
|name|salary|bonus|
+----+------+-----+
| san|  1500|150.0|
| ana|  2000|200.0|
| shu|  1000|100.0|
+----+------+-----+

