In [1]:
from pyspark.sql import SparkSession

# Reuse the active SparkSession if one already exists, otherwise start a new
# one registered under the application name "Create_Map()".
ss = (
    SparkSession.builder
    .appName("Create_Map()")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/23 12:05:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


24/02/23 12:05:26 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [2]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [4]:
# Each row is (Product_id, Product_name, Price, DiscountPercent).
sample_data = (
    (100, "Mobile", 20000, 10),
    (200, "Laptop", 85000, 12),
    (300, "Television", 45000, 8),
    (400, "Monitor", 7000, 9),
    (500, "Headset", 6500, 15),
)

# Explicit schema: every column is declared non-nullable (third argument False).
defSchema = StructType(
    [
        StructField(field_name, field_type, False)
        for field_name, field_type in (
            ("Product_id", IntegerType()),
            ("Product_name", StringType()),
            ("Price", IntegerType()),
            ("DiscountPercent", IntegerType()),
        )
    ]
)

# Build the base DataFrame from the in-memory rows and print it.
df = ss.createDataFrame(sample_data, defSchema)
df.show()

+----------+------------+-----+---------------+
|Product_id|Product_name|Price|DiscountPercent|
+----------+------------+-----+---------------+
|       100|      Mobile|20000|             10|
|       200|      Laptop|85000|             12|
|       300|  Television|45000|              8|
|       400|     Monitor| 7000|              9|
|       500|     Headset| 6500|             15|
+----------+------------+-----+---------------+



#### create_map — convert columns into a dictionary (map) column

In [5]:
from pyspark.sql.functions import col, lit, create_map

In [7]:
# Keep the four original columns and append "PriceDict", a single-entry map
# per row whose key is the row's Product_name and whose value is its Price.
dfDict = df.select(
    "Product_id",
    "Product_name",
    "Price",
    "DiscountPercent",
    create_map("Product_name", "Price").alias("PriceDict"),
)
# truncate=False so the map column is printed in full.
dfDict.show(truncate=False)

+----------+------------+-----+---------------+---------------------+
|Product_id|Product_name|Price|DiscountPercent|PriceDict            |
+----------+------------+-----+---------------+---------------------+
|100       |Mobile      |20000|10             |{Mobile -> 20000}    |
|200       |Laptop      |85000|12             |{Laptop -> 85000}    |
|300       |Television  |45000|8              |{Television -> 45000}|
|400       |Monitor     |7000 |9              |{Monitor -> 7000}    |
|500       |Headset     |6500 |15             |{Headset -> 6500}    |
+----------+------------+-----+---------------+---------------------+



In [8]:
# Append "PriceDict": a map with literal keys ("Product_name", "Price") whose
# values are taken from the corresponding columns of each row.
#
# create_map takes alternating key/value arguments, and a map column has a
# single value type. Product_name is a string and Price is an integer, so the
# integer is cast to string explicitly instead of relying on Spark's implicit
# type coercion, whose rules depend on configuration (e.g. ANSI mode).
dfDict = df.withColumn(
    "PriceDict",
    create_map(
        lit("Product_name"),
        col("Product_name"),
        lit("Price"),
        col("Price").cast("string"),
    ),
)
# truncate=False so the map column is printed in full.
dfDict.show(truncate=False)

+----------+------------+-----+---------------+--------------------------------------------+
|Product_id|Product_name|Price|DiscountPercent|PriceDict                                   |
+----------+------------+-----+---------------+--------------------------------------------+
|100       |Mobile      |20000|10             |{Product_name -> Mobile, Price -> 20000}    |
|200       |Laptop      |85000|12             |{Product_name -> Laptop, Price -> 85000}    |
|300       |Television  |45000|8              |{Product_name -> Television, Price -> 45000}|
|400       |Monitor     |7000 |9              |{Product_name -> Monitor, Price -> 7000}    |
|500       |Headset     |6500 |15             |{Product_name -> Headset, Price -> 6500}    |
+----------+------------+-----+---------------+--------------------------------------------+



In [9]:
# Append "PriceDict": a three-entry map with literal keys ("Product_name",
# "Price", "DiscountPercent") and values read from the matching columns.
#
# A map column has a single value type, and the values here combine one string
# column with two integer columns. The integers are cast to string explicitly
# so the result does not depend on Spark's implicit type coercion, whose rules
# depend on configuration (e.g. ANSI mode).
dfDict = df.withColumn(
    "PriceDict",
    create_map(
        lit("Product_name"),
        col("Product_name"),
        lit("Price"),
        col("Price").cast("string"),
        lit("DiscountPercent"),
        col("DiscountPercent").cast("string"),
    ),
)
# truncate=False so the map column is printed in full.
dfDict.show(truncate=False)

+----------+------------+-----+---------------+------------------------------------------------------------------+
|Product_id|Product_name|Price|DiscountPercent|PriceDict                                                         |
+----------+------------+-----+---------------+------------------------------------------------------------------+
|100       |Mobile      |20000|10             |{Product_name -> Mobile, Price -> 20000, DiscountPercent -> 10}   |
|200       |Laptop      |85000|12             |{Product_name -> Laptop, Price -> 85000, DiscountPercent -> 12}   |
|300       |Television  |45000|8              |{Product_name -> Television, Price -> 45000, DiscountPercent -> 8}|
|400       |Monitor     |7000 |9              |{Product_name -> Monitor, Price -> 7000, DiscountPercent -> 9}    |
|500       |Headset     |6500 |15             |{Product_name -> Headset, Price -> 6500, DiscountPercent -> 15}   |
+----------+------------+-----+---------------+------------------------------------------------------------------+

In [10]:
# Print the schema of the base DataFrame (the four original columns, no map
# column) as a reference point for the map-augmented DataFrame.
df.printSchema()

root
 |-- Product_id: integer (nullable = false)
 |-- Product_name: string (nullable = false)
 |-- Price: integer (nullable = false)
 |-- DiscountPercent: integer (nullable = false)



In [11]:
# Print the schema of the DataFrame that carries the "PriceDict" map column.
# The map's value type is reported as string: its values come from one string
# column and two integer columns, and a map has a single value type.
dfDict.printSchema()

root
 |-- Product_id: integer (nullable = false)
 |-- Product_name: string (nullable = false)
 |-- Price: integer (nullable = false)
 |-- DiscountPercent: integer (nullable = false)
 |-- PriceDict: map (nullable = false)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = false)

