# What is a DataFrame?

API documentation:
- `pyspark.sql.DataFrame` class: https://spark.apache.org/docs/3.5.0/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html
- DataFrame API reference: https://spark.apache.org/docs/3.5.0/api/python/reference/pyspark.sql/dataframe.html

In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) a SparkSession that runs locally; "local[*]" asks for
# as many worker threads as there are logical cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Create Dataframe")
    .getOrCreate()
)

In [2]:
# Show the session object as this cell's output.
spark

In [None]:
# Default level of parallelism Spark uses when the caller does not specify one.
spark.sparkContext.defaultParallelism

In [2]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
# Sample rows: [CustomerID, CustomerName, CustomerDoB (day-month-year text), CustomerSalary]
customer_data = [
    ["C1", "Pratap", "16-12-1979", 10000],
    ["C2", "Sruthi", "08-01-1984", 20000],
    ["C3", "Kiyanshita", "24-08-2011", 30000],
    ["C4", "Nirupama", "01-11-2022", 40000],
]

# Explicit schema for the rows above: three string columns and one integer
# column, all nullable. The date of birth is kept as a plain string here.
customer_schema = StructType([
    StructField("CustomerID", StringType(), True),
    StructField("CustomerName", StringType(), True),
    StructField("CustomerDoB", StringType(), True),
    StructField("CustomerSalary", IntegerType(), True),
])

In [3]:
# Combine the customer rows with the explicit schema into a DataFrame.
df = spark.createDataFrame(
    data=customer_data,
    schema=customer_schema,
)

In [4]:
# Print the schema as a tree: column name, type and nullability.
df.printSchema()

root
 |-- CustomerID: string (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- CustomerDoB: string (nullable = true)
 |-- CustomerSalary: integer (nullable = true)



In [18]:
# Print the rows as a text table (show() displays up to 20 rows by default).
df.show()

+----------+------------+-----------+--------------+
|CustomerID|CustomerName|CustomerDoB|CustomerSalary|
+----------+------------+-----------+--------------+
|        C1|      Pratap| 16-12-1979|         10000|
|        C2|      Sruthi| 08-01-1984|         20000|
|        C3|  Kiyanshita| 24-08-2011|         30000|
|        C4|    Nirupama| 01-11-2022|         40000|
+----------+------------+-----------+--------------+



In [5]:
# Person rows: (firstname, middlename, lastname, id, gender, salary).
# Some name parts and one id are empty strings; the last row has salary -1.
data = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "Rose", "", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

# Flat schema: every field is a nullable string except the integer salary.
# No line-continuation backslashes are needed inside the brackets.
schema = StructType([
    StructField("firstname", StringType(), True),
    StructField("middlename", StringType(), True),
    StructField("lastname", StringType(), True),
    StructField("id", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("salary", IntegerType(), True),
])

In [6]:
# Build the DataFrame for the person rows using the flat schema.
df = spark.createDataFrame(
    data=data,
    schema=schema,
)

In [7]:
# Confirm the column names and types that were applied from the schema.
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [8]:
# Display the rows; empty strings show up as blank cells in the table.
df.show()

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|   id|gender|salary|
+---------+----------+--------+-----+------+------+
|    James|          |   Smith|36636|     M|  3000|
|  Michael|      Rose|        |40288|     M|  4000|
|   Robert|          |Williams|42114|     M|  4000|
|    Maria|      Anne|   Jones|39192|     F|  4000|
|      Jen|      Mary|   Brown|     |     F|    -1|
+---------+----------+--------+-----+------+------+



## Defining a nested StructType (struct column)

In [8]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Each row is ((firstname, middlename, lastname), id, gender, salary):
# the inner tuple supplies the values of the nested `name` struct column.
structureData = [
    (("James", "", "Smith"), "36636", "M", 3100),
    (("Michael", "Rose", ""), "40288", "M", 4300),
    (("Robert", "", "Williams"), "42114", "M", 1400),
    (("Maria", "Anne", "Jones"), "39192", "F", 5500),
    (("Jen", "Mary", "Brown"), "", "F", -1),
]

# Schema of the nested `name` struct: three nullable string fields.
customer_name_stuct = StructType([
    StructField("firstname", StringType(), True),
    StructField("middlename", StringType(), True),
    StructField("lastname", StringType(), True),
])

# Top-level schema; `name` uses the struct above as its data type.
structureSchema = StructType([
    StructField("name", customer_name_stuct, True),
    StructField("id", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("salary", IntegerType(), True),
])

In [9]:
# Build the DataFrame whose `name` column is a nested struct.
df2 = spark.createDataFrame(
    data=structureData,
    schema=structureSchema,
)

In [10]:
# Print the schema; the fields of the `name` struct appear indented beneath it.
df2.printSchema()
# Optional: uncomment to also display the rows without truncation.
# df2.show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [21]:
from pyspark.sql.functions import col
# Select two fields from the nested `name` struct plus the top-level `id`
# column, referring to every column by name rather than by position.
df2.select(
    df2["name"]["firstname"],
    df2["name"]["lastname"],
    df2["id"],
).show()

+--------------+-------------+-----+
|name.firstname|name.lastname|   id|
+--------------+-------------+-----+
|         James|        Smith|36636|
|       Michael|             |40288|
|        Robert|     Williams|42114|
|         Maria|        Jones|39192|
|           Jen|        Brown|     |
+--------------+-------------+-----+



In [23]:

from pyspark.sql.types import StructType, StructField, StringType, ArrayType, MapType, DateType
# from pyspark.sql.functions import lit
import datetime

# Obtain a SparkSession; getOrCreate() returns the existing session if one
# is already running, otherwise it creates a new one.
spark = (
    SparkSession.builder
    .appName("ComplexDataTypesExample")
    .getOrCreate()
)

# Schema mixing a plain string, an array, a map and a date column.
schema = StructType(
    fields=[
        StructField(name="name", dataType=StringType(), nullable=True),
        StructField(name="hobbies", dataType=ArrayType(StringType()), nullable=True),
        StructField(name="attributes", dataType=MapType(StringType(), StringType()), nullable=True),
        StructField(name="dob", dataType=DateType(), nullable=True),
    ]
)

# One tuple per person: (name, hobbies, attributes, dob).
data = [
    (
        "Alice",
        ["reading", "cycling"],
        {"height": "5.6", "weight": "55kg"},
        datetime.date(1990, 1, 1),
    ),
    (
        "Bob",
        ["swimming", "gaming"],
        {"height": "5.8", "weight": "70kg"},
        datetime.date(1985, 5, 15),
    ),
    (
        "Charlie",
        ["hiking", "travelling"],
        {"height": "6.0", "weight": "80kg"},
        datetime.date(1992, 8, 25),
    ),
]

# Build the DataFrame, then display its rows (untruncated) and its schema.
df = spark.createDataFrame(data=data, schema=schema)
df.show(truncate=False)
df.printSchema()


+-------+--------------------+-------------------------------+----------+
|name   |hobbies             |attributes                     |dob       |
+-------+--------------------+-------------------------------+----------+
|Alice  |[reading, cycling]  |{weight -> 55kg, height -> 5.6}|1990-01-01|
|Bob    |[swimming, gaming]  |{weight -> 70kg, height -> 5.8}|1985-05-15|
|Charlie|[hiking, travelling]|{weight -> 80kg, height -> 6.0}|1992-08-25|
+-------+--------------------+-------------------------------+----------+

root
 |-- name: string (nullable = true)
 |-- hobbies: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- attributes: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- dob: date (nullable = true)



In [24]:

from pyspark.sql.types import StructType, StructField, StringType, ArrayType, MapType, DateType

import datetime

# Obtain a SparkSession; getOrCreate() returns the existing session if one
# is already running, otherwise it creates a new one.
spark = (
    SparkSession.builder
    .appName("ComplexDataTypesWithNestedStruct")
    .getOrCreate()
)

# Nested struct used as the type of the `address` column.
address_schema = StructType([
    StructField("city", StringType(), True),
    StructField("state", StringType(), True),
    StructField("zip", StringType(), True),
])

# Top-level schema: string, array, map, date and the nested address struct.
schema = StructType(
    fields=[
        StructField(name="name", dataType=StringType(), nullable=True),
        StructField(name="hobbies", dataType=ArrayType(StringType()), nullable=True),
        StructField(name="attributes", dataType=MapType(StringType(), StringType()), nullable=True),
        StructField(name="dob", dataType=DateType(), nullable=True),
        StructField(name="address", dataType=address_schema, nullable=True),
    ]
)

# One tuple per person: (name, hobbies, attributes, dob, address).
# The address is given as a dict keyed by the struct's field names.
data = [
    (
        "Alice",
        ["reading", "cycling"],
        {"height": "5.6", "weight": "55kg"},
        datetime.date(1990, 1, 1),
        {"city": "New York", "state": "NY", "zip": "10001"},
    ),
    (
        "Bob",
        ["swimming", "gaming"],
        {"height": "5.8", "weight": "70kg"},
        datetime.date(1985, 5, 15),
        {"city": "Los Angeles", "state": "CA", "zip": "90001"},
    ),
    (
        "Charlie",
        ["hiking", "travelling"],
        {"height": "6.0", "weight": "80kg"},
        datetime.date(1992, 8, 25),
        {"city": "Chicago", "state": "IL", "zip": "60601"},
    ),
]

# Build the DataFrame, then display its rows (untruncated) and its schema.
df = spark.createDataFrame(data=data, schema=schema)
df.show(truncate=False)
df.printSchema()


+-------+--------------------+-------------------------------+----------+------------------------+
|name   |hobbies             |attributes                     |dob       |address                 |
+-------+--------------------+-------------------------------+----------+------------------------+
|Alice  |[reading, cycling]  |{weight -> 55kg, height -> 5.6}|1990-01-01|{New York, NY, 10001}   |
|Bob    |[swimming, gaming]  |{weight -> 70kg, height -> 5.8}|1985-05-15|{Los Angeles, CA, 90001}|
|Charlie|[hiking, travelling]|{weight -> 80kg, height -> 6.0}|1992-08-25|{Chicago, IL, 60601}    |
+-------+--------------------+-------------------------------+----------+------------------------+

root
 |-- name: string (nullable = true)
 |-- hobbies: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- attributes: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- dob: date (nullable = true)
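 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- zip: string (nullable = true)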

In [15]:
# Bracket access yields a Column object for `address`, not the column's values.
df["address"]

Column<'address'>

In [16]:
# Attribute access is an alternative spelling that yields the same kind of Column object.
df.address

Column<'address'>

In [18]:
# Project only the struct column and print it without truncating long values.
df.select(df.address).show(truncate=False)

+------------------------+
|address                 |
+------------------------+
|{New York, NY, 10001}   |
|{Los Angeles, CA, 90001}|
|{Chicago, IL, 60601}    |
+------------------------+



In [19]:
from pyspark.sql.functions import col, explode, lit, map_keys, map_values

# Flatten every complex column of `df` in a single chain:
#   1. struct -> promote address.city / address.state / address.zip to top-level columns
#   2. array  -> explode `hobbies` into one row per hobby
#   3. map    -> explode the map's keys into rows, then look each value up by its key
# Each explode multiplies the rows, so a person ends up with one row per
# (hobby, attribute) combination.
df_flattened = (
    df.select(
        col("name"),
        col("hobbies"),
        col("attributes"),
        col("dob"),
        *(col(f"address.{part}").alias(part) for part in ("city", "state", "zip")),
    )
    .withColumn("hobby", explode(col("hobbies")))
    .drop("hobbies")
    .withColumn("attribute_key", explode(map_keys(col("attributes"))))
    .withColumn("attribute_value", col("attributes")[col("attribute_key")])
    .drop("attributes")
)

# Display the flattened rows (untruncated) followed by the resulting schema.
df_flattened.show(truncate=False)
df_flattened.printSchema()


+-------+----------+-----------+-----+-----+----------+-------------+---------------+
|name   |dob       |city       |state|zip  |hobby     |attribute_key|attribute_value|
+-------+----------+-----------+-----+-----+----------+-------------+---------------+
|Alice  |1990-01-01|New York   |NY   |10001|reading   |weight       |55kg           |
|Alice  |1990-01-01|New York   |NY   |10001|reading   |height       |5.6            |
|Alice  |1990-01-01|New York   |NY   |10001|cycling   |weight       |55kg           |
|Alice  |1990-01-01|New York   |NY   |10001|cycling   |height       |5.6            |
|Bob    |1985-05-15|Los Angeles|CA   |90001|swimming  |weight       |70kg           |
|Bob    |1985-05-15|Los Angeles|CA   |90001|swimming  |height       |5.8            |
|Bob    |1985-05-15|Los Angeles|CA   |90001|gaming    |weight       |70kg           |
|Bob    |1985-05-15|Los Angeles|CA   |90001|gaming    |height       |5.8            |
|Charlie|1992-08-25|Chicago    |IL   |60601|hiking    