In [1]:
import findspark
findspark.init()
findspark.find()

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Interview_Problems").getOrCreate()

print(spark)

<pyspark.sql.session.SparkSession object at 0x0000014F9BD20AF0>


In [14]:
### Problem 1 - Flatten Nested Data ###
from pyspark.sql.functions import col, explode

# Sample nested JSON data
nested_data = [
    (1, "John", {"city": "New York", "state": "NY"}, [{"subject": "Math", "score": 90}, {"subject": "Science", "score": 85}]),
    (2, "Baby", {"city": "Los Angeles", "state": "CA"}, [{"subject": "Math", "score": 85}, {"subject": "Science", "score": 88}]),
    (3, "Emily", {"city": "Houston", "state": "TX"}, [{"subject": "Math", "score": 88}, {"subject": "Science", "score": 90}])
]

# Create a DataFrame from the nested data
df = spark.createDataFrame(nested_data, ["id", "name", "address", "grades"])

df.show(truncate=False)
# Map elements are accessed using .(Dot) Notation

# Flatten the nested data
df_flat = df.select(
    col("id"),
    col("name"),
    col("address.city").alias("address_city"),
    col("address.state").alias("address_state"),
    explode(col("grades")).alias("grade")
).select(
    col("id"),
    col("name"),
    col("address_city"),
    col("address_state"),
    col("grade.subject").alias("grades_subject"),
    col("grade.score").alias("grades_score")
)

# Nested structure where one entity has multiple nested elements (like multiple grades for a student), 
# when flattened, each combination of the outer entity and its nested elements becomes a separate row in the flattened table

# Show the flattened DataFrame
df_flat.show(truncate=False)

+---+-----+----------------------------------+-------------------------------------------------------------------+
|id |name |address                           |grades                                                             |
+---+-----+----------------------------------+-------------------------------------------------------------------+
|1  |John |{city -> New York, state -> NY}   |[{score -> 90, subject -> Math}, {score -> 85, subject -> Science}]|
|2  |Baby |{city -> Los Angeles, state -> CA}|[{score -> 85, subject -> Math}, {score -> 88, subject -> Science}]|
|3  |Emily|{city -> Houston, state -> TX}    |[{score -> 88, subject -> Math}, {score -> 90, subject -> Science}]|
+---+-----+----------------------------------+-------------------------------------------------------------------+

+---+-----+------------+-------------+--------------+------------+
|id |name |address_city|address_state|grades_subject|grades_score|
+---+-----+------------+-------------+--------------+-------