In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Obtain the active SparkSession, creating one named "Example" if none exists.
spark = (
    SparkSession.builder
    .appName("Example")
    .getOrCreate()
)

# Sample rows of (id, name, address); each address is a Python dict
# holding "city" and "state" entries.
data = [
    (1, "John", {"city": "New York", "state": "NY"}),
    (2, "Alice", {"city": "Los Angeles", "state": "CA"}),
    (3, "Bob", {"city": "Chicago", "state": "IL"}),
]

# Only column names are passed here, so the column types are inferred
# from the sample rows.
df = spark.createDataFrame(data, ["id", "name", "address"])

# Lift the "state" and "city" entries of the address column into
# top-level columns, state first.
result_df = df.select(
    "id",
    "name",
    col("address.state").alias("state"),
    col("address.city").alias("city"),
)

result_df.show()

+---+-----+-----+-----------+
| id| name|state|       city|
+---+-----+-----+-----------+
|  1| John|   NY|   New York|
|  2|Alice|   CA|Los Angeles|
|  3|  Bob|   IL|    Chicago|
+---+-----+-----+-----------+



In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Obtain the active SparkSession, creating one named "Example" if none exists.
spark = (
    SparkSession.builder
    .appName("Example")
    .getOrCreate()
)

# Explicit schema: three scalar columns plus a nested "address" struct
# with "city" and "state" string fields. Every field is nullable.
schema = StructType(
    [
        StructField("id", IntegerType(), True),
        StructField("name", StringType(), True),
        StructField("salary", IntegerType(), True),
        StructField(
            "address",
            StructType(
                [
                    StructField("city", StringType(), True),
                    StructField("state", StringType(), True),
                ]
            ),
            True,
        ),
    ]
)

# Sample rows; the trailing tuple supplies the address struct as (city, state).
data = [
    (1, "John", 50000, ("New York", "NY")),
    (2, "Alice", 60000, ("Los Angeles", "CA")),
    (3, "Bob", 70000, ("Chicago", "IL")),
]

# Build the DataFrame against the explicit schema instead of inferring types.
df = spark.createDataFrame(data, schema)

# Flatten the struct: select its fields by dotted path next to the
# scalar columns.
result_df = df.select(
    "id",
    "name",
    "salary",
    "address.city",
    "address.state",
)

result_df.show()

+---+-----+------+-----------+-----+
| id| name|salary|       city|state|
+---+-----+------+-----------+-----+
|  1| John| 50000|   New York|   NY|
|  2|Alice| 60000|Los Angeles|   CA|
|  3|  Bob| 70000|    Chicago|   IL|
+---+-----+------+-----------+-----+



In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast

# Obtain the active SparkSession, creating one named "Example" if none exists.
spark = (
    SparkSession.builder
    .appName("Example")
    .getOrCreate()
)

# Rows for the small (broadcast) side: (id, name).
small_data = [
    (1, "John"),
    (2, "Alice"),
    (3, "Bob"),
]

# Rows for the large side: (id, city). Ids 4 and 5 have no match in small_data.
large_data = [
    (1, "New York"),
    (2, "Los Angeles"),
    (3, "Chicago"),
    (4, "Houston"),
    (5, "Boston"),
]

# Build both DataFrames from the sample rows.
small_df = spark.createDataFrame(small_data, ["id", "name"])
large_df = spark.createDataFrame(large_data, ["id", "city"])

# Left join on "id", marking the small DataFrame with a broadcast hint.
# Every large_df row is kept; unmatched rows get a null name.
joined_df = large_df.join(broadcast(small_df), on="id", how="left")

# Display the joined rows.
joined_df.show()


+---+-----------+-----+
| id|       city| name|
+---+-----------+-----+
|  1|   New York| John|
|  2|Los Angeles|Alice|
|  3|    Chicago|  Bob|
|  4|    Houston| null|
|  5|     Boston| null|
+---+-----------+-----+



In [0]:
# Prepare data
import pyspark
from pyspark.sql import SparkSession

# Reuse the active SparkSession (or create one) so this cell does not depend
# on `spark` having been defined by a previously executed cell.
spark = SparkSession.builder \
    .appName("Example") \
    .getOrCreate()

# Employee rows: (emp_id, name, superior_emp_id, year_joined, emp_dept_id,
# gender, salary). year_joined and emp_dept_id are deliberately kept as
# strings, so they are inferred as string columns.
emp = [
    (1, "Smith", -1, "2018", "10", "M", 3000),
    (2, "Rose", 1, "2010", "20", "M", 4000),
    (3, "Williams", 1, "2010", "10", "M", 1000),
    (4, "Jones", 2, "2005", "10", "F", 2000),
    (5, "Brown", 2, "2010", "40", "", -1),
    (6, "Brown", 2, "2010", "50", "", -1),
]
empColumns = ["emp_id", "name", "superior_emp_id", "year_joined",
              "emp_dept_id", "gender", "salary"]

empDF = spark.createDataFrame(data=emp, schema=empColumns)
empDF.printSchema()
empDF.show(truncate=False)

# Department rows: (dept_name, dept_id). dept_id is a Python int, so it is
# inferred as a long column.
dept = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]
deptColumns = ["dept_name", "dept_id"]
deptDF = spark.createDataFrame(data=dept, schema=deptColumns)
deptDF.printSchema()
deptDF.show(truncate=False)

# Inner join
# emp_dept_id is a string column while dept_id is a long column; cast
# explicitly so the key comparison does not rely on implicit type coercion.
empDF.join(deptDF, empDF.emp_dept_id.cast("long") == deptDF.dept_id, "inner") \
     .show(truncate=False)

root
 |-- emp_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- superior_emp_id: long (nullable = true)
 |-- year_joined: string (nullable = true)
 |-- emp_dept_id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+------+--------+---------------+-----------+-----------+------+------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+--------+---------------+-----------+-----------+------+------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |
|2     |Rose    |1              |2010       |20         |M     |4000  |
|3     |Williams|1              |2010       |10         |M     |1000  |
|4     |Jones   |2              |2005       |10         |F     |2000  |
|5     |Brown   |2              |2010       |40         |      |-1    |
|6     |Brown   |2              |2010       |50         |      |-1    |
+------+--------+---------------+-----------+-----------+------+-----