In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
# Explicit schema for the sample dataset; every column is declared nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("Name", StringType()),
        ("Age", IntegerType()),
        ("Country", StringType()),
        ("City", StringType()),
    )
])

In [0]:
# Sample rows; each tuple follows the schema's column order
# (Name, Age, Country, City).
data = [
    ("Alice", 30, "Canada", "Toronto"),
    ("Bob", 25, "Canada", "Vancouver"),
    ("Charlie", 35, "USA", "New York"),
    ("David", 28, "USA", "Los Angeles"),
    ("Eva", 22, "Canada", "Montreal"),
    ("Frank", 40, "USA", "Chicago"),
]

# Build the DataFrame against the explicitly defined schema.
df = spark.createDataFrame(data=data, schema=schema)

# Print the column types first, then preview the rows.
df.printSchema()
df.show()


root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)

+-------+---+-------+-----------+
|   Name|Age|Country|       City|
+-------+---+-------+-----------+
|  Alice| 30| Canada|    Toronto|
|    Bob| 25| Canada|  Vancouver|
|Charlie| 35|    USA|   New York|
|  David| 28|    USA|Los Angeles|
|    Eva| 22| Canada|   Montreal|
|  Frank| 40|    USA|    Chicago|
+-------+---+-------+-----------+



In [0]:
# Persist the DataFrame as Parquet, partitioned by the "Country" column.
# "overwrite" mode replaces any data already present at the target path.
output_path = "/FileStore/partitioned_data"
df.write.parquet(output_path, mode="overwrite", partitionBy="Country")



In [0]:
# Load the partitioned Parquet dataset back from the path it was written to.
partitioned_df = spark.read.format("parquet").load(output_path)

# Preview the rows; in the recorded output the partition column ("Country")
# comes back as the last column rather than in its original position.
partitioned_df.show()


+-------+---+-----------+-------+
|   Name|Age|       City|Country|
+-------+---+-----------+-------+
|  David| 28|Los Angeles|    USA|
|Charlie| 35|   New York|    USA|
|  Alice| 30|    Toronto| Canada|
|    Bob| 25|  Vancouver| Canada|
|  Frank| 40|    Chicago|    USA|
|    Eva| 22|   Montreal| Canada|
+-------+---+-----------+-------+

