In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [17]:
# Build (or reuse) the SparkSession that backs every cell below.
spark = (
    SparkSession.builder.appName("Test Spark")
    # Local mode with 2 worker threads.
    .master("local[2]")
    # Turn on Arrow for PySpark (used for Spark <-> pandas conversions).
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    # Default filesystem: paths without an explicit authority resolve against
    # this HDFS endpoint.
    .config("spark.hadoop.fs.defaultFS", "hdfs://127.0.0.1:9000")
    .getOrCreate()
)

# The session already exposes its SparkContext as a property; there is no
# need to go through the SparkContext.getOrCreate() classmethod on the instance.
spark_context = spark.sparkContext

# Print a short summary of the running context as a sanity check.
print("\n===== Spark Context Info =====")
print(f"App Name      : {spark_context.appName}")
print(f"Master        : {spark_context.master}")
print(f"Application ID: {spark_context.applicationId}")
print(f"UI Web URL    : {spark_context.uiWebUrl}")
print(f"Version       : {spark_context.version}")
print(f"Python Ver    : {spark_context.pythonVer}")


===== Spark Context Info =====
App Name      : Test Spark
Master        : local[2]
Application ID: local-1758015496717
UI Web URL    : http://172.20.10.14:4040
Version       : 4.0.1
Python Ver    : 3.11


In [18]:
# Explicit schema: three columns, each declared non-nullable.
schema = (
    StructType()
    .add("name", StringType(), False)
    .add("age", IntegerType(), False)
    .add("city", StringType(), False)
)

# Small in-memory sample; each record is keyed by the schema's column names.
df = spark.createDataFrame(
    data=[
        dict(name="Alice", age=29, city="New York"),
        dict(name="Bob", age=35, city="San Francisco"),
        dict(name="Charlie", age=40, city="London"),
        dict(name="Diana", age=23, city="Berlin"),
        dict(name="Ethan", age=31, city="Sydney"),
    ],
    schema=schema,
)

# Render the rows, then print the parsed, analyzed, optimized and physical plans.
df.show()
df.explain(mode="extended")

+-------+---+-------------+
|   name|age|         city|
+-------+---+-------------+
|  Alice| 29|     New York|
|    Bob| 35|San Francisco|
|Charlie| 40|       London|
|  Diana| 23|       Berlin|
|  Ethan| 31|       Sydney|
+-------+---+-------------+

== Parsed Logical Plan ==
LogicalRDD [name#54, age#55, city#56], false

== Analyzed Logical Plan ==
name: string, age: int, city: string
LogicalRDD [name#54, age#55, city#56], false

== Optimized Logical Plan ==
LogicalRDD [name#54, age#55, city#56], false

== Physical Plan ==
*(1) Scan ExistingRDD[name#54,age#55,city#56]



In [19]:
# Persist the sample frame to HDFS as Parquet, replacing any existing output
# at this location.
df.write.parquet("hdfs:///users/nyeinchan/hdfs-data", mode="overwrite")

In [20]:
# Load the Parquet dataset back from HDFS into a separate DataFrame.
new_df = spark.read.format("parquet").load("hdfs:///users/nyeinchan/hdfs-data")

In [21]:
# Display the re-read rows using show()'s default limits, spelled out explicitly.
# Row order after the Parquet round-trip is not guaranteed to match the
# original frame (the output below starts with a different row).
new_df.show(n=20, truncate=True)

+-------+---+-------------+
|   name|age|         city|
+-------+---+-------------+
|Charlie| 40|       London|
|  Diana| 23|       Berlin|
|  Ethan| 31|       Sydney|
|  Alice| 29|     New York|
|    Bob| 35|San Francisco|
+-------+---+-------------+

