In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [9]:
# Build (or reuse) the notebook's SparkSession.
spark = (
    SparkSession.builder.appName("Test Spark")
    # run locally with 2 worker threads
    .master("local[2]")
    # enable Arrow for pandas <-> Spark conversions
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    # resolve scheme-less / `hdfs:///` paths against this HDFS endpoint
    .config("spark.hadoop.fs.defaultFS", "hdfs://127.0.0.1:9000")
    .getOrCreate()
)

# The session already owns its SparkContext; the property is enough.
# (`SparkContext.getOrCreate` is a classmethod, so calling it through the
# instance only handed back the context we already had.)
spark_context = spark.sparkContext

print("\n===== Spark Context Info =====")
# Labels are left-aligned to 14 characters so the colons line up.
for label, value in (
    ("App Name", spark_context.appName),
    ("Master", spark_context.master),
    ("Application ID", spark_context.applicationId),
    ("UI Web URL", spark_context.uiWebUrl),
    ("Version", spark_context.version),
    ("Python Ver", spark_context.pythonVer),
):
    print(f"{label:<14}: {value}")


===== Spark Context Info =====
App Name      : Test Spark
Master        : local[2]
Application ID: local-1758076362518
UI Web URL    : http://10.197.178.163:4040
Version       : 4.0.1
Python Ver    : 3.11


In [10]:
# Explicit schema: every column is declared non-nullable (third argument False).
schema = StructType(
    [
        StructField(column, dtype, False)
        for column, dtype in (
            ("name", StringType()),
            ("age", IntegerType()),
            ("city", StringType()),
        )
    ]
)

# Sample records as (name, age, city); expanded to dicts keyed by column name
# before being handed to Spark.
people = [
    ("Alice", 29, "New York"),
    ("Bob", 35, "San Francisco"),
    ("Charlie", 40, "London"),
    ("Diana", 23, "Berlin"),
    ("Ethan", 31, "Sydney"),
]

df = spark.createDataFrame(
    [{"name": name, "age": age, "city": city} for name, age, city in people],
    schema,
)

# Show the rows, then print the parsed/analyzed/optimized/physical plans.
df.show()
df.explain(mode="extended")

+-------+---+-------------+
|   name|age|         city|
+-------+---+-------------+
|  Alice| 29|     New York|
|    Bob| 35|San Francisco|
|Charlie| 40|       London|
|  Diana| 23|       Berlin|
|  Ethan| 31|       Sydney|
+-------+---+-------------+

== Parsed Logical Plan ==
LogicalRDD [name#27, age#28, city#29], false

== Analyzed Logical Plan ==
name: string, age: int, city: string
LogicalRDD [name#27, age#28, city#29], false

== Optimized Logical Plan ==
LogicalRDD [name#27, age#28, city#29], false

== Physical Plan ==
*(1) Scan ExistingRDD[name#27,age#28,city#29]



In [11]:
df.write.mode("overwrite").parquet("hdfs:///users/nyeinchan/hdfs-data")

In [12]:
new_df = spark.read.parquet("hdfs:///users/nyeinchan/hdfs-data")

In [13]:
# Preview the round-tripped rows (defaults spelled out: up to 20 rows,
# long values truncated). Row order may differ from the order written.
new_df.show(n=20, truncate=True)

+-------+---+-------------+
|   name|age|         city|
+-------+---+-------------+
|Charlie| 40|       London|
|  Diana| 23|       Berlin|
|  Ethan| 31|       Sydney|
|  Alice| 29|     New York|
|    Bob| 35|San Francisco|
+-------+---+-------------+



In [15]:
# Load the airport codes CSV from HDFS: the first line is the header, and
# column types are inferred from the data.
# NOTE(review): this rebinds `df`, which previously held the sample people rows.
df = spark.read.csv(
    "hdfs:///users/nyeinchan/hdfs-data/airport/airport_codes.csv",
    header=True,
    inferSchema=True,
)

# Peek at the first five rows, then print the inferred schema.
df.show(n=5)
df.printSchema()

+-----+-------------+--------------------+------------+---------+-----------+----------+------------+---------+---------+--------+----------+--------------------+
|ident|         type|                name|elevation_ft|continent|iso_country|iso_region|municipality|icao_code|iata_code|gps_code|local_code|         coordinates|
+-----+-------------+--------------------+------------+---------+-----------+----------+------------+---------+---------+--------+----------+--------------------+
|  00A|     heliport|   Total RF Heliport|          11|       NA|         US|     US-PA|    Bensalem|     null|     null|    K00A|       00A|40.070985, -74.93...|
| 00AA|small_airport|Aero B Ranch Airport|        3435|       NA|         US|     US-KS|       Leoti|     null|     null|    00AA|      00AA|38.704022, -101.4...|
| 00AK|small_airport|        Lowell Field|         450|       NA|         US|     US-AK|Anchor Point|     null|     null|    00AK|      00AK|59.947733, -151.6...|
| 00AL|small_airport| 