In [None]:
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql import functions as F, types as T
from pyspark.rdd import RDD

In [None]:
# Start (or reuse) a Spark session that runs locally with two worker threads.
# The builder chain is wrapped in parentheses so no backslash continuations
# are needed.
spark: SparkSession = (
    SparkSession.builder
    .appName("Parallel Array Processing")
    .master("local[2]")
    .getOrCreate()
)

# Handle to the underlying SparkContext, needed for the RDD API below.
sc = spark.sparkContext
print(f"Monitor cluster at: {sc.uiWebUrl}")

### Generate and process a resilient distributed dataset (RDD) from a Python list on the driver

In [None]:
# Small in-memory Python list that lives on the driver.
data = [1, 2, 3, 4, 5]

# Hand the list to Spark so it becomes a distributed RDD.
rdd: RDD[int] = sc.parallelize(data)

# Double every element in parallel, then gather the results back on the driver.
result = rdd.map(lambda value: 2 * value).collect()
print(result)  # Output: [2, 4, 6, 8, 10]

### Generate and process a DataFrame from a CSV file on the driver

In [None]:
# Explicit schema instead of inferSchema=True: schema inference makes Spark
# read the entire file once just to guess the column types before any real
# work starts, which is expensive for a large CSV.
# NOTE(review): assumes the standard UCI HIGGS layout of 29 numeric columns
# (class label followed by 28 features) -- confirm against ../data/HIGGS.csv.
# Column names mirror the defaults Spark assigns to headerless CSV files
# (_c0, _c1, ...), so downstream cells keep working unchanged.
HIGGS_COLUMN_COUNT = 29
higgs_schema = T.StructType(
    [
        T.StructField(f"_c{index}", T.DoubleType(), True)
        for index in range(HIGGS_COLUMN_COUNT)
    ]
)

higgs_data: DataFrame = spark.read.csv(
    "../data/HIGGS.csv",
    schema=higgs_schema,
    header=False,
    # Raise on records that do not parse under the schema instead of
    # silently turning them into nulls (the PERMISSIVE default).
    mode="FAILFAST",
)
higgs_data.printSchema()

In [None]:
# Mean and sample standard deviation of the first column (_c0), computed by
# Spark and brought back to the driver as a one-row pandas DataFrame so the
# notebook renders it as the cell output.
higgs_data.agg(
    F.mean("_c0").alias("mean_c0"),
    F.stddev("_c0").alias("stddev_c0"),
).toPandas()

### Free cluster resources

In [None]:
# Shut down the Spark session created at the top of the notebook so the
# resources it holds are released.
spark.stop()