In [None]:
from pyspark.sql import SparkSession

# Build the Spark session for this notebook (or reuse one that already exists
# in this kernel) against the standalone master.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.250:7077")
    .appName("Part_B_Simon_Pislar_A3")
    # Dynamic allocation with shuffle tracking on and the external shuffle
    # service off; idle executors are released after 30 seconds.
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    # Two cores per executor.
    .config("spark.executor.cores", 2)
    # Fixed driver / block-manager ports.
    # NOTE(review): presumably pinned so they can be opened in a firewall — confirm.
    .config("spark.driver.port", 9999)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# Handle to the underlying SparkContext (entry point for the RDD API).
spark_context = spark_session.sparkContext

# Only log messages at ERROR level and above.
spark_context.setLogLevel("ERROR")

In [None]:
# Load the parking-citations CSV from HDFS. The first line is treated as the
# header row and column types are inferred from the data.
df = (
    spark_session.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("hdfs://192.168.2.250:9000/parking-citations.csv")
)

# Preview the first rows (show() prints 20 by default).
df.show()

In [None]:
# Print the DataFrame's schema (column names and their data types) as a tree.
df.printSchema()

In [None]:
# Count every row in the DataFrame; count() is an action, so this triggers a
# Spark job over the data.
row_count = df.count()
print(f"Number of rows: {row_count}")

In [None]:
# Ask the DataFrame's underlying RDD how many partitions it is split into.
partition_count = df.rdd.getNumPartitions()
print(f"Number of partitions: {partition_count}")

In [None]:
# Remove the VIN, Latitude and Longitude columns. `df` is rebound, so every
# later cell sees the narrower DataFrame.
df = df.drop('VIN', 'Latitude', 'Longitude')

In [None]:
from pyspark.sql.functions import col
from pyspark.sql.types import FloatType

# Re-type the fine column as a float so it can be aggregated and compared
# numerically; the column is replaced in place under the same name.
df = df.withColumn("FineAmount", df["FineAmount"].cast(FloatType()))

# The dict-style aggregate produces exactly one row with one column:
# the largest fine in the dataset.
max_fine_amount = df.agg({"FineAmount": "max"}).first()[0]

# Number of citations whose fine equals that maximum.
count_max_fine = df.filter(col("FineAmount") == max_fine_amount).count()

print(f"Maximum fine amount: {max_fine_amount}")
print(f"Number of fines with maximum amount: {count_max_fine}")

In [None]:
from pyspark.sql.functions import desc

# Count citations per vehicle make and print the 20 most frequent makes,
# most common first.
(
    df.groupBy("Make")
    .count()
    .sort(desc("count"))
    .show(20)
)