In [7]:
from pyspark.sql import SparkSession

# New API
spark_session = SparkSession\
        .builder\
        .master("spark://192.168.1.153:7077")\
        .appName("B_LU")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.shuffle.service.enabled", True)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.executor.cores",4)\
        .getOrCreate()

In [8]:
# B.1
df = spark_session.read\
    .option("header", "true")\
    .csv("hdfs://192.168.1.153:9000/parking-citations.csv")\
    .cache()
df.show()

+-------------+-------------------+----------+--------+-----------+--------------+-----------------+----+----+----------+-----+--------------------+-----+------+--------------+---------------------+-----------+---------+---------+
|Ticket number|         Issue Date|Issue time|Meter Id|Marked Time|RP State Plate|Plate Expiry Date| VIN|Make|Body Style|Color|            Location|Route|Agency|Violation code|Violation Description|Fine amount| Latitude|Longitude|
+-------------+-------------------+----------+--------+-----------+--------------+-----------------+----+----+----------+-----+--------------------+-----+------+--------------+---------------------+-----------+---------+---------+
|   1103341116|2015-12-21T00:00:00|      1251|    null|       null|            CA|           200304|null|HOND|        PA|   GY|     13147 WELBY WAY|01521|     1|        4000A1|   NO EVIDENCE OF REG|         50|    99999|    99999|
|   1103700150|2015-12-21T00:00:00|      1435|    null|       null|         

In [9]:
# B.2
df.printSchema()

root
 |-- Ticket number: string (nullable = true)
 |-- Issue Date: string (nullable = true)
 |-- Issue time: string (nullable = true)
 |-- Meter Id: string (nullable = true)
 |-- Marked Time: string (nullable = true)
 |-- RP State Plate: string (nullable = true)
 |-- Plate Expiry Date: string (nullable = true)
 |-- VIN: string (nullable = true)
 |-- Make: string (nullable = true)
 |-- Body Style: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Route: string (nullable = true)
 |-- Agency: string (nullable = true)
 |-- Violation code: string (nullable = true)
 |-- Violation Description: string (nullable = true)
 |-- Fine amount: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)



In [10]:
# B.3
df.count()

9257460

In [11]:
# B.4
df.rdd.getNumPartitions()

10

In [12]:
# B.5
columns_to_drop = ['VIN', 'Latitude', 'Longitude']
df = df.drop(*columns_to_drop).cache()
df.printSchema()

root
 |-- Ticket number: string (nullable = true)
 |-- Issue Date: string (nullable = true)
 |-- Issue time: string (nullable = true)
 |-- Meter Id: string (nullable = true)
 |-- Marked Time: string (nullable = true)
 |-- RP State Plate: string (nullable = true)
 |-- Plate Expiry Date: string (nullable = true)
 |-- Make: string (nullable = true)
 |-- Body Style: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Route: string (nullable = true)
 |-- Agency: string (nullable = true)
 |-- Violation code: string (nullable = true)
 |-- Violation Description: string (nullable = true)
 |-- Fine amount: string (nullable = true)



In [25]:
# B.6
from pyspark.sql.types import FloatType
df = df.withColumn("Fine amount",
                   df["Fine amount"].cast(FloatType()).alias("Fine amount")).cache()
df.printSchema()
fine_max = df.agg({"Fine amount": "max"}).collect()[0][0]
print(fine_max)
print(df.filter(df["Fine amount"] == fine_max).count())

root
 |-- Ticket number: string (nullable = true)
 |-- Issue Date: string (nullable = true)
 |-- Issue time: string (nullable = true)
 |-- Meter Id: string (nullable = true)
 |-- Marked Time: string (nullable = true)
 |-- RP State Plate: string (nullable = true)
 |-- Plate Expiry Date: string (nullable = true)
 |-- Make: string (nullable = true)
 |-- Body Style: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Route: string (nullable = true)
 |-- Agency: string (nullable = true)
 |-- Violation code: string (nullable = true)
 |-- Violation Description: string (nullable = true)
 |-- Fine amount: float (nullable = true)

505.0
6


In [35]:
# B.7
df.select('Make', 'sum(Make)').groupby('Make').agg({'Make': 'sum'})


AnalysisException: "cannot resolve '`sum(Make)`' given input columns: [Plate Expiry Date, Issue Date, Issue time, Violation Description, Location, Marked Time, Route, Color, Body Style, Violation code, Ticket number, Fine amount, Meter Id, Make, Agency, RP State Plate];;\n'Project [Make#563, 'sum(Make)]\n+- Project [Ticket number#555, Issue Date#556, Issue time#557, Meter Id#558, Marked Time#559, RP State Plate#560, Plate Expiry Date#561, Make#563, Body Style#564, Color#565, Location#566, Route#567, Agency#568, Violation code#569, Violation Description#570, cast(Fine amount#1544 as float) AS Fine amount#2025]\n   +- Project [Ticket number#555, Issue Date#556, Issue time#557, Meter Id#558, Marked Time#559, RP State Plate#560, Plate Expiry Date#561, Make#563, Body Style#564, Color#565, Location#566, Route#567, Agency#568, Violation code#569, Violation Description#570, cast(Fine amount#1366 as float) AS Fine amount#1544]\n      +- Project [Ticket number#555, Issue Date#556, Issue time#557, Meter Id#558, Marked Time#559, RP State Plate#560, Plate Expiry Date#561, Make#563, Body Style#564, Color#565, Location#566, Route#567, Agency#568, Violation code#569, Violation Description#570, cast(Fine amount#1188 as float) AS Fine amount#1366]\n         +- Project [Ticket number#555, Issue Date#556, Issue time#557, Meter Id#558, Marked Time#559, RP State Plate#560, Plate Expiry Date#561, Make#563, Body Style#564, Color#565, Location#566, Route#567, Agency#568, Violation code#569, Violation Description#570, cast(Fine amount#571 as double) AS Fine amount#1188]\n            +- Project [Ticket number#555, Issue Date#556, Issue time#557, Meter Id#558, Marked Time#559, RP State Plate#560, Plate Expiry Date#561, Make#563, Body Style#564, Color#565, Location#566, Route#567, Agency#568, Violation code#569, Violation Description#570, Fine amount#571]\n               +- Relation[Ticket number#555,Issue Date#556,Issue time#557,Meter Id#558,Marked Time#559,RP State Plate#560,Plate Expiry Date#561,VIN#562,Make#563,Body Style#564,Color#565,Location#566,Route#567,Agency#568,Violation code#569,Violation Description#570,Fine amount#571,Latitude#572,Longitude#573] csv\n"