# In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import IndexToString, StringIndexer, VectorAssembler
from pyspark.sql import SparkSession

# Train a random-forest classifier that predicts a parking ticket's violation
# code from vehicle and location attributes, report its accuracy on a held-out
# split, and score one hand-written ticket.

# Create a Spark session
spark = SparkSession.builder.appName("Parking Violation Classifier").getOrCreate()

# Load the dataset. No schema is supplied or inferred, so every column is
# read as a string and must be converted before it can be used as a feature.
df = spark.read.format("csv").option("header", "true").load("parking_violation.csv")

# Select relevant features and target variable
features = ['Plate Type', 'Vehicle Body Type', 'Vehicle Make', 'Violation Location', 'Registration State', 'Vehicle Year']
target = 'Violation Code'

# VectorAssembler only accepts numeric, boolean and vector columns, so the
# features are split by how they have to be encoded: numeric ones are cast,
# the remaining (categorical) ones are string-indexed inside the pipeline.
numeric_features = ['Violation Location', 'Vehicle Year']
categorical_features = [name for name in features if name not in numeric_features]

for name in numeric_features:
    # Values that cannot be parsed as numbers become null and are dropped below.
    df = df.withColumn(name, df[name].cast("double"))

# VectorAssembler raises on null inputs by default, and a row without a label
# cannot be used for training or evaluation.
df = df.dropna(subset=numeric_features + [target])

# Encode the string label as a numeric index. The indexer is fitted on the
# full dataset (before splitting) so every violation code seen in the test
# split has an index, and it is kept out of the pipeline so the fitted model
# can score tickets that have no 'Violation Code' column at all.
label_indexer = StringIndexer(inputCol=target, outputCol="indexedLabel").fit(df)
df = label_indexer.transform(df)

# Split the data into training and testing sets
(training_data, testing_data) = df.randomSplit([0.8, 0.2], seed=42)

# Tree learners require maxBins to be at least the number of values of every
# categorical feature; the default of 32 is too small for columns such as the
# vehicle make. The +1 accounts for the extra bucket that
# handleInvalid="keep" adds for null/unseen values.
largest_category_count = max(
    training_data.select(name).distinct().count() for name in categorical_features
)

# Define the pipeline stages
feature_indexers = [
    # "keep" routes nulls and categories unseen during training to a dedicated
    # bucket instead of failing at prediction time.
    StringIndexer(inputCol=name, outputCol=f"{name}_indexed", handleInvalid="keep")
    for name in categorical_features
]
assembler = VectorAssembler(
    # Preserve the feature order declared above, swapping in the indexed
    # column for each categorical feature.
    inputCols=[
        f"{name}_indexed" if name in categorical_features else name
        for name in features
    ],
    outputCol="features",
)
clf = RandomForestClassifier(
    numTrees=100,
    seed=42,
    labelCol="indexedLabel",
    maxBins=max(32, largest_category_count + 1),
)
# Map the predicted label index back to the original violation code.
label_decoder = IndexToString(
    inputCol="prediction",
    outputCol="predictedViolationCode",
    labels=label_indexer.labels,
)

# Create a pipeline
pipeline = Pipeline(stages=feature_indexers + [assembler, clf, label_decoder])

# Train the pipeline
model = pipeline.fit(training_data)

# Make predictions on the test set
predictions = model.transform(testing_data)

# Evaluate the accuracy of the model (compares label indices, not raw codes)
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")

# Predict the violation type for a new parking ticket. The row is a flat
# tuple with one value per feature column, in the same order as `features`.
new_ticket = spark.createDataFrame(
    [('PAS', 'SUBN', 'ACURA', 14, 'NY', 2018)],
    schema=features,
)
predicted_violation = model.transform(new_ticket).select("predictedViolationCode").collect()[0][0]
print(f"Predicted violation type: {predicted_violation}")
