In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=eacfc9f478db364997f97789570c947db081ab35469a553efd0866fb0870cb2b
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [2]:
#naive bayes
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Read data from the vehicle_stolen_dataset.csv
spark=SparkSession.builder.appName("bayesclass").getOrCreate()
data=spark.read.csv('vehicle_stolen_dataset.csv',inferSchema=True)
data.show()

+----+------+-----+-----+---+
| _c0|   _c1|  _c2|  _c3|_c4|
+----+------+-----+-----+---+
|N001|   BMW|black|night|yes|
|N002|  Audi|black|night| no|
|N003|NISSAN|black|night|yes|
|N004|  VEGA|  red|  day|yes|
|N005|   BMW| blue|  day| no|
|N006|  Audi|black|  day|yes|
|N007|  VEGA|  red|night| no|
|N008|  Audi| blue|  day|yes|
|N009|  VEGA|black|  day|yes|
|N010|NISSAN| blue|  day| no|
|N011|   BMW|black|night|yes|
|N012|NISSAN|  red|  day| no|
|N013|  VEGA|black|night|yes|
|N014|   BMW|  red|  day| no|
|N015|  Audi|black|  day|yes|
|N016|  Audi| blue|night|yes|
|N017|  Audi|  red|  day| no|
|N018|NISSAN|black|  day|yes|
|N019|   BMW| blue|  day|yes|
|N020|   BMW|  red|night|yes|
+----+------+-----+-----+---+



In [3]:
data.columns

['_c0', '_c1', '_c2', '_c3', '_c4']

In [4]:
vehicle_df = data.select(col("_c0").alias("number_plate"),  col("_c1").alias("brand"),
col("_c2").alias("color"),
col("_c3").alias("time"),
col("_c4").alias("stoled"))

In [5]:
indexers = [
StringIndexer(inputCol="brand", outputCol = "brand_index"),
StringIndexer(inputCol="color", outputCol = "color_index"),  StringIndexer(inputCol="time", outputCol = "time_index"),  StringIndexer(inputCol="stoled", outputCol = "label")]
pipeline = Pipeline(stages=indexers)
#Fitting a model to the input dataset.
indexed_vehicle_df = pipeline.fit(vehicle_df).transform(vehicle_df)
indexed_vehicle_df.show(5,False)

+------------+------+-----+-----+------+-----------+-----------+----------+-----+
|number_plate|brand |color|time |stoled|brand_index|color_index|time_index|label|
+------------+------+-----+-----+------+-----------+-----------+----------+-----+
|N001        |BMW   |black|night|yes   |1.0        |0.0        |1.0       |0.0  |
|N002        |Audi  |black|night|no    |0.0        |0.0        |1.0       |1.0  |
|N003        |NISSAN|black|night|yes   |2.0        |0.0        |1.0       |0.0  |
|N004        |VEGA  |red  |day  |yes   |3.0        |1.0        |0.0       |0.0  |
|N005        |BMW   |blue |day  |no    |1.0        |2.0        |0.0       |1.0  |
+------------+------+-----+-----+------+-----------+-----------+----------+-----+
only showing top 5 rows



In [6]:
vectorAssembler = VectorAssembler(inputCols = ["brand_index", "color_index", "time_index"],outputCol = "features")
vindexed_vehicle_df = vectorAssembler.transform(indexed_vehicle_df)
vindexed_vehicle_df.show(5, False)

+------------+------+-----+-----+------+-----------+-----------+----------+-----+-------------+
|number_plate|brand |color|time |stoled|brand_index|color_index|time_index|label|features     |
+------------+------+-----+-----+------+-----------+-----------+----------+-----+-------------+
|N001        |BMW   |black|night|yes   |1.0        |0.0        |1.0       |0.0  |[1.0,0.0,1.0]|
|N002        |Audi  |black|night|no    |0.0        |0.0        |1.0       |1.0  |[0.0,0.0,1.0]|
|N003        |NISSAN|black|night|yes   |2.0        |0.0        |1.0       |0.0  |[2.0,0.0,1.0]|
|N004        |VEGA  |red  |day  |yes   |3.0        |1.0        |0.0       |0.0  |[3.0,1.0,0.0]|
|N005        |BMW   |blue |day  |no    |1.0        |2.0        |0.0       |1.0  |[1.0,2.0,0.0]|
+------------+------+-----+-----+------+-----------+-----------+----------+-----+-------------+
only showing top 5 rows



In [7]:
indexed_vehicle_df.show(3)

+------------+------+-----+-----+------+-----------+-----------+----------+-----+
|number_plate| brand|color| time|stoled|brand_index|color_index|time_index|label|
+------------+------+-----+-----+------+-----------+-----------+----------+-----+
|        N001|   BMW|black|night|   yes|        1.0|        0.0|       1.0|  0.0|
|        N002|  Audi|black|night|    no|        0.0|        0.0|       1.0|  1.0|
|        N003|NISSAN|black|night|   yes|        2.0|        0.0|       1.0|  0.0|
+------------+------+-----+-----+------+-----------+-----------+----------+-----+
only showing top 3 rows



In [8]:
splits = vindexed_vehicle_df.randomSplit([0.6,0.4], 42)
# optional value 42 is seed for sampling
train_df = splits[0]
test_df = splits[1]

In [9]:
nb = NaiveBayes(modelType="multinomial")

In [10]:
nbmodel = nb.fit(train_df)

In [11]:
predictions_df = nbmodel.transform(test_df)
predictions_df.show(5, True)

+------------+------+-----+-----+------+-----------+-----------+----------+-----+-------------+--------------------+--------------------+----------+
|number_plate| brand|color| time|stoled|brand_index|color_index|time_index|label|     features|       rawPrediction|         probability|prediction|
+------------+------+-----+-----+------+-----------+-----------+----------+-----+-------------+--------------------+--------------------+----------+
|        N001|   BMW|black|night|   yes|        1.0|        0.0|       1.0|  0.0|[1.0,0.0,1.0]|[-2.8415815937267...|[0.70850202429149...|       0.0|
|        N003|NISSAN|black|night|   yes|        2.0|        0.0|       1.0|  0.0|[2.0,0.0,1.0]|[-3.5347287742866...|[0.85868498527968...|       0.0|
|        N005|   BMW| blue|  day|    no|        1.0|        2.0|       0.0|  1.0|[1.0,2.0,0.0]|[-3.2470467018348...|[0.80201649862511...|       0.0|
|        N007|  VEGA|  red|night|    no|        3.0|        1.0|       1.0|  1.0|[3.0,1.0,1.0]|[-5.3264882

In [12]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
nbaccuracy = evaluator.evaluate(predictions_df)
print("Test accuracy = " + str(nbaccuracy))

Test accuracy = 0.5
