# Importing machine learning libraries and packages

In [1]:
import findspark
findspark.init()
import pyspark

In [11]:
# import modules
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

# create Spark session
appName = "Classification in Apache Spark"
spark = SparkSession \
    .builder \
    .appName(appName) \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

# Read the dataset file into Spark

In [15]:
# define our schema
flightSchema = StructType([
  StructField("DayofMonth", IntegerType(), False),
  StructField("DayOfWeek", IntegerType(), False),
  StructField("Carrier", StringType(), False),
  StructField("OriginAirportID", IntegerType(), False),
  StructField("DestAirportID", IntegerType(), False),
  StructField("DepDelay", IntegerType(), False),
  StructField("ArrDelay", IntegerType(), False),
])

# read csv data with our defined schema
csv = spark.read.csv('flights.csv', schema=flightSchema, header=True)
csv.show(3)

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          11433|        13303|      -3|       1|
|        19|        5|     DL|          14869|        12478|       0|      -8|
|        19|        5|     DL|          14057|        14869|      -4|     -15|
+----------+---------+-------+---------------+-------------+--------+--------+
only showing top 3 rows



# Handle missing data

In [17]:
# Hapus baris jika ada setidaknya satu kolom kosong
csv2 = csv.dropna(how="any", subset=["DayofMonth","DayOfWeek","Carrier","OriginAirportID",
                       "DestAirportID","ArrDelay", "DepDelay"])

# Select the data feature and convert the ArrDelay column to binary

In [22]:
data = csv2.select(
    "DayofMonth", "DayOfWeek", "OriginAirportID", "DestAirportID", ((col("ArrDelay") > 15).cast("Int").alias("Late")))
data.show(3)

+----------+---------+---------------+-------------+----+
|DayofMonth|DayOfWeek|OriginAirportID|DestAirportID|Late|
+----------+---------+---------------+-------------+----+
|        19|        5|          11433|        13303|   0|
|        19|        5|          14869|        12478|   0|
|        19|        5|          14057|        14869|   0|
+----------+---------+---------------+-------------+----+
only showing top 3 rows



# Split into training data and testing data

In [23]:
# divide data, 70% for training, 30% for testing
dividedData = data.randomSplit([0.7, 0.3]) 
trainingData = dividedData[0] #index 0 = data training
testingData = dividedData[1] #index 1 = data testing
train_rows = trainingData.count()
test_rows = testingData.count()
print("Training data rows:", train_rows, "; Testing data rows:", test_rows)

Training data rows: 1892623 ; Testing data rows: 809595


# Prepare training data

In [24]:
# define an assembler
assembler = VectorAssembler(inputCols = [
    "DayofMonth", "DayOfWeek", "OriginAirportID", "DestAirportID"], outputCol="features")
trainingDataFinal = assembler.transform(
    trainingData).select(col("features"), col("Late").alias("label"))
trainingDataFinal.show(truncate=False, n=2)

+-------------------------+-----+
|features                 |label|
+-------------------------+-----+
|[1.0,1.0,10140.0,10397.0]|0    |
|[1.0,1.0,10140.0,10397.0]|0    |
+-------------------------+-----+
only showing top 2 rows



# Train the model with training data

In [25]:
# define our classifier
classifier = NaiveBayes(labelCol="label", featuresCol="features", smoothing=1.0, modelType="multinomial")

# train our classifier
model = classifier.fit(trainingDataFinal)
print("Model berhasil dilatih!")

Model berhasil dilatih!


# Prepare testing data

In [26]:
testingDataFinal = assembler.transform(
    testingData).select(col("features"), col("Late").alias("trueLabel"))
testingDataFinal.show(3)

+--------------------+---------+
|            features|trueLabel|
+--------------------+---------+
|[1.0,1.0,10140.0,...|        0|
|[1.0,1.0,10140.0,...|        0|
|[1.0,1.0,10140.0,...|        0|
+--------------------+---------+
only showing top 3 rows



# Predict testing data using a trained model

In [27]:
prediction = model.transform(testingDataFinal)
predictionFinal = prediction.select(
    "features", "prediction", "probability", "trueLabel")
predictionFinal.show(truncate=False, n=3)
prediction.show(truncate=False, n=3)

+-------------------------+----------+---------------------------------------+---------+
|features                 |prediction|probability                            |trueLabel|
+-------------------------+----------+---------------------------------------+---------+
|[1.0,1.0,10140.0,10397.0]|0.0       |[0.703935976301514,0.29606402369848595]|0        |
|[1.0,1.0,10140.0,10397.0]|0.0       |[0.703935976301514,0.29606402369848595]|0        |
|[1.0,1.0,10140.0,10821.0]|1.0       |[0.42633192467924,0.5736680753207599]  |0        |
+-------------------------+----------+---------------------------------------+---------+
only showing top 3 rows

+-------------------------+---------+----------------------------------------+---------------------------------------+----------+
|features                 |trueLabel|rawPrediction                           |probability                            |prediction|
+-------------------------+---------+----------------------------------------+--------------

# Calculate model performance or accuracy

In [29]:
correctPrediction = predictionFinal.filter(
    predictionFinal['prediction'] == predictionFinal['trueLabel']).count()
totalData = predictionFinal.count()
print("correct prediction:", correctPrediction, ", total data:", totalData, 
      ", accuracy:", correctPrediction/totalData)
correctPrediction = predictionFinal.filter(
    predictionFinal['prediction'] == predictionFinal['trueLabel']).count()
totalData = predictionFinal.count()
print("correct prediction:", correctPrediction, ", total data:", totalData, 
      ", accuracy:", correctPrediction/totalData)

correct prediction: 452122 , total data: 809595 , accuracy: 0.5584545359099303
correct prediction: 452122 , total data: 809595 , accuracy: 0.5584545359099303
