# Classification Process in Spark

In [1]:
#Import Modules and create spark Session

from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler

In [2]:
#Create Spark Session

appName = 'Classification in Spark'
spark = SparkSession.builder.appName(appName).config("spark.some.config.option","some-value").getOrCreate()

In [3]:
#Define our Schema

flightSchema = StructType([
StructField("DayofMonth",StringType(),False),
StructField("DayofWeek",StringType(),False),
StructField("Carrier",StringType(),False),
StructField("OrginalAirportID",StringType(),False),
StructField("DestAirportID",StringType(),False),
StructField("DepDelay",StringType(),False),
StructField("ArrDelay",StringType(),False)
])


In [4]:
#Read csv data with our defined schema
flightData = spark.read.csv("dataset/flights.csv",inferSchema=True,header=True)

In [5]:
flightData.show(4)

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          11433|        13303|      -3|       1|
|        19|        5|     DL|          14869|        12478|       0|      -8|
|        19|        5|     DL|          14057|        14869|      -4|     -15|
|        19|        5|     DL|          15016|        11433|      28|      24|
+----------+---------+-------+---------------+-------------+--------+--------+
only showing top 4 rows



In [6]:
flightData.describe()

DataFrame[summary: string, DayofMonth: string, DayOfWeek: string, Carrier: string, OriginAirportID: string, DestAirportID: string, DepDelay: string, ArrDelay: string]

# Select important data for classification features and change arrival delay into binary class late vs not late. 

In [7]:
data = flightData.select("DayofMonth","DayofWeek","OriginAirportID","DestAirportID","DepDelay",((col("ArrDelay") > 15).cast("Int").alias("Late")))

In [8]:
data.show(3)

+----------+---------+---------------+-------------+--------+----+
|DayofMonth|DayofWeek|OriginAirportID|DestAirportID|DepDelay|Late|
+----------+---------+---------------+-------------+--------+----+
|        19|        5|          11433|        13303|      -3|   0|
|        19|        5|          14869|        12478|       0|   0|
|        19|        5|          14057|        14869|      -4|   0|
+----------+---------+---------------+-------------+--------+----+
only showing top 3 rows



# Divide data into training and testing data

In [9]:
#divide data, 70% for training, 30% for testing

dividedData = data.randomSplit([0.7,0.3])

trainingData = dividedData[0] #index 0 = data training

testingData = dividedData[1] #index 1 = data testing

train_rows = trainingData.count()

test_rows = testingData.count()

print("Training data rows:",train_rows,"; Testing data rows:",test_rows)


Training data rows: 734105 ; Testing data rows: 314470


# Prepare training data

In [10]:
assembler = VectorAssembler(inputCols = ["DayofMonth","DayofWeek","OriginAirportID","DestAirportID","DepDelay"], outputCol = "features")

trainingDataFinal = assembler.transform(testingData).select(col("features"),col("Late").alias("label"))

trainingDataFinal.show(truncate=False,n=2)

+------------------------------+-----+
|features                      |label|
+------------------------------+-----+
|[1.0,1.0,10140.0,10397.0,-4.0]|0    |
|[1.0,1.0,10140.0,10821.0,8.0] |0    |
+------------------------------+-----+
only showing top 2 rows



# Train our classifier model using training data

In [11]:
#define our classifier
classifier = LogisticRegression(labelCol = "label",featuresCol = "features",maxIter=10,regParam=0.3)

In [12]:
#Train our classifier

model = classifier.fit(trainingDataFinal)
print("Classifier model is trained :",model)

Classifier model is trained : LogisticRegressionModel: uid = LogisticRegression_e5b7a7371587, numClasses = 2, numFeatures = 5


In [13]:
#Prepare testing data

testingDataFinal = assembler.transform(testingData).select(col("features"),col("Late").alias("trueLabel"))

testingDataFinal.show(5)


+--------------------+---------+
|            features|trueLabel|
+--------------------+---------+
|[1.0,1.0,10140.0,...|        0|
|[1.0,1.0,10140.0,...|        0|
|[1.0,1.0,10140.0,...|        0|
|[1.0,1.0,10140.0,...|        0|
|[1.0,1.0,10140.0,...|        1|
+--------------------+---------+
only showing top 5 rows



# Predict the testing data using our trained model

In [14]:
prediction = model.transform(testingDataFinal)

predictionFinal = prediction.select("features","prediction","probability","trueLabel")


In [15]:
predictionFinal.show(truncate=False,n=3)

prediction.show(truncate=False,n=3)

+------------------------------+----------+----------------------------------------+---------+
|features                      |prediction|probability                             |trueLabel|
+------------------------------+----------+----------------------------------------+---------+
|[1.0,1.0,10140.0,10397.0,-4.0]|0.0       |[0.8273926479333787,0.17260735206662126]|0        |
|[1.0,1.0,10140.0,10821.0,8.0] |0.0       |[0.8032489861204061,0.196751013879594]  |0        |
|[1.0,1.0,10140.0,11259.0,0.0] |0.0       |[0.8200672801109562,0.17993271988904377]|0        |
+------------------------------+----------+----------------------------------------+---------+
only showing top 3 rows

+------------------------------+---------+----------------------------------------+----------------------------------------+----------+
|features                      |trueLabel|rawPrediction                           |probability                             |prediction|
+------------------------------+------

In [16]:
#Calculate our model performance

correctPrediction = predictionFinal.filter(predictionFinal['prediction'] == predictionFinal['trueLabel']).count()

totalData = predictionFinal.count()

print("correct prediction:", correctPrediction,",total data:", totalData,",accuracy:", correctPrediction/totalData)

correct prediction: 257107 ,total data: 314470 ,accuracy: 0.8175883232104811


# Try another classification algorithm provided by Spark

In [17]:
from pyspark.ml.classification import RandomForestClassifier

In [18]:
model2 = RandomForestClassifier(numTrees=3, maxDepth=5, seed=42, labelCol="label",featuresCol="features")

model2 = model2.fit(trainingDataFinal)

print("Model is trained",model2)

Model is trained RandomForestClassificationModel (uid=RandomForestClassifier_d1ddad3057da) with 3 trees


In [19]:
prediction = model2.transform(testingDataFinal)

predictionFinal = prediction.select("features","prediction","probability","trueLabel")

predictionFinal.show(truncate=False,n=3)

correctPrediction = predictionFinal.filter(predictionFinal['prediction'] == predictionFinal['trueLabel']).count()

totalData = predictionFinal.count()


+------------------------------+----------+---------------------------------------+---------+
|features                      |prediction|probability                            |trueLabel|
+------------------------------+----------+---------------------------------------+---------+
|[1.0,1.0,10140.0,10397.0,-4.0]|0.0       |[0.929469827751556,0.07053017224844393]|0        |
|[1.0,1.0,10140.0,10821.0,8.0] |0.0       |[0.929469827751556,0.07053017224844393]|0        |
|[1.0,1.0,10140.0,11259.0,0.0] |0.0       |[0.929469827751556,0.07053017224844393]|0        |
+------------------------------+----------+---------------------------------------+---------+
only showing top 3 rows



In [20]:
print("correct prediction",correctPrediction, ",total data:", totalData, ",accuracy", correctPrediction/totalData)

correct prediction 290321 ,total data: 314470 ,accuracy 0.9232073011734029
