<h1>Machine Learning in Big Data

<h2>1 Data Loading and exploration</h2>
<h3>1.1 Data Loading

In [None]:
from pyspark import SparkConf
from pyspark import SparkContext # Spark
from pyspark.sql import SparkSession # Spark SQL

# local[*]: run Spark in local mode with as many working processors as logical cores on my machine
master = "local[*]"
# Set the appname
app_name = "Building models to predict pedestrian traffic"
# Setup configuration parameters for Spark
spark_conf = SparkConf().setMaster(master).setAppName(app_name).set("spark.sql.session.timeZone", "UTC")
print(spark_conf.toDebugString())

# Using SparkSession
spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()
sc = spark.sparkContext
sc.setLogLevel('ERROR')

In [2]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, FloatType, TimestampType

# defining the Monthly counts data schema
montly_counts_schema = StructType([StructField("ID",IntegerType(),False), 
                                   StructField("Date_Time", StringType(),False), 
                                   StructField("Year", IntegerType(),True), 
                                   StructField("Month", StringType(),True), 
                                   StructField("Mdate", IntegerType(),True), 
                                   StructField("Day", StringType(),True), 
                                   StructField("Time", IntegerType(),True),
                                   StructField("Sensor_ID", IntegerType(),True),
                                   StructField("Sensor_Name", StringType(),True),
                                   StructField("Hourly_Counts", IntegerType(),True)])

In [3]:
from pyspark.sql.functions import *
# function to read csv into df with a given schema
def readCsvToDf(filename:str, schema):
    df = spark.read.format("csv").option("header", "true").schema(schema).load(filename)
    df = df.withColumn("Date_Time",to_timestamp(col("Date_Time"), "MM/dd/yyyy hh:mm:ss a").alias("Date_Time"))
    df = df.withColumn("Date_Time",col("Date_Time").cast(TimestampType()))
    df.printSchema()
    return df

# load pedestrian count CSV file into a dataframe
ped_counts_df = readCsvToDf("Pedestrian_Counting_System_-_Monthly__counts_per_hour.csv", montly_counts_schema)

root
 |-- ID: integer (nullable = true)
 |-- Date_Time: timestamp (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: string (nullable = true)
 |-- Mdate: integer (nullable = true)
 |-- Day: string (nullable = true)
 |-- Time: integer (nullable = true)
 |-- Sensor_ID: integer (nullable = true)
 |-- Sensor_Name: string (nullable = true)
 |-- Hourly_Counts: integer (nullable = true)



In [4]:
from pyspark.sql.functions import when, col

ped_counts_df = ped_counts_df.withColumn('above_threshold', when(col('Hourly_Counts') >= 2000, 1.0).otherwise(0.0))
ped_counts_df.printSchema()
ped_counts_df.show(11)

root
 |-- ID: integer (nullable = true)
 |-- Date_Time: timestamp (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: string (nullable = true)
 |-- Mdate: integer (nullable = true)
 |-- Day: string (nullable = true)
 |-- Time: integer (nullable = true)
 |-- Sensor_ID: integer (nullable = true)
 |-- Sensor_Name: string (nullable = true)
 |-- Hourly_Counts: integer (nullable = true)
 |-- above_threshold: double (nullable = false)

+-------+-------------------+----+--------+-----+------+----+---------+--------------------+-------------+---------------+
|     ID|          Date_Time|Year|   Month|Mdate|   Day|Time|Sensor_ID|         Sensor_Name|Hourly_Counts|above_threshold|
+-------+-------------------+----+--------+-----+------+----+---------+--------------------+-------------+---------------+
|2887628|2019-11-01 17:00:00|2019|November|    1|Friday|  17|       34|Flinders St-Spark La|          300|            0.0|
|2887629|2019-11-01 17:00:00|2019|November|    1|Friday|  17

<h3>Exploring the data

In [5]:
# concatenate the arrays within an array to just an array within an array
def concatenate_array(array):
    for i in range(1, len(array)):
        array[0] += array[i]
    return array[0]

columns_without_thres_datetime = ["ID", "Year", "Mdate", "Time", "Sensor_ID", "Hourly_Counts"]
# Get the count mean, stddev, min and max
stats_df = ped_counts_df.select(columns_without_thres_datetime).describe()

quantiles = [0.25,0.50,0.75]
all_quantiles = []

# get the 25,50,75 percentiles
for i in range(len(quantiles)):
    quantile = quantiles[i]
    # get the quantile
    quantile_stats = ped_counts_df.approxQuantile(columns_without_thres_datetime, [quantile], 0.25)
    quantile_stats = concatenate_array(quantile_stats)
    # create summary title
    title = str(quantile)[2:]
    if len(title) < 2:
        title += "0"
    quantile_stats.insert(0, title+"%")
    all_quantiles.append(quantile_stats)

# create data frame rows of quantiles values
quantiles_df = spark.createDataFrame(all_quantiles)

# merge the percentiles rows with the stats dataframe
stats_df = stats_df.union(quantiles_df)
stats_df.show()

+-------+-----------------+------------------+------------------+------------------+------------------+-----------------+
|summary|               ID|              Year|             Mdate|              Time|         Sensor_ID|    Hourly_Counts|
+-------+-----------------+------------------+------------------+------------------+------------------+-----------------+
|  count|          3435106|           3435106|           3435106|           3435106|           3435106|          3435106|
|   mean|        1717553.5|2016.0032330880038|15.751918863639142|11.459955238644746|22.978422791028866|560.7805942524044|
| stddev|991629.8312350252|3.1237869143646275|  8.79918757461428| 6.943473866829414|16.229792156265397|809.9942576353371|
|    min|                1|              2009|                 1|                 0|                 1|                0|
|    max|          3435106|              2020|                31|                23|                71|            15979|
|    25%|              1

In [6]:
ped_counts_df.groupby("above_threshold").count().show()

+---------------+-------+
|above_threshold|  count|
+---------------+-------+
|            0.0|3184164|
|            1.0| 250942|
+---------------+-------+



<h2>2. Feature extraction and ML training
<h3>2.1 Preparing Spark ML Transformers/Estimators for features, labels and models 

In [7]:
# create udf to convert Months: January, February, March... to 1,2,3...
def month_to_integer(s):
    if s=='January':
        return 1
    elif s=='February':
        return 2
    elif s=='March':
        return 3
    elif s=='April':
        return 4
    elif s=='May':
        return 5
    elif s=='June':
        return 6
    elif s=='July':
        return 7
    elif s=='August':
        return 8
    elif s=='September':
        return 9
    elif s=='October':
        return 10
    elif s=='November':
        return 11
    else:
        return 12

month_to_integer_udf = udf(month_to_integer,IntegerType())

# create a UDF to convert the Mondays, Tuesdays ... to 1,2....  
def day_to_integer(s):
    if s=='Monday':
        return 1
    elif s=='Tuesday':
        return 2
    elif s=='Wednesday':
        return 3
    elif s=='Thursday':
        return 4
    elif s=='Friday':
        return 5
    elif s=='Saturday':
        return 6
    else:
        return 7

day_to_integer_udf = udf(day_to_integer,IntegerType())

# Add columns Month (in integer), day_of_week and week_of_year
new_df = ped_counts_df.withColumn("Month_Integer",month_to_integer_udf(ped_counts_df['Month']))
new_df = new_df.withColumn("day_of_week", day_to_integer_udf(new_df['Day']))
new_df = new_df.withColumn("week_of_year", weekofyear(new_df['Date_Time']))

# filter data between 2014 to 2019 and only for hours between 9am to 11pm
new_df = new_df.filter((new_df['Year'] >= 2014) & (new_df['Year'] <= 2019) & (new_df['Time'] >= 9) & (new_df['Time'] <= 23))

# add label column for ml models' column later on
new_df = new_df.withColumn("label", new_df["above_threshold"])
new_df.show(100)

+-------+-------------------+----+--------+-----+------+----+---------+--------------------+-------------+---------------+-------------+-----------+------------+-----+
|     ID|          Date_Time|Year|   Month|Mdate|   Day|Time|Sensor_ID|         Sensor_Name|Hourly_Counts|above_threshold|Month_Integer|day_of_week|week_of_year|label|
+-------+-------------------+----+--------+-----+------+----+---------+--------------------+-------------+---------------+-------------+-----------+------------+-----+
|2887628|2019-11-01 17:00:00|2019|November|    1|Friday|  17|       34|Flinders St-Spark La|          300|            0.0|           11|          5|          44|  0.0|
|2887629|2019-11-01 17:00:00|2019|November|    1|Friday|  17|       39|        Alfred Place|          604|            0.0|           11|          5|          44|  0.0|
|2887630|2019-11-01 17:00:00|2019|November|    1|Friday|  17|       37|     Lygon St (East)|          216|            0.0|           11|          5|          44

In [8]:
from pyspark.ml.feature import OneHotEncoder, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, LogisticRegression

# Setting up the one hot encoder
inputCols = ['Month_Integer', 'Mdate', 'week_of_year', 'day_of_week', 'Time', 'Sensor_ID']
outputCols = [f'{x}_vec' for x in inputCols]
encoder = OneHotEncoder(inputCols=inputCols,outputCols=outputCols, handleInvalid='keep')

# Setting up the Vector Assembler
inputColsAssembler = outputCols
assembler = VectorAssembler(inputCols = inputColsAssembler, outputCol="features")

# Setting up Logistic Regression Model
logistic_regression = LogisticRegression(labelCol='above_threshold', featuresCol='features')

# Setting up Decision Tree 
decision_tree = DecisionTreeClassifier(labelCol='above_threshold', featuresCol = 'features')

# Setting up Random Forest
random_forest = RandomForestClassifier(labelCol='above_threshold', featuresCol='features')

In [9]:
from pyspark.ml import Pipeline

lr_pipeline = Pipeline(stages=[encoder,assembler,logistic_regression])
dt_pipeline = Pipeline(stages=[encoder,assembler,decision_tree])
rf_pipeline = Pipeline(stages=[encoder,assembler,random_forest])

<h3>2.2 Preparing the training data and testing data</h3>   

In [10]:
# Filter data from 2014 to 2018 as training data
train_df = new_df.filter((new_df['Year'] >= 2014) & (new_df['Year'] <= 2018))
# Filter data from 2019 for testing purposes
test_df = new_df.filter(new_df['Year'] == 2019)

<h3>2.3 Training and evaluating models</h3>

In [11]:
# Train and test logistic regression
lr_model = lr_pipeline.fit(train_df)
lr_predictions = lr_model.transform(test_df)
lr_predictions.select('features','above_threshold','prediction').show()

# Train and test decision tree
dt_model = dt_pipeline.fit(train_df)
dt_predictions = dt_model.transform(test_df)
dt_predictions.select('features','above_threshold','prediction').show()

# Train and test logistic regression
rf_model = rf_pipeline.fit(train_df)
rf_predictions = rf_model.transform(test_df)
rf_predictions.select('features','above_threshold','prediction').show()

+--------------------+---------------+----------+
|            features|above_threshold|prediction|
+--------------------+---------------+----------+
|(190,[11,14,89,10...|            0.0|       0.0|
|(190,[11,14,89,10...|            0.0|       0.0|
|(190,[11,14,89,10...|            0.0|       0.0|
|(190,[11,14,89,10...|            0.0|       0.0|
|(190,[11,14,89,10...|            0.0|       0.0|
|(190,[11,14,89,10...|            0.0|       0.0|
|(190,[11,14,89,10...|            0.0|       0.0|
|(190,[11,14,89,10...|            0.0|       0.0|
|(190,[11,14,89,10...|            0.0|       0.0|
|(190,[11,14,89,10...|            1.0|       1.0|
|(190,[11,14,89,10...|            1.0|       1.0|
|(190,[11,14,89,10...|            0.0|       0.0|
|(190,[11,14,89,10...|            1.0|       1.0|
|(190,[11,14,89,10...|            0.0|       0.0|
|(190,[11,14,89,10...|            0.0|       0.0|
|(190,[11,14,89,10...|            0.0|       0.0|
|(190,[11,14,89,10...|            0.0|       0.0|


In [12]:
# function to compute the metrics of the predictions
def compute_metrics(predictions):
    values_df = predictions.groupBy("above_threshold","prediction").count()
    values_df.show()
    try:
        TP = values_df.filter((values_df['above_threshold'] == 1.0)&(values_df['prediction']==1.0)).select(values_df['count']).collect()[0][0]
    except:
        TP = 0
    try:
        TN = values_df.filter((values_df['above_threshold'] == 0.0)&(values_df['prediction']==0.0)).select(values_df['count']).collect()[0][0]
    except:
        TN = 0
    try:
        FP = values_df.filter((values_df['above_threshold'] == 0.0)&(values_df['prediction']==1.0)).select(values_df['count']).collect()[0][0]
    except:
        FP = 0
    try:
        FN = values_df.filter((values_df['above_threshold'] == 1.0)&(values_df['prediction']==0.0)).select(values_df['count']).collect()[0][0]
    except:
        FN = 0
        
    # calculate metrics by the confusion matrix
    accuracy = (TP+TN) / (TP+TN+FN+FP)    # formula to find accuracy
    try:
        precision = TP / (TP+FP)              # formula to find precision
    except ZeroDivisionError:
        precision = 'undefined'
    recall = TP / (TP+FN)                 # formula to find recall
    
    print("Accuracy: "+str(accuracy))
    print("Precision: "+str(precision))
    print("Recall: "+str(recall))
    
    return accuracy,precision,recall
    
# # Logistic Model 
print("Logistic Regression Model: ")
metrics_calc_lr_df = lr_predictions.select('features', 'above_threshold','prediction')
accuracy,precision,recall = compute_metrics(metrics_calc_lr_df)

# Decision Tree Model
print("\n\nDecision Tree Model: ")
metrics_calc_dt_df = dt_predictions.select('features', 'above_threshold','prediction')
accuracy,precision,recall = compute_metrics(metrics_calc_dt_df)

# # Random Tree Model
print("\n\nRandom Tree Model: ")
metrics_calc_rf_df = rf_predictions.select('features', 'above_threshold','prediction')
accuracy,precision,recall = compute_metrics(metrics_calc_rf_df)

Logistic Regression Model: 
+---------------+----------+------+
|above_threshold|prediction| count|
+---------------+----------+------+
|            1.0|       1.0| 19728|
|            0.0|       1.0|  6453|
|            1.0|       0.0| 11313|
|            0.0|       0.0|248038|
+---------------+----------+------+

Accuracy: 0.9377793031954387
Precision: 0.7535235476108628
Recall: 0.6355465352276022


Decision Tree Model: 
+---------------+----------+------+
|above_threshold|prediction| count|
+---------------+----------+------+
|            1.0|       1.0|  7683|
|            0.0|       1.0|  6845|
|            1.0|       0.0| 23358|
|            0.0|       0.0|247646|
+---------------+----------+------+

Accuracy: 0.8942220136447053
Precision: 0.528840859030837
Recall: 0.24751135594858414


Random Tree Model: 
+---------------+----------+------+
|above_threshold|prediction| count|
+---------------+----------+------+
|            1.0|       0.0| 31041|
|            0.0|       0.0|2544

 
<b>In terms of accuracy, Logistic Regression is a few percent better than the Decision Tree and Random Tree model which both have essentially the same accuracy. In terms of precision and recall, Logistic Regression is much better than the other two. The Decision Tree model suffers from low precision and recall. On the other hand, due to Random Tree model predicting everything as negative, its precision and recall is undefined and 0 respectively. This could be mainly attributed to the data imbalance of above and below threshold values we noticed before. 

Based on the performance metrics, it is clear that the Logistic Regression model is much better than the other two models as all three values of accuracy, precision and recall are higher than the other two. Therefore, we will persist with the Logistic Regression model.

In [None]:
# Persist the pipeline model
lr_fitted_mode.write().overwrite().save("pedestrian_prediction_model")

Print out/visualize the tree structure, and get the top-3 features with the corresponding feature importance

In [13]:
# Print the if-else tree structure
decision_tree_model = dt_model.stages[2]
print(decision_tree_model.toDebugString)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_9adbe19f8eb1, depth=5, numNodes=49, numClasses=2, numFeatures=190
  If (feature 169 in {1.0})
   If (feature 130 in {1.0})
    If (feature 104 in {1.0})
     If (feature 98 in {1.0})
      Predict: 0.0
     Else (feature 98 not in {1.0})
      If (feature 46 in {1.0})
       Predict: 0.0
      Else (feature 46 not in {1.0})
       Predict: 1.0
    Else (feature 104 not in {1.0})
     If (feature 105 in {0.0})
      If (feature 49 in {1.0})
       Predict: 1.0
      Else (feature 49 not in {1.0})
       Predict: 0.0
     Else (feature 105 not in {0.0})
      If (feature 46 in {1.0})
       Predict: 0.0
      Else (feature 46 not in {1.0})
       Predict: 1.0
   Else (feature 130 not in {1.0})
    If (feature 129 in {1.0})
     If (feature 104 in {1.0})
      If (feature 98 in {1.0})
       Predict: 0.0
      Else (feature 98 not in {1.0})
       Predict: 1.0
     Else (feature 104 not in {1.0})
      If (feature 105 in {1.0})
 

In [14]:
import pandas as pd
# Function taken from lab 6: Used to extract the features and map it to their feature columns
def ExtractFeatureImp(featureImp, dataset, featuresCol):
    list_extract = []
    for i in dataset.schema[featuresCol].metadata["ml_attr"]["attrs"]:
        list_extract = list_extract + dataset.schema[featuresCol].metadata["ml_attr"]["attrs"][i]
    varlist = pd.DataFrame(list_extract)
    varlist['score'] = varlist['idx'].apply(lambda x: featureImp[x])
    return(varlist.sort_values('score', ascending = False))

# get the decision tree from the pipelined model
decision_tree = dt_model.stages[2]
# Print the top-3 features with each corresponding feature importance(represented by name and score respectively in the table below)
ExtractFeatureImp(decision_tree.featureImportances, dt_predictions, "features").head(3)

Unnamed: 0,idx,name,score
169,169,Sensor_ID_vec_38,0.218201
135,135,Sensor_ID_vec_4,0.179074
134,134,Sensor_ID_vec_3,0.167338
