In [1]:
# Libraries
import pandas as pd
from geopy.distance import geodesic
from datetime import timedelta
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
# Constant Variables
# Directories
DATA_DIRECTORY = 'data/'

# Filenames
CLEANED_DATA_FILENAME = 'cleaned_data.json'

# Load the Data

In [3]:
# Read cleaned_real_time_data.json
df = pd.read_json(DATA_DIRECTORY + CLEANED_DATA_FILENAME)

# Convert the time strings to datetime.time objects
df['RecordedTime'] = pd.to_datetime(df['RecordedTime'], format='%H:%M:%S').dt.time

# Convert the datatime.time objects to int
df['RecordedTime'] = df['RecordedTime'].apply(lambda x: x.hour * 3600 + x.minute * 60 + x.second)

# Set the RouteNo as string type
df['RouteNo'] = df['RouteNo'].astype(str)

In [4]:
df

Unnamed: 0,RouteNo,Direction,Latitude,Longitude,RecordedTime,RecordedDate,OnTime
0,144,NORTH,49.225150,-123.002100,25207,1712448000000,True
1,144,NORTH,49.240633,-122.968050,25993,1712448000000,True
2,144,NORTH,49.258883,-122.964050,26463,1712448000000,True
3,144,NORTH,49.258900,-122.964050,26523,1712448000000,True
4,144,SOUTH,49.243283,-122.973483,27580,1712448000000,True
...,...,...,...,...,...,...,...
8713,R5,WEST,49.278533,-122.912733,82524,1712707200000,False
8714,R5,WEST,49.281400,-123.099283,82513,1712707200000,False
8715,R5,WEST,49.281200,-123.025350,82650,1712707200000,True
8716,R5,WEST,49.281183,-123.025333,82709,1712707200000,True


# Train the Model

In [5]:
# Convert the strings to numerical values
label_encoder = LabelEncoder()
df['RouteNo'] = label_encoder.fit_transform(df['RouteNo'])
df['Direction'] = label_encoder.fit_transform(df['Direction'])
df

Unnamed: 0,RouteNo,Direction,Latitude,Longitude,RecordedTime,RecordedDate,OnTime
0,0,1,49.225150,-123.002100,25207,1712448000000,True
1,0,1,49.240633,-122.968050,25993,1712448000000,True
2,0,1,49.258883,-122.964050,26463,1712448000000,True
3,0,1,49.258900,-122.964050,26523,1712448000000,True
4,0,2,49.243283,-122.973483,27580,1712448000000,True
...,...,...,...,...,...,...,...
8713,2,3,49.278533,-122.912733,82524,1712707200000,False
8714,2,3,49.281400,-123.099283,82513,1712707200000,False
8715,2,3,49.281200,-123.025350,82650,1712707200000,True
8716,2,3,49.281183,-123.025333,82709,1712707200000,True


In [6]:
# Organize the input data and the output data
X = df[['RouteNo', 'Direction', 'Latitude', 'Longitude', 'RecordedTime', 'RecordedDate']]
y = df['OnTime']

In [7]:
# Split the training data and the testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Train the model
# Random Forest
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)

# MLP Classifier
mlp_classifier = MLPClassifier(random_state=42, max_iter=300)
mlp_classifier.fit(X_train, y_train)

# Gaussian Naive Bayes
gnb_classifier = GaussianNB()
gnb_classifier.fit(X_train, y_train)

In [9]:
# Evaluation for Random Forest
y_pred = classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Random Forest Accuracy:", accuracy)
print("Random Forest Precision:", precision)
print("Random Forest Recall:", recall)
print("Random Forest F1 Score:", f1)

# Evaluation for MLP
y_pred_mlp = mlp_classifier.predict(X_test)

accuracy_mlp = accuracy_score(y_test, y_pred_mlp)
precision_mlp = precision_score(y_test, y_pred_mlp)
recall_mlp = recall_score(y_test, y_pred_mlp)
f1_mlp = f1_score(y_test, y_pred_mlp)

print("MLP Accuracy:", accuracy_mlp)
print("MLP Precision:", precision_mlp)
print("MLP Recall:", recall_mlp)
print("MLP F1 Score:", f1_mlp)

# Evaluation for Gaussian
y_pred_gnb = gnb_classifier.predict(X_test)

accuracy_gnb = accuracy_score(y_test, y_pred_gnb)
precision_gnb = precision_score(y_test, y_pred_gnb)
recall_gnb = recall_score(y_test, y_pred_gnb)
f1_gnb = f1_score(y_test, y_pred_gnb)

print("GaussianNB Accuracy:", accuracy_gnb)
print("GaussianNB Precision:", precision_gnb)
print("GaussianNB Recall:", recall_gnb)
print("GaussianNB F1 Score:", f1_gnb)

Random Forest Accuracy: 0.7454128440366973
Random Forest Precision: 0.780466724286949
Random Forest Recall: 0.8261665141811528
Random Forest F1 Score: 0.8026666666666668
MLP Accuracy: 0.6267201834862385
MLP Precision: 0.6267201834862385
MLP Recall: 1.0
MLP F1 Score: 0.7705322523792739
GaussianNB Accuracy: 0.6267201834862385
GaussianNB Precision: 0.6267201834862385
GaussianNB Recall: 1.0
GaussianNB F1 Score: 0.7705322523792739
