In [1]:
# Libraries
import pandas as pd
from geopy.distance import geodesic
from datetime import timedelta
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

In [2]:
# Configuration constants
# Folder that holds the data files
DATA_DIRECTORY: str = 'data/'

# Name of the cleaned dataset file inside DATA_DIRECTORY
CLEANED_DATA_FILENAME: str = 'cleaned_data.json'

# Load the Data

In [3]:
# Read the cleaned dataset (DATA_DIRECTORY + CLEANED_DATA_FILENAME)
df = pd.read_json(DATA_DIRECTORY + CLEANED_DATA_FILENAME)

# Parse the 'HH:MM:SS' strings and convert them to seconds since midnight.
# The .dt accessors work on the whole column at once, so no per-row
# Python lambda is needed.
recorded_time = pd.to_datetime(df['RecordedTime'], format='%H:%M:%S')
df['RecordedTime'] = (
    recorded_time.dt.hour * 3600
    + recorded_time.dt.minute * 60
    + recorded_time.dt.second
)

# Set the RouteNo as string type
df['RouteNo'] = df['RouteNo'].astype(str)

In [4]:
# Display the loaded frame (pandas elides the middle rows of a long frame)
df

Unnamed: 0,RouteNo,Direction,Latitude,Longitude,RecordedTime,RecordedDate,OnTime
0,144,NORTH,49.225150,-123.002100,25207,1712448000000,True
1,144,NORTH,49.225700,-122.994100,25403,1712448000000,True
2,144,NORTH,49.229850,-122.992800,25485,1712448000000,True
3,144,NORTH,49.229650,-122.987017,25543,1712448000000,True
4,144,NORTH,49.229450,-122.975800,25603,1712448000000,True
...,...,...,...,...,...,...,...
68452,R5,WEST,49.280133,-122.920233,82761,1712707200000,True
68453,R5,EAST,49.280367,-122.956417,82766,1712707200000,True
68454,R5,WEST,49.281200,-123.025333,82770,1712707200000,True
68455,R5,EAST,49.281083,-123.056533,82755,1712707200000,True


# Train the Model

In [5]:
# Convert the string columns to integer codes.
# Each column gets its own encoder: fit_transform refits the encoder, so a
# single shared instance would only remember the classes of the last column
# it was fitted on, and the RouteNo codes could not be mapped back to route
# names with inverse_transform.
route_encoder = LabelEncoder()
direction_encoder = LabelEncoder()

df['RouteNo'] = route_encoder.fit_transform(df['RouteNo'])
df['Direction'] = direction_encoder.fit_transform(df['Direction'])

# Keep the `label_encoder` name bound (to the Direction encoder) for any
# code that still refers to it.
label_encoder = direction_encoder
df

Unnamed: 0,RouteNo,Direction,Latitude,Longitude,RecordedTime,RecordedDate,OnTime
0,0,1,49.225150,-123.002100,25207,1712448000000,True
1,0,1,49.225700,-122.994100,25403,1712448000000,True
2,0,1,49.229850,-122.992800,25485,1712448000000,True
3,0,1,49.229650,-122.987017,25543,1712448000000,True
4,0,1,49.229450,-122.975800,25603,1712448000000,True
...,...,...,...,...,...,...,...
68452,2,3,49.280133,-122.920233,82761,1712707200000,True
68453,2,0,49.280367,-122.956417,82766,1712707200000,True
68454,2,3,49.281200,-123.025333,82770,1712707200000,True
68455,2,0,49.281083,-123.056533,82755,1712707200000,True


In [6]:
# Model inputs (feature columns) and the target column
feature_columns = [
    'RouteNo',
    'Direction',
    'Latitude',
    'Longitude',
    'RecordedTime',
    'RecordedDate',
]
X = df[feature_columns]
y = df['OnTime']

In [7]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [8]:
# Build a 100-tree random forest with a fixed seed and fit it on the training split.
# fit() returns the estimator itself, so construction and training can be chained.
classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
classifier

In [9]:
# Score the fitted classifier on the held-out test split
y_pred = classifier.predict(X_test)

# Every scorer takes (true labels, predicted labels), so compute them in one pass
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report each metric on its own line
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(label, value)

Accuracy: 0.9864884604148407
Precision: 0.9891967369736165
Recall: 0.9971847681137946
F1 Score: 0.9931746910164176
