In [146]:
import pandas as pd
import numpy as np

# Data cleaning and preparation

We start by loading data from July to September 2022.

In [149]:
def load_data(month, days=31):
    full_df = pd.read_csv(f"./data/2022-{str(month)}-1.csv")
    for date in range(2, days+1):
        test_df = pd.read_csv(f'./data/2022-{str(month)}-{date}.csv')
        full_df = pd.concat([full_df, test_df])
    full_df['timestamp'] = pd.to_datetime(full_df['timestamp'])
    full_df = full_df.set_index('timestamp')
    return full_df

In [47]:
jul = load_data(7)
aug = load_data(8)
sep = load_data(9, days=30)
full_data = pd.concat([jul, aug, sep])

(5785166, 5)
(5781908, 5)
(1410593, 5)


In [57]:
full_data['available rate'] = full_data['lots_available'] / full_data['total_lots']
grped_full = full_data.groupby(full_data.carpark_number)

For each carpark, we use the first 70% as training data and last 30% as testing data. We will use previous 8*24 hours availability as features to predict the next 24 hours availability.
We slice out the data for each carpark and resample them to fill in missing values. Then we prepare training and testing features and labels.

In [79]:
# helper method to create inputs and outputs from a given dataset
def prep_train_test(dataset, feature_len):
    X = []
    Y = []
    for i in range(feature_len, len(dataset)):
        X.append(dataset['available rate'][i-feature_len:i].values)
        Y.append(dataset['available rate'][i:i+24].values)
    X = np.array(X)
    Y = np.array(Y)
    return X, Y

In [91]:
carparks = grped_full.carpark_number.unique()
trainX, trainY = [], []
testX, testY = [], []
feature_len = 24*8
split = 0.3
print("total ids:", len(carparks))
index = 0
for id in carparks:
    slice = grped_full.get_group(id[0])
    slice = slice.resample('1h').mean()
    if slice.shape[0] != 2208:
        print("Lossy data. Dropped")
        continue
    split_index = int(slice.shape[0] * split)
    X_train1, Y_train1 = prep_train_test(slice[:split_index], feature_len)
    X_test1, Y_test1 = prep_train_test(slice[split_index:], feature_len)
    trainX.extend(X_train1)
    trainY.extend(Y_train1)
    testX.extend(X_test1)
    testY.extend(Y_test1)
    index += 1
    if index % 100 == 0:
        print(index, "done")


total ids: 1966


  slice = slice.resample('1h').mean()
  Y = np.array(Y)


Lossy data. Dropped
100 done
200 done
300 done
400 done
500 done
Lossy data. Dropped
Lossy data. Dropped
600 done
700 done
800 done
900 done
1000 done
1100 done
Lossy data. Dropped
1200 done
1300 done
1400 done
1500 done
1600 done
1700 done
1800 done
1900 done


In [94]:
print("train test length:", len(trainX), len(trainY))
print("input shape:", trainX[0].shape)
print("output shape:", trainY[0].shape)

train test length: 922140 922140
input shape: (192,)
output shape: (24,)


# Model 1: Linear Regression

# Model 2: XXX

# Overall Flow

Suppose we have saved the most recent 8 days data in csv files. We can use the following code to generate a condensed csv file for recent data. The data in the condensed csv file will be used for prediction. Code below shows how to generate the condensed csv file from recent 8 days data.

In [188]:
def generate_recent_csv():
    df = load_data(11, days=8)
    df['available rate'] = df['lots_available'] / df['total_lots']
    grped = df.groupby(df.carpark_number)
    carparks = grped.carpark_number.unique()
    carparks = [carpark[0] for carpark in carparks]
    new_df = pd.DataFrame(columns=carparks)
    for id in carparks:
        slice = grped.get_group(id)
        slice = slice.resample('1h').mean()
        if slice.shape[0] != 24*8:
            print("Lossy data. Dropped")
            print(id)
            continue
        new_df[id] = slice['available rate'].values
    new_df.index = slice.index
    new_df.to_csv("./data/recent.csv")

generate_recent_csv()

  slice = slice.resample('1h').mean()


Lossy data. Dropped
Y49H


In [195]:
from queue import PriorityQueue

def find_nearest_x(x, y, location_dict):
    shortest = PriorityQueue()
    for key, loc in location_dict.items():
        dist = (loc[0] - x)**2 + (loc[1] - y)**2
        shortest.put((-dist, key))
        if shortest.qsize() > 5:
            shortest.get()
    res = [shortest.get()[1] for i in range(5)]
    res.reverse()
    return res

def create_location_dict(loc_info: pd.DataFrame):
    location_dict = {}
    for row in loc_info.iterrows():
        content = row[1]
        name = content[0]
        location = (content[2], content[3])
        location_dict[name] = location
    return location_dict

def model_predict(input):
    return input[-24:].values

In [196]:
loc_info = pd.read_csv("./data/hdb-carpark-information.csv")
recent = pd.read_csv("./data/recent.csv")
carpark_info = create_location_dict(loc_info)
top5 = find_nearest_x(30314.7936, 31490.4942, carpark_info)
for id in top5:
    if id not in recent.columns:
        print(f"one of the nearest carpark {id} is not in the recent data")
        continue
    col = recent[id][-24*8:]
    prediction = model_predict(col)
    print(f"carpark {id} prediction: {prediction}")


carpark ACB prediction: [0.64516129 0.64516129 0.64516129 0.55913978 0.5483871  0.52688172
 0.43010753 0.05376344 0.08602151 0.         0.01075269 0.07526882
 0.02150538 0.03225806 0.03225806 0.03225806 0.03225806 0.03225806
 0.03225806 0.05376344 0.05376344        nan 0.03225806 0.03225806]
carpark CY prediction: [0.35483871 0.35483871 0.35483871 0.16129032 0.09677419 0.16129032
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.                nan 0.         0.        ]
carpark WCB prediction: [0.37062937 0.42657343 0.40559441 0.3986014  0.39160839 0.37762238
 0.37762238 0.29370629 0.3006993  0.13986014 0.00699301 0.00699301
 0.00699301 0.00699301 0.         0.         0.         0.
 0.         0.00699301 0.00699301        nan 0.00699301 0.00699301]
one of the nearest carpark SR2 is not in the recent data
one of the nearest carpark SR1 is not in the recent data
