### In this document, we point out the experimental mistakes in the following previous work: Rodrigues, Filipe, Ioulia Markou, and Francisco C. Pereira. "Combining time-series and textual data for taxi demand prediction in event areas: A deep learning approach." Information Fusion 49 (2019): 120-129.

### The code below is copied from their original experiment: https://github.com/fmpr/Combining-TimeSeries-TextData/blob/master/barclays_fc/run_experiments.py

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import numpy as np
import pandas as pd
import keras
import tensorflow as tf
from datetime import datetime
from sklearn import datasets, linear_model
from matplotlib import pyplot as plt
from matplotlib import cm
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.layers import BatchNormalization, Input, Embedding, Concatenate, Conv1D, MaxPooling1D, Flatten
from keras.layers import merge, Concatenate, Permute, RepeatVector, Reshape
from keras.models import Sequential, Model
import keras.backend as K
import statsmodels.formula.api as smf
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# prevent tensorflow from allocating the entire GPU memory at once
# (allow_growth is set on the session config, and the session below is
# created from that config).
# NOTE(review): no K.set_session(sess) call is visible in this chunk --
# confirm that Keras actually picks up this session.
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)


# ---------------------------------------- GLOBAL PARAMETERS

# Number of lagged observations built per sample (also the number of leading
# rows trimmed from every feature array).
NUM_LAGS = 10

# Positions of the weather features to use.
sel = [0, 2, 4, 5, 7, 8, 9]

# Word-embedding parameters. Values noted upstream as alternatives:
# sequence length 1000, vocabulary size 20000, embedding dimension 300.
MAX_SEQUENCE_LENGTH = 600
MAX_NB_WORDS = 5000
EMBEDDING_DIM = 300


# ---------------------------------------- Load weather data
print("loading weather data...")

# Read the weather table indexed by its "date" column and turn the three
# placeholder readings (99.99, 999.9, 9999.9) into NaN.
df = (
    pd.read_csv("D:/pycharmworkspace/GNN/release_code_and_datasets/central_park_weather.csv")
    .set_index("date")
    .replace(99.99, np.nan)
    .replace(999.9, np.nan)
    .replace(9999.9, np.nan)
)

# A missing snow depth is filled with 0; every other gap is interpolated.
df["snow_depth"] = df["snow_depth"].fillna(0)
df = df.interpolate()

# Standardize each column; the statistics are kept in removed_mean /
# removed_std.
removed_mean = df.mean()
removed_std = df.std()
weather = df.sub(removed_mean).div(removed_std)


# ---------------------------------------- Load events data
print("loading events data...")

# Tab-separated table of events.
events = pd.read_csv("D:/pycharmworkspace/GNN/release_code_and_datasets/barclays_events_preprocessed.tsv", sep="\t")
events.head()  # preview only; has no effect on the data

# Parse the start timestamp, then derive the calendar day as a
# "YYYY-MM-DD" string from it.
events = events.assign(start_time=pd.to_datetime(events["start_time"], format="%Y-%m-%d %H:%M"))
events = events.assign(date=events["start_time"].dt.strftime("%Y-%m-%d"))

# Keep only the columns listed here, in this order.
events = events.loc[:, ["date", "start_time", "title", "url", "description"]]


# ---------------------------------------- Load taxi data (and merge with others and detrend)
print("loading taxi data (and merging and detrending)...")

df = pd.read_csv("D:/pycharmworkspace/GNN/release_code_and_datasets/pickups_barclays_center_0.003.csv")

# Total pickups per date. The raw date value is kept as a column because it
# is what gets compared against events["date"] below; the index itself is
# converted to datetimes so weekday/year can be derived from it.
df_sum = pd.DataFrame(df.groupby("date")["pickups"].sum())
df_sum["date"] = df_sum.index
df_sum.index = pd.to_datetime(df_sum.index, format='%Y-%m-%d %H:%M')
df_sum["dow"] = df_sum.index.weekday

# add events information
event_col = np.zeros((len(df_sum)))          # 1 if the day has any event
late_event = np.zeros((len(df_sum)))         # 1 if any event starts at hour >= 20
really_late_event = np.zeros((len(df_sum)))  # 1 if any event starts at hour >= 21
event_desc_col = []                          # concatenated descriptions, or "None"

# Build the set of event dates once so the per-day membership test is O(1)
# instead of a scan over the whole events table.
event_dates = set(events["date"].values)
for i, day in enumerate(df_sum["date"]):
    if day not in event_dates:
        event_desc_col.append("None")
        continue

    event_col[i] = 1
    day_events = events[events.date == day]  # filter once, reuse for both columns
    # Every description is followed by a single space (including the last one).
    event_desc_col.append("".join(str(e) + " " for e in day_events["description"]))
    for e in day_events["start_time"]:
        if e.hour >= 20:
            late_event[i] = 1
        if e.hour >= 21:
            really_late_event[i] = 1

df_sum["event"] = event_col
df_sum["late_event"] = late_event
df_sum["really_late_event"] = really_late_event
df_sum["event_desc"] = event_desc_col

# "next day" variants: row t receives the value of row t+1 (the last row
# becomes NaN).
df_sum["event_next_day"] = df_sum["event"].shift(-1)
df_sum["late_event_next_day"] = df_sum["late_event"].shift(-1)
df_sum["really_late_event_next_day"] = df_sum["really_late_event"].shift(-1)
df_sum["event_next_day_desc"] = df_sum["event_desc"].shift(-1)

# merge with weather data
df_sum = df_sum.join(weather, how='inner')

# keep only data after 2013
START_YEAR = 2013
# .copy() so the column assignments below write to an independent frame
# rather than to a slice of the joined one.
df_sum = df_sum.loc[df_sum.index.year >= START_YEAR].copy()

df_sum["year"] = df_sum.index.year

# Per-weekday mean of pickups, estimated on rows before 2015 only. Selecting
# "pickups" before aggregating avoids averaging the non-numeric columns.
trend_mean = df_sum[df_sum.index.year < 2015].groupby("dow")["pickups"].mean()

# A single global standard deviation over all remaining rows (a per-year
# variant was left disabled upstream).
# NOTE(review): unlike trend_mean, this is computed over every row of df_sum,
# not only the rows before 2015.
trend_std = df_sum["pickups"].std()

# build vectors with trend to remove and std
trend = [trend_mean[dow] for dow in df_sum["dow"]]
std = [trend_std] * len(df_sum)

df_sum["trend"] = trend
df_sum["std"] = std

# detrend data
df_sum["detrended"] = (df_sum["pickups"] - df_sum["trend"]) / df_sum["std"]


# ---------------------------------------- Build lags and features
print("building lags...")

# Column k holds the detrended series shifted by k rows, k = 0..NUM_LAGS-1.
# `.values` replaces `.as_matrix()`, which newer pandas releases no longer
# provide; both return the same ndarray.
lags = pd.concat([df_sum["detrended"].shift(x) for x in range(NUM_LAGS)], axis=1).values

# Event indicator features ("late_event_next_day" is deliberately left out).
event_feats = df_sum[["event_next_day",
                      "late_event",
                      "really_late_event",
                      "really_late_event_next_day"]].values
lags_event_feats = pd.concat([df_sum["event_next_day"].shift(x) for x in range(NUM_LAGS)], axis=1).values
event_texts = df_sum["event_next_day_desc"].values
weather_feats = df_sum[['min_temp', 'max_temp', 'wind_speed',
       'wind_gust', 'visibility', 'pressure', 'precipitation',
       'snow_depth', 'fog', 'rain_drizzle', 'snow_ice', 'thunder']].values

# Target: the detrended value of the NEXT row (shift(-1)).
preds = df_sum["detrended"].shift(-1).values

# NOTE: `trends` and `stds` are taken from the CURRENT row, while `preds`
# comes from the next row. De-normalising `preds` with these arrays therefore
# adds row t's weekday trend to row t+1's detrended value. This misalignment
# is kept on purpose: reproducing it is what this document is about. The
# aligned version would use df_sum["trend"].shift(-1).values (and likewise for
# "std").
trends = df_sum["trend"].values
stds = df_sum["std"].values

# Drop the first NUM_LAGS rows (the lag columns contain NaN at the start) and
# the last row (its shift(-1) values are NaN).
lags = lags[NUM_LAGS:-1,:]
event_feats = event_feats[NUM_LAGS:-1,:]
lags_event_feats = lags_event_feats[NUM_LAGS:-1,:]
event_texts = event_texts[NUM_LAGS:-1]
weather_feats = weather_feats[NUM_LAGS:-1,:]
preds = preds[NUM_LAGS:-1]
trends = trends[NUM_LAGS:-1]
stds = stds[NUM_LAGS:-1]

Using TensorFlow backend.


loading weather data...
loading events data...
loading taxi data (and merging and detrending)...
building lags...




In [2]:
# ---------------------------------------- Train/test split
print("loading train/val/test split...")

# Row boundaries of the chronological split:
#   training   -> rows [0, i_train)
#   validation -> rows [i_train, i_val)  (90 rows)
#   test       -> rows [i_val, i_test)   (i_test = -1 leaves out the last row)
# NOTE(review): upstream comments label these as "2013 and 2014" and
# "2015 and 2016 (everything else)"; the calendar mapping cannot be verified
# from the code alone.
i_train = 365 * 2 - 90
i_val = 365 * 2
i_test = -1

# time series lags
lags_train = lags[:i_train, :]
lags_val = lags[i_train:i_val, :]
lags_test = lags[i_val:i_test, :]

# event/no_event
event_feats_train = event_feats[:i_train, :]
event_feats_val = event_feats[i_train:i_val, :]
event_feats_test = event_feats[i_val:i_test, :]

# lags for event/no_event
lags_event_feats_train = lags_event_feats[:i_train, :]
lags_event_feats_val = lags_event_feats[i_train:i_val, :]
lags_event_feats_test = lags_event_feats[i_val:i_test, :]

# event text descriptions
event_texts_train = event_texts[:i_train]
event_texts_val = event_texts[i_train:i_val]
event_texts_test = event_texts[i_val:i_test]

# weather data
weather_feats_train = weather_feats[:i_train, :]
weather_feats_val = weather_feats[i_train:i_val, :]
weather_feats_test = weather_feats[i_val:i_test, :]

# target values
y_train = preds[:i_train]
y_val = preds[i_train:i_val]
y_test = preds[i_val:i_test]

# trend / std of the test rows, used to undo the normalisation
trend_test = trends[i_val:i_test]
std_test = stds[i_val:i_test]


loading train/val/test split...


In [3]:
# ---------------------------------------- Evaluation functions

def compute_error(trues, predicted):
    """Compute regression error metrics of ``predicted`` against ``trues``.

    Both arguments may be any array-like of equal length (NumPy arrays,
    lists, tuples); they are converted to float arrays and compared
    position by position.

    Returns:
        A tuple ``(corr, mae, rae, rmse, rrse, mape, r2)``:
        Pearson correlation, mean absolute error, relative absolute error,
        root mean squared error, root relative squared error, mean absolute
        percentage error (in percent) and R^2 clipped below at 0.

    Note:
        ``mape`` divides by ``trues``, so a zero target yields ``inf``/``nan``
        for that metric.
    """
    trues = np.asarray(trues, dtype=float)
    predicted = np.asarray(predicted, dtype=float)

    # Terms shared by several metrics, computed once.
    residuals = predicted - trues
    deviations = trues - np.mean(trues)
    sq_residual_sum = np.sum(residuals ** 2)
    sq_deviation_sum = np.sum(deviations ** 2)

    corr = np.corrcoef(predicted, trues)[0, 1]
    mae = np.mean(np.abs(residuals))
    rae = np.sum(np.abs(residuals)) / np.sum(np.abs(deviations))
    rmse = np.sqrt(np.mean(residuals ** 2))
    rrse = np.sqrt(sq_residual_sum / sq_deviation_sum)
    mape = np.mean(np.abs(residuals / trues)) * 100
    # A negative R^2 (worse than predicting the mean) is reported as 0.
    r2 = max(0, 1 - sq_residual_sum / sq_deviation_sum)
    return corr, mae, rae, rmse, rrse, mape, r2


def compute_error_filtered(trues, predicted, filt):
    """Compute error metrics on the subset of samples selected by ``filt``.

    ``trues`` and ``predicted`` may be any array-like of equal length; they
    are converted to float arrays before ``filt`` is applied to both as a
    NumPy index (for example a boolean mask or a list of positions).

    Returns:
        A tuple ``(corr, mae, mse, rae, rmse, r2)``: Pearson correlation,
        mean absolute error, mean squared error, relative absolute error,
        root mean squared error and R^2 clipped below at 0. Note that this
        tuple differs from the one returned by ``compute_error``.
    """
    trues = np.asarray(trues, dtype=float)[filt]
    predicted = np.asarray(predicted, dtype=float)[filt]

    # Terms shared by several metrics, computed once.
    residuals = predicted - trues
    deviations = trues - np.mean(trues)

    corr = np.corrcoef(predicted, trues)[0, 1]
    mae = np.mean(np.abs(residuals))
    mse = np.mean(residuals ** 2)
    rae = np.sum(np.abs(residuals)) / np.sum(np.abs(deviations))
    rmse = np.sqrt(mse)
    # A negative R^2 (worse than predicting the mean) is reported as 0.
    r2 = max(0, 1 - np.sum(residuals ** 2) / np.sum(deviations ** 2))
    return corr, mae, mse, rae, rmse, r2

In [4]:
# ---------------------------------------- Linear regression baseline (just lags)

print("\nrunning linear regression with just lags...")

# Ordinary least squares fitted on the lag features only.
regr = linear_model.LinearRegression().fit(lags_train, y_train)

# Undo the normalisation (multiply by std, add trend) for both the targets
# and the predictions so the errors are expressed on the pickup scale.
y_true = y_test * std_test + trend_test
preds_lr = regr.predict(lags_test) * std_test + trend_test

corr, mae, rae, rmse, rrse, mape, r2 = compute_error(y_true, preds_lr)
print("MAE:  %.3f\tRMSE: %.3f\tR2:   %.3f" % (mae, rmse, r2))


running linear regression with just lags...
MAE:  119.619	RMSE: 164.510	R2:   0.448


In [5]:
# Display the de-normalised test targets (y_test * std_test + trend_test).
y_true

array([1201.93269231,  678.25274725,  963.78095238,  707.92783883,
        760.98076923, 1242.40384615, 1040.72115385,  916.93269231,
        611.25274725,  594.78095238,  671.92783883,  633.98076923,
        906.40384615,  986.72115385,  647.93269231,  211.25274725,
        588.78095238,  662.92783883, 1217.98076923, 1224.40384615,
       1080.72115385, 1146.93269231,  717.25274725,  628.78095238,
        766.92783883, 1111.98076923, 1018.40384615, 1061.72115385,
        993.93269231,  668.25274725,  761.78095238,  912.92783883,
       1045.98076923, 1289.40384615, 1475.72115385, 1330.93269231,
        733.25274725,  740.78095238, 1154.92783883, 1094.98076923,
       1100.40384615, 1163.72115385, 1018.93269231,  828.25274725,
        784.78095238, 1035.92783883, 1017.98076923, 1180.40384615,
       1038.72115385, 1291.93269231,  757.25274725, 1099.78095238,
        760.92783883, 1127.98076923, 1261.40384615, 1235.72115385,
        907.93269231, 1153.25274725,  815.78095238, 1015.92783

### We can see that the reconstructed ground-truth taxi demand is no longer an integer after preprocessing. This is wrong, because the taxi demand (a count of pickups) is an integer in the original data set.

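### This error is caused by the data normalization process. Specifically, `preds` is not aligned with `trends` and `stds`: `preds` is the detrended series shifted one day ahead (`shift(-1)`), whereas `trends` and `stds` are taken from the current day. The de-normalization `y_test * std_test + trend_test` therefore adds the current day's weekday trend to the next day's detrended value.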

### This error changes the ground-truth taxi demand and therefore affects the values of all evaluation metrics. This error is fatal.

## In our experiments, this error has been corrected, and our method, "Taxi Demand Forecasting Based on the Multi-modal Information Fusion Graph Neural Network", has been proposed.