In [None]:
import numpy as np
import pandas as pd
import Geohash as gh

# Running this notebook requires the following files and folders:
#     1. "./training.csv"

# Running this notebook outputs the following files and folders:
#     1. "./full_matrix.npy"
#     2. "./train_matrix.npy"
#     3. "./test_matrix.npy"
#     4. "./coords_mapping.csv"

In [None]:
# Load the raw training records; later cells read the 'geohash6', 'day',
# 'timestamp' and 'demand' columns from this frame.
df = pd.read_csv("./training.csv")

In [None]:
# Pre-process the raw records into time-series form.
# One day is split into 24 h x 4 fifteen-minute buckets = 96 buckets, so an
# "H:M" timestamp maps to the bucket index H*4 + M//15.

# Parse every distinct timestamp once (vectorised string split rather than a
# per-row pd.Series), then join the bucket index back onto the records.
ts_df = pd.DataFrame({'timestamp': df['timestamp'].unique()})
hour_minute = ts_df['timestamp'].str.split(':', expand=True)
ts_df['h'] = hour_minute[0].astype(int) * 4
# Floor division keeps the bucket index an integer instead of a float.
ts_df['m'] = hour_minute[1].astype(int) // 15
ts_df['new_ts'] = ts_df['h'] + ts_df['m']
df = df.merge(ts_df.drop(['h', 'm'], axis=1), on='timestamp', how='inner')

# Decode every distinct geohash into latitude / longitude (plus the decoding
# error margins) so that each cell can be given a grid position for the CNN input.
coords_df = pd.DataFrame({'geohash6': df['geohash6'].unique()})
decoded_df = pd.DataFrame(
    [gh.decode_exactly(code) for code in coords_df['geohash6']],
    columns=['lat', 'long', 'lat_err', 'long_err'],
    index=coords_df.index,
).astype('float64')
coords_df = coords_df.join(decoded_df)


def _rank_table(values, value_col, index_col):
    """Map each distinct value to its 0-based position in ascending order.

    Returns a two-column frame [index_col, value_col] suitable for merging
    back onto the frame the values came from.
    """
    ordered = np.sort(values.unique())
    return pd.DataFrame({index_col: np.arange(len(ordered)), value_col: ordered})


map_lat_df = _rank_table(coords_df['lat'], 'lat', 'lat_index')
map_long_df = _rank_table(coords_df['long'], 'long', 'long_index')

coords_df = coords_df.merge(map_lat_df, on='lat', how='inner')
coords_df = coords_df.merge(map_long_df, on='long', how='inner')

# Store coords_df to a CSV file in the current directory so that test data can
# later be mapped to the same lat / long indices.
coords_df.to_csv("./coords_mapping.csv", index=False)

# Attach the grid indices to every record and drop the columns they replace.
df = df.merge(coords_df.drop(columns=['lat', 'long', 'lat_err', 'long_err']), on='geohash6', how='inner')
df = df.drop(columns=['geohash6', 'timestamp'])

In [None]:
# Absolute 15-minute bucket number: 96 buckets for every full day that has
# elapsed before `day`, plus the bucket within the day.
elapsed_buckets = (df['day'] - 1) * 24 * 4 + df['new_ts']
# Shift so the earliest bucket present in the data becomes frame 0.
df['time-series'] = (elapsed_buckets - elapsed_buckets.min()).astype(int)
# The frame index replaces 'day' and 'new_ts'; order rows by frame, then longitude index.
df = (
    df.drop(columns=['day', 'new_ts'])
      .sort_values(by=['time-series', 'long_index'], ascending=True)
)

In [None]:
# Convert the records into a dense tensor of shape
# [n_samples, time-series, longitude, latitude, depth] to feed into ConvLSTM
# for training. Grid cells with no record keep a demand of 0.
import time
t1 = time.time()

n_samples = 1
n_frames = int(df['time-series'].max()) + 1
long = int(df['long_index'].max()) + 1
lat = int(df['lat_index'].max()) + 1

# np.float / np.int were removed from NumPy; use an explicit dtype instead.
input_matrix = np.zeros((n_samples, n_frames, long, lat, 1), dtype=np.float64)
# print(input_matrix.shape)

# Collapse duplicate (frame, longitude, latitude) records to their maximum
# demand in one pass, instead of padding and re-grouping every longitude
# slice inside a Python loop (which also relied on the removed
# DataFrame.append and on a hard-coded latitude size of 46).
cell_demand = (
    df.groupby(['time-series', 'long_index', 'lat_index'], sort=False)['demand']
      .max()
      .reset_index()
)

# Scatter all cells into the tensor at once. np.fmax(., 0) reproduces the
# effect of the zero padding rows used previously: a cell never drops below 0,
# and a cell whose demand is entirely missing becomes 0 rather than NaN.
input_matrix[
    n_samples - 1,
    cell_demand['time-series'].values,
    cell_demand['long_index'].values,
    cell_demand['lat_index'].values,
    0,
] = np.fmax(cell_demand['demand'].values.astype(np.float64), 0.0)

print('Operation completed in ' + str(time.time()-t1))

In [None]:
# Cut the frame axis after a whole number of days and write the complete
# tensor plus the train / test portions to .npy files.
Split_at_Day = 46
Split_index = Split_at_Day*24*4  # 24 h x 4 fifteen-minute frames per day

full_matrix = input_matrix[...]
train_matrix = input_matrix[:, :Split_index]
test_matrix = input_matrix[:, Split_index:]

for out_path, tensor in (
    ('./full_matrix.npy', full_matrix),
    ('./train_matrix.npy', train_matrix),
    ('./test_matrix.npy', test_matrix),
):
    np.save(out_path, tensor)