In [1]:
import numpy as np
import torch
import pandas as pd
import urllib.request
import zipfile

from utils.data_utils import get_full_df

from torch_geometric.utils import dense_to_sparse
from torch_geometric_temporal.signal import StaticGraphTemporalSignal

pd.options.display.max_columns = 50

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
SEED = 117
TOTAL_DAYS = 400
START_POI = 0
END_POI = 400
TRAIN_RATIO = 0.7
VALID_RATIO = 0.2
TEST_RATIO = 1 - (TRAIN_RATIO + VALID_RATIO)
DATASET = 'Houston'
START_DATE = '2018-12-31'
WINDOW_SIZE = 24
HORIZON = 6

NUM_NODES = END_POI - START_POI

csv_path = f'/home/users/arash/datasets/safegraph/weekly_patterns_2019-01-07_2020-06-08_{DATASET}.csv'
poi_info_csv_path = '/home/users/arash/datasets/safegraph/core_poi_info_2019-01-07_2020-06-08.csv'


In [3]:
df = get_full_df(csv_path_weekly=csv_path, 
                         poi_info_csv_path=poi_info_csv_path, 
                         start_row=START_POI, end_row=END_POI, 
                         total_days=TOTAL_DAYS,
                         city=DATASET)

core_poi-part2.csv
core_poi-part5.csv
core_poi-part4.csv
core_poi-part3.csv
core_poi-part1.csv


In [4]:
df

Unnamed: 0,safegraph_place_id,visits_by_day,visits_by_each_hour,raw_visit_counts,location_name,street_address,city,region,postal_code,iso_country_code,safegraph_brand_ids,brands,parent_safegraph_place_id,top_category,sub_category,naics_code,latitude,longitude,phone_number,open_hours,category_tags
0,sg:4cc165ff43ec4ce29e9dbe0732267ab1,"[6140, 6399, 6752, 7324, 7255, 5874, 7496, 700...","[83, 49, 41, 60, 187, 301, 161, 151, 257, 112,...",3574654,George Bush Intercontinental Airport,2800 N Terminal Rd,Houston,TX,77032,US,,,,Support Activities for Air Transportation,Other Airport Operations,488119.0,29.981382,-95.322839,,,
1,sg:e8af4e248bbf41549aaec725d038ee42,"[2553, 2568, 2729, 2939, 2940, 2541, 3215, 290...","[21, 23, 21, 14, 52, 108, 53, 56, 130, 38, 119...",1427339,American Express Centurion Lounge,2800 N Terminal Rd Terminal D,Houston,TX,77032,US,,,sg:f4a955def8ca49fd87153af82a237245,Gasoline Stations,Gasoline Stations with Convenience Stores,447110.0,29.987829,-95.334916,,"{ ""Mon"": [[""5:30"", ""21:00""]], ""Tue"": [[""5:30"",...",
2,sg:01cebfb757224fbd8151ee6ac6b0d679,"[2122, 2538, 2558, 2641, 2911, 3741, 2484, 245...","[6, 5, 12, 4, 7, 20, 31, 40, 50, 58, 64, 97, 8...",1357595,Simon mall,5085 Westheimer Rd,Houston,TX,77056,US,SG_BRAND_0a3c99595c9d3fddfece9c4e7607e5b3,Simon mall,,Lessors of Real Estate,Malls,531120.0,29.738954,-95.463803,,"{ ""Mon"": [[""10:00"", ""19:00""]], ""Tue"": [[""10:00...",
3,sg:73e44ffdbcb24363bf01b6158373b9f3,"[2400, 2404, 2536, 2762, 2746, 2367, 3016, 268...","[15, 18, 16, 13, 47, 97, 49, 54, 123, 37, 114,...",1329978,George Bush Intercontinental Airport Terminal E,3950 S Terminal Terminal E,Houston,TX,77066,US,,,sg:7dea7a58e8424b22ba0d0b96bc1b6cc9,Traveler Accommodation,Hotels (except Casino Hotels) and Motels,721110.0,29.986670,-95.337942,,,
4,sg:360d88ef2ced4be180ea24290b9f9df4,"[2173, 2256, 2383, 2704, 2610, 1822, 2612, 248...","[36, 9, 11, 20, 71, 108, 63, 92, 44, 83, 68, 6...",1222177,William P Hobby Airport,7800 Airport Blvd,Houston,TX,77061,US,,,,Support Activities for Air Transportation,Other Airport Operations,488119.0,29.646145,-95.277014,1.713640e+10,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,sg:b65354230b3842c8b23144a0997a62d7,"[7, 204, 158, 162, 178, 74, 3, 165, 184, 185, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, ...",48299,Ulrich Int,10103 Spring Cypress Rd,Houston,TX,77070,US,,,,Elementary and Secondary Schools,Elementary and Secondary Schools,611110.0,30.019021,-95.572301,1.832376e+10,,
396,sg:d8e741af0be848d899ac3d7e8dd382ca,"[88, 109, 121, 111, 115, 147, 137, 118, 121, 1...","[0, 0, 1, 0, 0, 0, 1, 1, 1, 4, 2, 1, 1, 12, 6,...",48077,West U Marketplace,4004 Bellaire Blvd,Houston,TX,77025,US,,,,Lessors of Real Estate,Malls,531120.0,29.706636,-95.441673,,,
397,sg:18b3905a242544279285792988b7e502,"[78, 89, 72, 94, 92, 138, 121, 85, 79, 97, 93,...","[0, 0, 0, 0, 0, 0, 2, 0, 3, 2, 5, 3, 1, 7, 9, ...",48007,Target,300 Meyerland Plaza Mall,Houston,TX,77096,US,SG_BRAND_42aefbae01d2dfd981f7da7d823d689e,Target,sg:2a7741c069e84897b44a806033428829,Department Stores,Department Stores,452210.0,29.687158,-95.462103,,"{ ""Mon"": [[""8:00"", ""23:00""]], ""Tue"": [[""8:00"",...",
398,sg:96ff5ed1e94843658944043e3843ebf5,"[75, 90, 84, 91, 88, 85, 85, 84, 64, 71, 80, 8...","[1, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 2, 4, 7, 9, ...",47828,Kroger,12222 Jones Rd,Houston,TX,77070,US,SG_BRAND_1f852a23da4b7250,Kroger,sg:5982c58cfc3b4c9f82a9be4e29517b06,Grocery Stores,Supermarkets and Other Grocery (except Conveni...,445110.0,29.951781,-95.582386,1.832913e+10,"{ ""Mon"": [[""6:00"", ""24:00""]], ""Tue"": [[""6:00"",...",


In [5]:
def get_distances(coords):
    num_points = coords.shape[0]
    distances = np.array([[np.linalg.norm(i-j) for j in coords] for i in coords])
    return distances

In [6]:
def gaussian_kern(arr, thres=1):
    res = arr.copy()
    res[res<=thres] = np.exp(-(res[res<=thres]**2)/(np.nanstd(arr)**2))
    res[np.isnan(arr)] = 0
    res[arr>thres] = 0
    return res

In [7]:
coords = df[['latitude', 'longitude']].to_numpy()
distances = get_distances(coords)
thres = np.nanmean(distances)
adj_mat = gaussian_kern(distances, thres=thres)
adj_mat[0]

array([1.        , 0.99945533, 0.        , 0.99925591, 0.        ,
       0.        , 0.        , 0.9977724 , 0.        , 0.99956618,
       0.99843019, 0.        , 0.        , 0.        , 0.        ,
       0.8613359 , 0.        , 0.        , 0.        , 0.8339492 ,
       0.        , 0.96435911, 0.        , 0.86798093, 0.        ,
       0.        , 0.        , 0.86685492, 0.        , 0.        ,
       0.85893167, 0.        , 0.84394822, 0.        , 0.        ,
       0.85217214, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.85525912, 0.        , 0.        , 0.85106528,
       0.        , 0.87478512, 0.86664219, 0.        , 0.        ,
       0.        , 0.        , 0.86514126, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.92617109, 0.84202387, 0.        , 0.83853297, 0.        ,
       0.        , 0.90306411, 0.85677883, 0.86148734, 0.        ,
       0.        , 0.        , 0.        , 0.91258465, 0.     

In [8]:
data= pd.DataFrame(df["visits_by_each_hour"].to_list()).T

days = int(data.shape[0] / 24)

train_days = int(TRAIN_RATIO * days)
valid_days = int(days*VALID_RATIO)
test_days = days-train_days-valid_days

train_data = data[:train_days*24].to_numpy()
valid_data = data[train_days*24:(train_days + valid_days)*24].to_numpy()
test_data = data[(train_days + valid_days)*24:(train_days + valid_days+test_days)*24].to_numpy()

In [9]:
def get_unsqueezed(arr):
    time_points = arr.shape[0]
    nodes_num = arr.shape[-1]
    res = arr.copy()
    res = np.expand_dims(arr, 2)
    res = res.reshape((nodes_num, 1, time_points))
    return res

In [10]:
def z_normalize(arr):
    means = np.mean(arr, axis=(0, 2))
    res = arr - means.reshape(1, -1, 1)
    stds = np.std(res, axis=(0, 2))
    res = res / stds.reshape(1, -1, 1)
    return res

In [11]:
train_data, train_data.shape

(array([[ 83,  21,   6, ...,   0,   1,   0],
        [ 49,  23,   5, ...,   0,   0,   0],
        [ 41,  21,  12, ...,   0,   0,   0],
        ...,
        [174,  73,  22, ...,   0,   0,   0],
        [111,  56,  22, ...,   0,   1,   0],
        [131,  25,   9, ...,   0,   0,   0]]),
 (6720, 400))

In [12]:
td = z_normalize(get_unsqueezed(train_data))
td

array([[[ 2.16648252,  0.29337531, -0.1597958 , ..., -0.06916158,
         -0.00873876, -0.25043002]],

       [[-0.03895017, -0.12958439,  0.05168405, ..., -0.06916158,
         -0.25043002, -0.09937298]],

       [[-0.22021861, -0.31085283, -0.22021861, ..., -0.25043002,
         -0.31085283, -0.31085283]],

       ...,

       [[-0.34106424, -0.22021861, -0.28064142, ..., -0.03895017,
          0.32358671,  0.11210687]],

       [[-0.06916158, -0.03895017, -0.1597958 , ...,  0.56527797,
         -0.12958439, -0.1900072 ]],

       [[-0.34106424, -0.28064142, -0.31085283, ..., -0.34106424,
         -0.34106424, -0.34106424]]])

In [13]:
A = torch.from_numpy(adj_mat)
X = torch.from_numpy(td)
A.shape, X.shape

(torch.Size([400, 400]), torch.Size([400, 1, 6720]))

In [14]:
edge_indices, values = dense_to_sparse(A)
edge_indices, values

(tensor([[  0,   0,   0,  ..., 399, 399, 399],
         [  0,   1,   3,  ..., 397, 398, 399]]),
 tensor([1.0000, 0.9995, 0.9993,  ..., 0.9181, 0.9225, 1.0000],
        dtype=torch.float64))

In [15]:
edges = edge_indices.numpy()
edge_weights = values.numpy()

### Generate observations

In [16]:
num_timesteps_in = WINDOW_SIZE
num_timesteps_out = HORIZON

In [17]:
indices = [
    (i, i + (num_timesteps_in + num_timesteps_out))
    for i in range(X.shape[2] - (num_timesteps_in + num_timesteps_out) + 1)
]
print('len:', len(indices))
print(indices[:5])
print('...')
print(indices[-2:])

len: 6691
[(0, 30), (1, 31), (2, 32), (3, 33), (4, 34)]
...
[(6689, 6719), (6690, 6720)]


In [18]:
features, target = [], []
for i, j in indices:
    features.append((X[:, :, i : i + num_timesteps_in]).numpy())
    target.append((X[:, 0, i + num_timesteps_in : j]).numpy())
len(features)

6691

In [19]:
dataset = StaticGraphTemporalSignal(
    edges, edge_weights, features, target
)

In [20]:
dataset

<torch_geometric_temporal.signal.static_graph_temporal_signal.StaticGraphTemporalSignal at 0x7f190f8c38e0>

In [21]:
dataset_iter = iter(dataset)
next(dataset_iter)

Data(x=[400, 1, 24], edge_index=[2, 100638], edge_attr=[100638], y=[400, 6])

In [22]:
[bucket.x for bucket in list(dataset)[:3]]

[tensor([[[ 2.1665,  0.2934, -0.1598,  ..., -0.2202, -0.1598, -0.1900]],
 
         [[-0.0390, -0.1296,  0.0517,  ..., -0.2202, -0.1296, -0.3109]],
 
         [[-0.2202, -0.3109, -0.2202,  ..., -0.1296, -0.3411, -0.0692]],
 
         ...,
 
         [[-0.3411, -0.2202, -0.2806,  ..., -0.2202,  0.0819, -0.3109]],
 
         [[-0.0692, -0.0390, -0.1598,  ..., -0.2504,  0.2632,  0.1423]],
 
         [[-0.3411, -0.2806, -0.3109,  ..., -0.3411, -0.2504, -0.2504]]]),
 tensor([[[ 0.2934, -0.1598,  0.1121,  ..., -0.1598, -0.1900, -0.3411]],
 
         [[-0.1296,  0.0517, -0.1900,  ..., -0.1296, -0.3109, -0.0390]],
 
         [[-0.3109, -0.2202,  0.7163,  ..., -0.3411, -0.0692, -0.0087]],
 
         ...,
 
         [[-0.2202, -0.2806, -0.1900,  ...,  0.0819, -0.3109, -0.3109]],
 
         [[-0.0390, -0.1598, -0.1598,  ...,  0.2632,  0.1423, -0.2806]],
 
         [[-0.2806, -0.3109, -0.2504,  ..., -0.2504, -0.2504, -0.3411]]]),
 tensor([[[-0.1598,  0.1121,  0.7465,  ..., -0.1900, -0.3411, -0.310

In [23]:
from torch_geometric_temporal.signal import temporal_signal_split


### Testing modesl

In [24]:
import torch
import torch.nn.functional as F
from torch_geometric_temporal.nn.recurrent import A3TGCN

class A3TGCN_Temporal(torch.nn.Module):
    def __init__(self, node_features, periods):
        super(A3TGCN_Temporal, self).__init__()
        # Attention Temporal Graph Convolutional Cell
        self.tgnn = A3TGCN(in_channels=node_features, 
                           out_channels=128, 
                           periods=periods)
        # Equals single-shot prediction
        self.linear = torch.nn.Linear(128, periods)

    def forward(self, x, edge_index):
        """
        x = Node features for T time steps
        edge_index = Graph edge indices
        """
        h = self.tgnn(x, edge_index)
        h = F.relu(h)
        h = self.linear(h)
        return h

In [25]:
A3TGCN_Temporal(node_features=1, periods=HORIZON)

A3TGCN_Temporal(
  (tgnn): A3TGCN(
    (_base_tgcn): TGCN(
      (conv_z): GCNConv(1, 128)
      (linear_z): Linear(in_features=256, out_features=128, bias=True)
      (conv_r): GCNConv(1, 128)
      (linear_r): Linear(in_features=256, out_features=128, bias=True)
      (conv_h): GCNConv(1, 128)
      (linear_h): Linear(in_features=256, out_features=128, bias=True)
    )
  )
  (linear): Linear(in_features=128, out_features=6, bias=True)
)

In [27]:
# GPU support
device = torch.device('cpu') # cuda


# Create model and optimizers
model = A3TGCN_Temporal(node_features=1, periods=HORIZON).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
model.train()

print("Running training...")
for epoch in range(20): 
    loss = 0
    step = 0
    for snapshot in iter(dataset):
        snapshot = snapshot.to(device)
        # Get model predictions
        y_hat = model(snapshot.x, snapshot.edge_index)
        # Mean squared error
        loss = loss + torch.mean((y_hat-snapshot.y)**2) 
        step += 1


    loss = loss / (step + 1)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    print("Epoch {} train MSE: {:.4f}".format(epoch, loss.item()))

Running training...


KeyboardInterrupt: 