# Load Sequential Mover Data

In [1]:
import json

# Source file with the per-mover feature sequences.
file_name = 'data/2018_10/space_time_mover_feature.json'

# Read the whole document and parse it into a dict; later cells iterate it
# as mover id -> list of trips, each trip being a list of feature values.
with open(file_name, mode="r", encoding='UTF-8') as src:
    mover_id_seqs_dict = json.loads(src.read())

# Number of top-level entries (movers) that were loaded.
print(len(mover_id_seqs_dict))

83227


In [2]:
# Sanity-check the nesting of the loaded data using only the first mover.
for trips in list(mover_id_seqs_dict.values())[:1]:
    print(len(trips))  # Number of Trips for a mover
    first_trip = trips[0]
    print(len(first_trip))  # Number of feature dimensions for a trip
    print(first_trip[0])  # A feature value

25
34
0.0


# Torch

In [3]:
# CUDA Availability
import torch

# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [4]:
mover_count = len(mover_id_seqs_dict)

# One (mover_id, sequence_length) pair per mover, in dict order for now.
tup_list = [(mover_id, len(seq)) for mover_id, seq in mover_id_seqs_dict.items()]

# Longest sequence over all movers; remains -1 when there are no movers.
max_seq_len = max((seq_len for _, seq_len in tup_list), default=-1)

# Order movers from longest to shortest sequence. The batching cell later
# calls pack_padded_sequence with its default enforce_sorted setting, which
# expects lengths in decreasing order inside each batch.
tup_list = sorted(tup_list, key=lambda pair: pair[1], reverse=True)

In [5]:
# tup_list is sorted by descending length, so its last entry is the shortest sequence.
_shortest_mover_id, shortest_seq_len = tup_list[-1]
print(shortest_seq_len)

4


In [6]:
import math
import random
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence
from torch import optim
from importlib import reload
import seq2seq as ss
reload(ss)

# Seed the RNGs this notebook draws from so that a Restart & Run All gives
# repeatable weight initialisation (torch) and batch shuffling (random).
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Hyperparameters.
input_size = 34       # feature dimensions per trip (matches the data peek above)
output_size = 34      # the decoder is trained to reproduce the input features
hidden_size = 256     # size of the recurrent hidden state / latent vector
mini_batch = 32       # movers per mini-batch
learning_rate = 0.001

# Encoder/decoder come from the local `seq2seq` module; each gets its own
# AdamW optimizer with the same learning rate.
# NOTE(review): constructor semantics are defined in seq2seq.py, not visible here.
encoder = ss.EncoderMSE(input_size, hidden_size).to(device)
decoder = ss.DecoderMSE(input_size, hidden_size, output_size).to(device)
encoder_optimizer = optim.AdamW(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.AdamW(decoder.parameters(), lr=learning_rate)

# Reconstruction loss between decoder output and the original sequence.
criterion = nn.MSELoss()

In [7]:
# Per-mini-batch artefacts, all indexed by batch number.
packed_inputs = []    # encoder inputs as PackedSequence
packed_inputs_d = []  # decoder inputs: same sequences shifted right by one step
batch_sizes = []      # number of movers in each batch
seq_sums = []         # total number of time steps in each batch

batch_count = math.ceil(mover_count / mini_batch)
for batch_idx in range(batch_count):
    # Movers for this batch are a contiguous slice of the length-sorted list;
    # the last batch may be smaller than mini_batch.
    first = batch_idx * mini_batch
    last = min(first + mini_batch, mover_count)
    batch_seqs = [mover_id_seqs_dict[tup_list[pos][0]] for pos in range(first, last)]

    batch_list = [
        torch.tensor(seq, dtype=torch.float, device=device) for seq in batch_seqs
    ]
    seq_len = [len(seq) for seq in batch_seqs]
    seq_sum = sum(seq_len)

    # Encoder input: zero-pad to the longest sequence in the batch, then pack.
    enc_padded = pad_sequence(batch_list, batch_first=True)
    packed_inputs.append(pack_padded_sequence(enc_padded, seq_len, batch_first=True))
    batch_sizes.append(len(seq_len))

    # Decoder input: prepend an all-zero step and drop the final step, so the
    # length of every sequence is unchanged.
    start_step = torch.zeros(1, input_size, device=device)
    d_batch_list = [torch.cat((start_step, seq_t[:-1, :]), 0) for seq_t in batch_list]
    dec_padded = pad_sequence(d_batch_list, batch_first=True)
    packed_inputs_d.append(pack_padded_sequence(dec_padded, seq_len, batch_first=True))
    seq_sums.append(seq_sum)

# All four lists should have one entry per mini-batch.
for per_batch in (packed_inputs, batch_sizes, packed_inputs_d, seq_sums):
    print(len(per_batch))

2601
2601
2601
2601


In [8]:
import time

losses = []   # summed mini-batch loss, one entry per epoch
epoch = 50    # number of passes over all mini-batches

start_time = time.time()

# A later cell calls encoder.eval(); switch back to training mode so that
# re-running this cell after that one does not silently train in eval mode.
encoder.train()

# Minibatch iteration
for ep in range(epoch):

    # Visit the pre-built mini-batches in a fresh random order every epoch.
    indexes = list(range(len(packed_inputs)))
    random.shuffle(indexes)

    ep_loss = 0
    for idx in indexes:

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_input = packed_inputs[idx]
        batch_size = batch_sizes[idx]
        encoder_h0 = torch.zeros(1, batch_size, hidden_size, device=device)

        # Only the encoder's final hidden state is used; it seeds the decoder.
        _, h_n = encoder(encoder_input, encoder_h0)

        decoder_input = packed_inputs_d[idx]
        decoder_h0 = h_n

        decoder_output, _ = decoder(decoder_input, decoder_h0)

        # Reconstruction target: the encoder input, unpacked back to a padded tensor.
        unpacked, unpacked_len = pad_packed_sequence(encoder_input, batch_first=True)

        # Fail loudly on a shape mismatch. A bare `break` here would only leave
        # the inner loop, and the remaining epochs would keep running and log
        # partial losses.
        if decoder_output.shape != unpacked.shape:
            raise RuntimeError(
                'Decoder output shape {} does not match target shape {} (batch {})'.format(
                    tuple(decoder_output.shape), tuple(unpacked.shape), idx
                )
            )

        # NOTE(review): the mean is taken over every element of the padded
        # tensors, so padded time steps also contribute - confirm this is intended.
        loss = criterion(decoder_output, unpacked)
        ep_loss += loss.item()

        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

    # ep_loss is the sum (not the mean) of the per-batch losses for this epoch.
    print('ep: {} / loss: {}'.format(ep, ep_loss))
    losses.append(ep_loss)

elapsed_time = round(time.time() - start_time, 3)
print('Training Time : {}s'.format(elapsed_time))
    
# print('Final Avg :', sum(losses) / len(losses))
# ss.show_plot(losses)

ep: 0 / loss: 94.29853756539524
ep: 1 / loss: 53.90490900259465
ep: 2 / loss: 44.68914312822744
ep: 3 / loss: 39.5065615803469
ep: 4 / loss: 35.89267413970083
ep: 5 / loss: 33.77257702499628
ep: 6 / loss: 31.643703725654632
ep: 7 / loss: 30.03594262432307
ep: 8 / loss: 28.630087205208838
ep: 9 / loss: 27.42208548914641
ep: 10 / loss: 26.421024458832107
ep: 11 / loss: 25.544815358589403
ep: 12 / loss: 24.397146325209178
ep: 13 / loss: 23.574240879737772
ep: 14 / loss: 22.917159031610936
ep: 15 / loss: 22.2184208733961
ep: 16 / loss: 21.47734271886293
ep: 17 / loss: 21.001590630272403
ep: 18 / loss: 20.54298361047404
ep: 19 / loss: 20.23264828941319
ep: 20 / loss: 19.60151431831764
ep: 21 / loss: 19.16377099917736
ep: 22 / loss: 19.157031289010774
ep: 23 / loss: 18.336504135397263
ep: 24 / loss: 18.171917235886212
ep: 25 / loss: 17.786542741116136
ep: 26 / loss: 17.622201663441956
ep: 27 / loss: 17.404232125147246
ep: 28 / loss: 16.9472063651192
ep: 29 / loss: 16.99385205109138
ep: 30 / 

In [8]:
# Movers are encoded one at a time, so the initial hidden state has batch size 1.
encoder_h0 = torch.zeros(1, 1, hidden_size, device=device)
encoder.eval()

m_ids = []             # mover ids, in the same order as `features`
features = []          # final encoder hidden state per mover (numpy arrays)
mover_embed_dict = {}  # mover id -> {'latent': list of floats, 'umap': None for now}

with torch.no_grad():
    for m_id, _seq_len in tup_list:
        m_seq = mover_id_seqs_dict[m_id]

        # Wrap the single sequence as a batch of one and pack it for the encoder.
        single_batch = pad_sequence(
            [torch.tensor(m_seq, dtype=torch.float, device=device)],
            batch_first=True,
        )
        encoder_input = pack_padded_sequence(single_batch, [len(m_seq)], batch_first=True)

        # Flatten the final hidden state into a 1-D vector and move it to host memory.
        _, final_hidden = encoder(encoder_input, encoder_h0)
        latent = final_hidden.view(-1).cpu().numpy()

        m_ids.append(m_id)
        features.append(latent)
        mover_embed_dict[m_id] = {'latent': latent.tolist(), 'umap': None}

In [9]:
# Number of embedded movers, then the id and latent size of the first entry.
print(len(mover_embed_dict))
for first_id, first_entry in list(mover_embed_dict.items())[:1]:
    print(first_id, len(first_entry['latent']))

81256
277596 256


In [11]:
import umap

def run_and_write_umap(nn, random_state=None):
    """Project the encoder embeddings with UMAP and write the result as JSON.

    Reads the notebook globals ``features``, ``m_ids`` and ``mover_embed_dict``.
    Fits UMAP on ``features``, stores each point under
    ``mover_embed_dict[m_id]['umap']`` and writes a ``{mover_id: point}``
    mapping to a file whose name contains ``nn``.

    The ``'latent'`` vectors in ``mover_embed_dict`` are left untouched, so the
    inspection and export cells that read them still work after this function
    has run.

    Args:
        nn: Value passed to UMAP as ``n_neighbors``. Inside this function the
            name shadows the ``torch.nn`` alias, which is not used here.
        random_state: Optional seed forwarded to UMAP for a reproducible
            layout. ``None`` (the default) keeps UMAP's unseeded behaviour.
    """
    # NOTE(review): the output directory (2018_01_2018_05) differs from the
    # directory the input features were loaded from (2018_10) - confirm.
    file_name = 'data/2018_01_2018_05/space_time_mover_umap_ep_50_nn_{}.json'.format(nn)
    print('nn: {} Start'.format(nn))

    reducer = umap.UMAP(
        n_neighbors=nn,
        n_epochs=500,
        random_state=random_state,
    )
    embedding = reducer.fit_transform(features)

    # Row idx of the embedding corresponds to m_ids[idx].
    for idx, m_id in enumerate(m_ids):
        pt = embedding[idx]
        mover_embed_dict[m_id]['umap'] = pt.tolist()

    # Export only the projected points; build a fresh mapping instead of
    # deleting keys from the shared dict.
    new_dict = {key: val['umap'] for key, val in mover_embed_dict.items()}

    with open(file_name, "w", encoding='UTF-8') as write_file:
        json.dump(new_dict, write_file, separators=(',', ':'), indent=4, ensure_ascii=False)

    print('nn: {} Done'.format(nn))

In [12]:
# n_neighbors values to sweep; one output file is written per value.
nn_list = [2, 5, 10, 20, 50, 100]

# The loop variable is deliberately not called `nn`: at cell level that would
# rebind the global `nn` (the torch.nn alias) to an int and break any re-run
# of the cells that call nn.MSELoss().
for n_neighbors in nn_list:
    run_and_write_umap(n_neighbors)


nn: 2 Start


failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initiali

nn: 2 Done
nn: 5 Start
nn: 5 Done
nn: 10 Start
nn: 10 Done
nn: 20 Start
nn: 20 Done
nn: 50 Start
nn: 50 Done
nn: 100 Start
nn: 100 Done


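# Deprecated: one-off UMAP export (n_neighbors=40), kept for reference; `run_and_write_umap` covers this workflow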

In [26]:
import umap

# Single UMAP projection of the encoder embeddings with a fixed neighbourhood size.
reducer = umap.UMAP(n_neighbors=40, n_epochs=500)
embedding = reducer.fit_transform(features)

# One embedded point per input feature vector.
print(len(embedding))

83227


In [27]:
# Row `pos` of the embedding belongs to the mover at m_ids[pos].
for pos in range(len(m_ids)):
    mover_embed_dict[m_ids[pos]]['umap'] = embedding[pos].tolist()

In [28]:
# Entry count, then the id and projected point of the first mover.
print(len(mover_embed_dict))
for first_id, first_entry in list(mover_embed_dict.items())[:1]:
    print(first_id)
    print(first_entry['umap'])

83227
277596
[2.3555071353912354, -2.366137981414795]


In [29]:
import json

# Destination for the combined latent + UMAP export.
file_name = 'data/space_time/space_time_mover_latent_umap_ep_50.json'

# Indented output with compact separators; non-ASCII characters are written as-is.
with open(file_name, mode="w", encoding='UTF-8') as sink:
    json.dump(
        mover_embed_dict,
        sink,
        separators=(',', ':'),
        indent=4,
        ensure_ascii=False,
    )

In [30]:
# Build copies of each entry without the 'latent' field. The shared
# mover_embed_dict is not modified, so this cell can be re-run safely and
# still works when an entry has no 'latent' key.
new_dict = {
    key: {field: value for field, value in val.items() if field != 'latent'}
    for key, val in mover_embed_dict.items()
}

# Entry count, then the id and remaining fields of the first mover.
print(len(new_dict))
for key, val in new_dict.items():
    print(key)
    print(val)
    break

83227
277596
{'umap': [2.3555071353912354, -2.366137981414795]}


In [31]:
import json

# Destination for the UMAP-only export.
file_name = 'data/space_time/space_time_mover_umap_ep_50.json'

# Indented output with compact separators; non-ASCII characters are written as-is.
with open(file_name, mode="w", encoding='UTF-8') as sink:
    json.dump(
        new_dict,
        sink,
        separators=(',', ':'),
        indent=4,
        ensure_ascii=False,
    )