In [5]:
%cd '/content/drive/MyDrive/dev/aiac/TRBAIAC-TRAFFIC'

/content/drive/MyDrive/dev/aiac/TRBAIAC-TRAFFIC


In [2]:
import torch,os
import torch.nn as nn
import torch.utils.data as utils
import torch.nn.functional as F
import numpy as np
import pandas as pd
import time
from torch.autograd import Variable

In [None]:
# Download training, testing and shapefile data
!gdown 'https://drive.google.com/uc?id=1E_qqe7kfvfApM4hCOBMPoXhyEPyrUJkN'
!gdown 'https://drive.google.com/uc?id=1j-3-lHegY--FDHKZvz86HBtilV2dsP-i'
!gdown 'https://drive.google.com/uc?id=1AhAb3yEVxcHSaz0ADlyyYwNH2Gqmr-um'
!unzip 'train.zip' -d '.'
!unzip 'test.zip' -d '.'
!unzip 'Geoshapefile.zip' -d '.'

In [3]:
# Data Preparation
def reshape_data(rawdata):
    reshaped_tps_df = pd.DataFrame()
    reshaped_tps_df['TIME'] = rawdata.time.unique()
    for seg in rawdata.segmentID.unique():
        column = rawdata[rawdata['segmentID'] == seg][['time','TrafficIndex_GP']].drop_duplicates(subset=['time'])
        column.columns = ['TIME', str(seg)]
        reshaped_tps_df = reshaped_tps_df.join(column.set_index('TIME'), on='TIME')

    return reshaped_tps_df

def load_data(filepath, start_time, end_time, freq):
    rawdata = pd.read_pickle(filepath)
    matrix = reshape_data(rawdata)
    matrix['TIME'] = pd.to_datetime(matrix['TIME']).dt.strftime('%Y-%m-%d %H:%M:%S')

    dt_idx = pd.date_range(start=start_time, end=end_time, freq=freq)

    output = pd.DataFrame(dt_idx)
    output.columns = ['TIME']
    output['TIME'] = pd.to_datetime(output['TIME']).dt.strftime('%Y-%m-%d %H:%M:%S')
    output = output.set_index('TIME').join(matrix.set_index('TIME'))

    return output

def prepare_dataloader(matrix, n_col, seq_len=36, pred_len=12, BATCH_SIZE=32, device='cpu'):
    seg = matrix.columns.values
    time = matrix.index.values
    n_seg = len(seg)
    n_time = len(time)
    
    speedMatrix = matrix.to_numpy()
    
    data_set = []
    label_set = []

    for i in range(n_time - seq_len - pred_len):
        data = speedMatrix[i : i + seq_len]
        
        label_data = speedMatrix[i + seq_len: i + seq_len + pred_len, :n_col]
        
        if np.isnan(np.sum(data[:n_col])).any() | np.isnan(np.sum(label_data)):
            pass
        else:

            data_set.append(data)
            label_set.append(label_data)
            
    data = np.array(data_set)
    label = np.array(label_set)

    train_ind = int(len(data)* 0.8)
    valid_ind = int(len(data) * 0.9)
    test_ind = int(len(data) * 1.0)

    X_train = data[: train_ind]
    X_valid = data[train_ind : valid_ind]
    X_test = data[valid_ind : test_ind]
    Y_train = label[: train_ind]
    Y_valid = label[train_ind : valid_ind]
    Y_test = label[valid_ind : test_ind]

    X_train = torch.FloatTensor(X_train).to(device)
    X_valid = torch.FloatTensor(X_valid).to(device)
    X_test = torch.FloatTensor(X_test).to(device)
    Y_train = torch.FloatTensor(Y_train).to(device)
    Y_valid = torch.FloatTensor(Y_valid).to(device)
    Y_test = torch.FloatTensor(Y_test).to(device)

    train_dataset = utils.TensorDataset(X_train, Y_train)
    valid_dataset = utils.TensorDataset(X_valid, Y_valid)
    test_dataset = utils.TensorDataset(X_test, Y_test)

    train_dataloader = utils.DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle=True, drop_last = True)
    valid_dataloader = utils.DataLoader(valid_dataset, batch_size = BATCH_SIZE, shuffle=True, drop_last = True)
    test_dataloader = utils.DataLoader(test_dataset, batch_size = BATCH_SIZE, shuffle=False, drop_last = False)

    return train_dataloader, valid_dataloader, test_dataloader

In [4]:
# Model Implementation
class lstm_encoder(nn.Module):
    ''' Encodes time-series sequence '''

    def __init__(self, input_size, hidden_size, num_layers = 2):
        
        super(lstm_encoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)

    def forward(self, x_input):
        
        use_gpu = torch.cuda.is_available()
        if use_gpu:
            Hidden_State = Variable(torch.zeros(self.num_layers, x_input.size(0), self.hidden_size).cuda())
            Cell_State = Variable(torch.zeros(self.num_layers, x_input.size(0), self.hidden_size).cuda())
        else:
            Hidden_State = Variable(torch.zeros(self.num_layers, x_input.size(0), self.hidden_size))
            Cell_State = Variable(torch.zeros(self.num_layers, x_input.size(0), self.hidden_size))
        
        lstm_out, self.hidden = self.lstm(x_input, (Hidden_State, Cell_State))
        return lstm_out, self.hidden

class lstm_decoder(nn.Module):
    
    def __init__(self, input_size, hidden_size, num_layers = 2):
        
        super(lstm_decoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)       

    def forward(self, x_input, encoder_hidden_states):
        lstm_out, self.hidden = self.lstm(x_input.unsqueeze(1), encoder_hidden_states)      
        return lstm_out.squeeze(1), self.hidden

class LSTM_Seq2Seq(nn.Module):

    def __init__(self, encoder, decoder, target_len):
        super(LSTM_Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.target_len = target_len
        
    def forward(self, inputs):
    
        batch_size = inputs.shape[0]
        output_feature = inputs.shape[2]
        outputs = None
    
        encoder_output, encoder_hidden = self.encoder(inputs)
        decoder_input = inputs[:, -1, :]
        decoder_hidden = encoder_hidden
                
        for t in range(self.target_len): 
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
            if outputs is None:
                outputs = decoder_output.unsqueeze(1)
            else:
                outputs = torch.cat((decoder_output.unsqueeze(1), outputs), 1)
            decoder_input = decoder_output
        return outputs

def TrainModel_LSTM_Seq2Seq(model, train_dataloader, valid_dataloader, learning_rate, num_epochs, min_delta, use_gpu, patience):
    
    loss_MSE = torch.nn.MSELoss()
    optimizer = torch.optim.RMSprop(model.parameters(), lr = learning_rate)
    
    cur_time = time.time()
    pre_time = time.time()

    # Variables for Early Stopping
    is_best_model = 0
    patient_epoch = 0
       
    for epoch in range(num_epochs):

        total_train_loss = 0
        total_valid_loss = 0

        for data in train_dataloader:

            inputs, labels = data
            batch_size = inputs.shape[0]
            
            if inputs.shape[0] != batch_size:
                continue
                
            if use_gpu:
                inputs, labels = Variable(inputs.cuda()), Variable(labels.cuda())
            else:
                inputs, labels = Variable(inputs), Variable(labels)

            model.train()
            outputs = model(inputs)

            loss_train = loss_MSE(outputs, torch.squeeze(labels))
            total_train_loss += loss_train.data

            optimizer.zero_grad()
            loss_train.backward()
            optimizer.step()

        model.eval()

        for data in valid_dataloader:

            inputs, labels = data
            batch_size = inputs.shape[0]
            
            if inputs.shape[0] != batch_size:
                continue
                
            if use_gpu:
                inputs, labels = Variable(inputs.cuda()), Variable(labels.cuda())
            else:
                inputs, labels = Variable(inputs), Variable(labels)
  
            outputs_val= model(inputs)
            loss_valid = loss_MSE(outputs_val, torch.squeeze(labels))
            total_valid_loss += loss_valid.data

        avg_losses_epoch_train = total_train_loss / float(len(train_dataloader))
        avg_losses_epoch_valid = total_valid_loss / float(len(valid_dataloader))

        # Early Stopping
        if epoch == 0:
            is_best_model = 1
            best_model = model
            min_loss_epoch_valid = 10000.0
            if avg_losses_epoch_valid < min_loss_epoch_valid:
                min_loss_epoch_valid = avg_losses_epoch_valid
        else:
            if min_loss_epoch_valid - avg_losses_epoch_valid > min_delta:
                is_best_model = 1
                best_model = model
                min_loss_epoch_valid = avg_losses_epoch_valid
                patient_epoch = 0
            else:
                is_best_model = 0
                patient_epoch += 1
                if patient_epoch >= patience:
                    print('Early Stopped at Epoch:', epoch)
                    break

        cur_time = time.time()
        print('Epoch: {}, train_loss: {}, valid_loss: {}, time: {}, best model: {}'.format( \
                    epoch, \
                    np.around(avg_losses_epoch_train.cpu(), decimals=8),\
                    np.around(avg_losses_epoch_valid.cpu(), decimals=8),\
                    np.around([cur_time - pre_time] , decimals=2),\
                    is_best_model) )
        pre_time = cur_time

    return best_model

In [6]:
BATCH_SIZE = 40
INPUT_LEN = 36
PRED_LEN = 12
LEARNING_RATE = 1e-5
NUM_EPOCHS = 100
MIN_DELTA = 5e-4
PATIENCE = 10

BASELINE_MODEL_INPUT_DIMENSION = 87
BASELINE_MODEL_HIDDEN_DIMENSION = 87
BASELINE_MODEL_OUTPUT_DIMENSION = 87

In [7]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
use_gpu = torch.cuda.is_available()

In [8]:
print(use_gpu)

True


In [9]:
data_matrix = load_data("./tps_df.pkl", "2020-01-01 00:00:00.000", "2020-05-31 23:45:00.000", freq="15min")
input_dim = data_matrix.shape[-1]
train_dataloader, valid_dataloader, _ = prepare_dataloader(data_matrix, input_dim, BATCH_SIZE=BATCH_SIZE, seq_len=INPUT_LEN, pred_len=PRED_LEN, device=device)

In [10]:
encoder = lstm_encoder(input_size=BASELINE_MODEL_INPUT_DIMENSION, hidden_size=BASELINE_MODEL_HIDDEN_DIMENSION)
decoder = lstm_decoder(input_size=BASELINE_MODEL_HIDDEN_DIMENSION, hidden_size=BASELINE_MODEL_OUTPUT_DIMENSION)
seq2seq_model = LSTM_Seq2Seq(encoder, decoder, PRED_LEN).to(device)

In [11]:
seq2seq_best_model = TrainModel_LSTM_Seq2Seq(seq2seq_model, train_dataloader, valid_dataloader, num_epochs=NUM_EPOCHS, 
                                              learning_rate=LEARNING_RATE, 
                                              min_delta=MIN_DELTA, 
                                              use_gpu=use_gpu,
                                              patience=PATIENCE)

Epoch: 0, train_loss: 0.8676244020462036, valid_loss: 0.7941045761108398, time: [2.98], best model: 1
Epoch: 1, train_loss: 0.5174614191055298, valid_loss: 0.33493682742118835, time: [2.87], best model: 1
Epoch: 2, train_loss: 0.21901479363441467, valid_loss: 0.15726694464683533, time: [2.86], best model: 1
Epoch: 3, train_loss: 0.11146687716245651, valid_loss: 0.08382993191480637, time: [2.89], best model: 1
Epoch: 4, train_loss: 0.06320104002952576, valid_loss: 0.0451960414648056, time: [2.91], best model: 1
Epoch: 5, train_loss: 0.038029540330171585, valid_loss: 0.02455786056816578, time: [2.88], best model: 1
Epoch: 6, train_loss: 0.024972179904580116, valid_loss: 0.013418110087513924, time: [3.48], best model: 1
Epoch: 7, train_loss: 0.018265610560774803, valid_loss: 0.007635519839823246, time: [4.35], best model: 1
Epoch: 8, train_loss: 0.015055689960718155, valid_loss: 0.00472575007006526, time: [4.85], best model: 1
Epoch: 9, train_loss: 0.013519249856472015, valid_loss: 0.0034

In [27]:
'''
Step1: read and reshape data
Step2: feed into trained model
Step3: convert output to JSON file
Here we take one testing data as an example to convert it to JSON file.
In this challenge, you should convert all 15 predicting results to one JSON file.
Please check https://colab.research.google.com/drive/1Hkt3kQuh7WzwUTnLgKCcvfPAV1CUK8lF?usp=sharing for more information of the expected result.
'''
test_files = os.listdir('test_data')
out_cols = ['11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22',
       '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34',
       '35', '36', '37', '38', '39', '40', '42', '43', '44', '45', '46', '47',
       '49', '50', '51', '52', '53', '54', '55', '58', '59', '60', '61', '62',
       '63', '64', '65', '66', '67', '68', '69', '70', '73', '74', '75', '76',
       '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88',
       '89', '90', '91', '92', '93', '94', '95', '96', '97', '99', '100',
       '102', '103', '104', '106']

horizon = {'tps_1.pkl':['2020-06-02 06:15:00', '2020-06-02 09:00:00'],
           'tps_2.pkl':['2020-06-03 07:15:00', '2020-06-03 10:00:00'],
           'tps_3.pkl':['2020-06-04 08:15:00', '2020-06-04 11:00:00'],
           'tps_4.pkl':['2020-06-05 09:15:00', '2020-06-05 12:00:00'],
           'tps_5.pkl':['2020-06-06 10:15:00', '2020-06-06 13:00:00'],
           'tps_6.pkl':['2020-06-07 11:15:00', '2020-06-07 14:00:00'],
           'tps_7.pkl':['2020-06-08 12:15:00', '2020-06-08 15:00:00'],
           'tps_8.pkl':['2020-06-09 13:15:00', '2020-06-09 16:00:00'],
           'tps_9.pkl':['2020-06-10 14:15:00', '2020-06-10 17:00:00'],
           'tps_10.pkl':['2020-06-11 15:15:00', '2020-06-11 18:00:00'],
           'tps_11.pkl':['2020-06-12 16:15:00', '2020-06-12 19:00:00'],
           'tps_12.pkl':['2020-06-13 17:15:00', '2020-06-13 20:00:00'],
           'tps_13.pkl':['2020-06-14 18:15:00', '2020-06-14 21:00:00'],
           'tps_14.pkl':['2020-06-15 19:15:00', '2020-06-15 22:00:00'],
           'tps_15.pkl':['2020-06-16 20:15:00', '2020-06-16 23:00:00'],
           }
out_df = pd.DataFrame(columns = out_cols)
for test_file in test_files:
  print (os.path.join('test_data',test_file))
  tps_1_raw = pd.read_pickle(os.path.join('test_data',test_file))
  # print (tps_1_raw.head())
  reshaped_tps_df = reshape_data(tps_1_raw)
  reshaped_tps_df = reshaped_tps_df.set_index('TIME')
  reshaped_tps_value = reshaped_tps_df.values
  reshaped_tps_value = np.expand_dims(reshaped_tps_value, axis=0)
  reshaped_tps_value = torch.from_numpy(reshaped_tps_value).float().to(device)

  seq2seq_out = seq2seq_best_model(reshaped_tps_value).squeeze(0)

  output_1 = pd.DataFrame(seq2seq_out.cpu().detach().numpy())
  # output_1.index = pd.date_range(start='2020-06-02 06:15:00', end='2020-06-02 09:00:00', freq='15min').astype(int) / 10**9
  st = horizon[test_file][0]; et = horizon[test_file][1]
  print (st, et)
  output_1.index = pd.date_range(start=st, end=et, freq='15min').astype(int) / 10**9
  output_1.columns = reshaped_tps_df.columns
  
  out_df = pd.concat([out_df,output_1])

# out_df.reset_index(inplace=True)
out_df.to_json('./traffic_forecasting_result_all.json')

test_data/tps_1.pkl
2020-06-02 06:15:00 2020-06-02 09:00:00
test_data/tps_2.pkl




2020-06-03 07:15:00 2020-06-03 10:00:00
test_data/tps_3.pkl




2020-06-04 08:15:00 2020-06-04 11:00:00
test_data/tps_4.pkl




2020-06-05 09:15:00 2020-06-05 12:00:00
test_data/tps_5.pkl




2020-06-06 10:15:00 2020-06-06 13:00:00
test_data/tps_6.pkl




2020-06-07 11:15:00 2020-06-07 14:00:00
test_data/tps_7.pkl




2020-06-08 12:15:00 2020-06-08 15:00:00
test_data/tps_8.pkl




2020-06-09 13:15:00 2020-06-09 16:00:00
test_data/tps_9.pkl




2020-06-10 14:15:00 2020-06-10 17:00:00
test_data/tps_10.pkl




2020-06-11 15:15:00 2020-06-11 18:00:00
test_data/tps_11.pkl




2020-06-12 16:15:00 2020-06-12 19:00:00
test_data/tps_12.pkl




2020-06-13 17:15:00 2020-06-13 20:00:00
test_data/tps_13.pkl




2020-06-14 18:15:00 2020-06-14 21:00:00
test_data/tps_14.pkl




2020-06-15 19:15:00 2020-06-15 22:00:00
test_data/tps_15.pkl




2020-06-16 20:15:00 2020-06-16 23:00:00


