## 0. Import Library

In [1]:
import torch.utils.data as utils
import torch.nn.functional as F
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn.parameter import Parameter
import numpy as np
import pandas as pd
import math
import time

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
print(torch.__version__)

2.1.2+cpu


## Load Example Dataset

In [4]:
import pandas as pd

# Specify the file path
file_path = 'inrx_data.txt'

# Read data from the text file into a pandas DataFrame
df = pd.read_csv(file_path, delimiter='\t')  

# Print or use the resulting DataFrame
df

Unnamed: 0,stamp,SegmentID,CabName,DayOfWeek,Direction,mpdirection,InrxSpeed
0,2010-01-01 00:00:00.000,114+04179,005es14630,6,d,,65
1,2010-01-01 00:05:00.000,114+04179,005es14630,6,d,,65
2,2010-01-01 00:10:00.000,114+04179,005es14630,6,d,,65
3,2010-01-01 00:15:00.000,114+04179,005es14630,6,d,,65
4,2010-01-01 00:20:00.000,114+04179,005es14630,6,d,,65
...,...,...,...,...,...,...,...
995,2010-01-01 07:35:00.000,114+04183,005es15348,6,d,,65
996,2010-01-01 07:40:00.000,114+04183,005es15348,6,d,,65
997,2010-01-01 07:45:00.000,114+04183,005es15348,6,d,,65
998,2010-01-01 07:50:00.000,114+04183,005es15348,6,d,,65


In [26]:
# filter only segment 114+04179
data179 = df[df['SegmentID']=='114+04179']
speed_matrix = data179[['InrxSpeed','DayOfWeek']]

## 1. Prepare Dataset

In [33]:
def PrepareDataset(speed_matrix, BATCH_SIZE = 40, seq_len = 10, pred_len = 1, train_propotion = 0.7, valid_propotion = 0.2):
    """ Prepare training and testing datasets and dataloaders.
    
    Convert speed/volume/occupancy matrix to training and testing dataset. 
    The vertical axis of speed_matrix is the time axis and the horizontal axis 
    is the spatial axis.
    
    Args:
        speed_matrix: a Matrix containing spatial-temporal speed data for a network
        seq_len: length of input sequence
        pred_len: length of predicted sequence
    Returns:
        Training dataloader
        Testing dataloader
    """
    # KUKUU: speed_matrix.shape[0] = number of rows
    time_len = speed_matrix.shape[0]
    
    # Normalization of Speed Matrix
    max_speed = speed_matrix.max().max()
    speed_matrix =  speed_matrix / max_speed
    
    # Sequence Generation
    speed_sequences, speed_labels = [], []
    for i in range(time_len - seq_len - pred_len):
        speed_sequences.append(speed_matrix.iloc[i:i+seq_len].values)
        speed_labels.append(speed_matrix.iloc[i+seq_len:i+seq_len+pred_len].values)
    speed_sequences, speed_labels = np.asarray(speed_sequences), np.asarray(speed_labels)
    print(speed_sequences)
    print(speed_labels)
    
    # shuffle and split the dataset to training and testing datasets
    sample_size = speed_sequences.shape[0]
    index = np.arange(sample_size, dtype = int)
    np.random.shuffle(index)
    
    train_index = int(np.floor(sample_size * train_propotion))
    valid_index = int(np.floor(sample_size * ( train_propotion + valid_propotion)))
    
    train_data, train_label = speed_sequences[:train_index], speed_labels[:train_index]
    valid_data, valid_label = speed_sequences[train_index:valid_index], speed_labels[train_index:valid_index]
    test_data, test_label = speed_sequences[valid_index:], speed_labels[valid_index:]
    
    # Conversion to PyTorch Tensors
    train_data, train_label = torch.Tensor(train_data), torch.Tensor(train_label)
    valid_data, valid_label = torch.Tensor(valid_data), torch.Tensor(valid_label)
    test_data, test_label = torch.Tensor(test_data), torch.Tensor(test_label)
    
    train_dataset = utils.TensorDataset(train_data, train_label)
    valid_dataset = utils.TensorDataset(valid_data, valid_label)
    test_dataset = utils.TensorDataset(test_data, test_label)
    
    # Dataloader Creation
    train_dataloader = utils.DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle=True, drop_last = True)
    valid_dataloader = utils.DataLoader(valid_dataset, batch_size = BATCH_SIZE, shuffle=True, drop_last = True)
    test_dataloader = utils.DataLoader(test_dataset, batch_size = BATCH_SIZE, shuffle=True, drop_last = True)
    
    return train_dataloader, valid_dataloader, test_dataloader, max_speed

In [None]:
"""
if __name__ == "__main__":
#     data = 'inrix'
    data = 'loop'
    directory = '../../Data_Warehouse/Data_network_traffic/'
    # KUKUU: inrix is sensor company name
    if data == 'inrix':
        # KUKUU: read file object 
        speed_matrix =  pd.read_pickle( directory + 'inrix_seattle_speed_matrix_2012')
        A = np.load(directory + 'INRIX_Seattle_2012_A.npy')
        # KUKUU: .npy is binary file, this is to check if free flow speed is reach? 
        FFR_5min = np.load(directory + 'INRIX_Seattle_2012_reachability_free_flow_5min.npy')
        FFR_10min = np.load(directory + 'INRIX_Seattle_2012_reachability_free_flow_10min.npy')
        FFR_15min = np.load(directory + 'INRIX_Seattle_2012_reachability_free_flow_15min.npy')
        FFR_20min = np.load(directory + 'INRIX_Seattle_2012_reachability_free_flow_20min.npy')
        FFR_25min = np.load(directory + 'INRIX_Seattle_2012_reachability_free_flow_25min.npy')
        FFR = [FFR_5min, FFR_10min, FFR_15min, FFR_20min, FFR_25min]
    elif data == 'loop':
        speed_matrix =  pd.read_pickle( directory + 'speed_matrix_2015')
        A = np.load( directory + 'Loop_Seattle_2015_A.npy')
        FFR_5min = np.load( directory + 'Loop_Seattle_2015_reachability_free_flow_5min.npy')
        FFR_10min = np.load( directory + 'Loop_Seattle_2015_reachability_free_flow_10min.npy')
        FFR_15min = np.load( directory + 'Loop_Seattle_2015_reachability_free_flow_15min.npy')
        FFR_20min = np.load( directory + 'Loop_Seattle_2015_reachability_free_flow_20min.npy')
        FFR_25min = np.load( directory + 'Loop_Seattle_2015_reachability_free_flow_25min.npy')
        FFR = [FFR_5min, FFR_10min, FFR_15min, FFR_20min, FFR_25min]
"""

In [35]:
train_dataloader, valid_dataloader, test_dataloader, max_speed = PrepareDataset(speed_matrix)

[[[1.         0.09230769]
  [1.         0.09230769]
  [1.         0.09230769]
  ...
  [1.         0.09230769]
  [1.         0.09230769]
  [1.         0.09230769]]

 [[1.         0.09230769]
  [1.         0.09230769]
  [1.         0.09230769]
  ...
  [1.         0.09230769]
  [1.         0.09230769]
  [0.83076923 0.09230769]]

 [[1.         0.09230769]
  [1.         0.09230769]
  [1.         0.09230769]
  ...
  [1.         0.09230769]
  [0.83076923 0.09230769]
  [0.83076923 0.09230769]]

 ...

 [[1.         0.09230769]
  [1.         0.09230769]
  [1.         0.09230769]
  ...
  [1.         0.09230769]
  [1.         0.09230769]
  [1.         0.09230769]]

 [[1.         0.09230769]
  [1.         0.09230769]
  [1.         0.09230769]
  ...
  [1.         0.09230769]
  [1.         0.09230769]
  [1.         0.09230769]]

 [[1.         0.09230769]
  [1.         0.09230769]
  [1.         0.09230769]
  ...
  [1.         0.09230769]
  [1.         0.09230769]
  [1.         0.09230769]]]
[[[0.83076

In [34]:
inputs, labels = next(iter(train_dataloader))
[batch_size, step_size, fea_size] = inputs.size()
input_dim = fea_size
hidden_dim = fea_size
output_dim = fea_size

ValueError: not enough values to unpack (expected 3, got 2)

In [20]:
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x23d9678cac0>

In [21]:
valid_dataloader

<torch.utils.data.dataloader.DataLoader at 0x23d9678c760>