In [75]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split, TensorDataset
from tqdm import tqdm

In [3]:
df = pd.DataFrame()
files = os.listdir('../EV_data/eved_dataset/data/eVED')
for file in files:
    df_tmp = pd.read_csv('../EV_data/eved_dataset/data/eVED/' + file, low_memory=False)
    df = pd.concat((df,df_tmp),ignore_index=True)

In [104]:
ev_list   = [10,455,541]
phev_list = [9,11,371,379,388,398,417,431,443,449,453,457,492,497,536,537,542,545,550,554,560,561,567,569]
icv_list  = [2,7,8,12,108,110,116,119,120,123,126,128,129,130,131,132,133,135,137,138,139,140,141,142,143,
             145,147,148,149,153,154,155,156,157,159,160,161,162,163,164,165,167,169,172,174,176,179,180,181,
             184,185,187,189,190,191,192,193,195,196,199,200,202,203,205,206,207,208,209,211,213,214,215,216,
             217,218,222,223,225,228,230,232,233,234,235,237,238,240,243,244,246,247,248,249,250,251,252,254,
             255,257,259,260,262,264,265,266,267,268,269,270,271,273,274,275,276,278,280,282,283,285,286,288,
             289,291,293,297,299,300,301,302,303,304,306,307,308,309,311,312,313,315,318,319,321,323,324,325,
             326,329,330,332,333,334,337,340,345,354,356,359,366,367,370,380,394,400,401,409,413,414,415,416,
             426,429,432,433,434,454,459,460,461,462,463,464,465,466,467,469,470,472,473,476,478,480,482,483,
             484,485,486,487,488,489,490,494,498,500,501,502,503,504,505,506,507,516,517,519,521,522,527,528,
             529,530,531,533,534,535,538,539,540,546,547,548,552,557,562,563,571,575,576,577,578,580,581,584,
             587,588,591,592,595,596,597,598,599,600,601,602,603,604,606,607,608,609,616,618,624,625,630]
hev_list   = [5,115,124,125,150,201,212,220,231,241,242,258,272,292,298,328,338,344,346,347,348,349,350,351,353,
              355,357,360,368,369,372,374,375,376,378,381,382,383,384,385,386,387,389,392,393,397,399,402,403,404,
              405,406,407,410,411,418,422,428,430,435,436,437,438,439,440,441,444,445,447,448,450,451,452,456,458,
              468,474,475,477,526,532,543,549,555,558,564,565,566,573,574,579,605,610]

In [150]:
daynum_withoat    = np.unique(df.loc[~pd.isna(df['OAT[DegC]']),"DayNum"].values)
daynum_withoutoat = np.unique(df.loc[pd.isna(df['OAT[DegC]']),"DayNum"].values)
daynum_oat_dict = {}
for daynum_tmp in tqdm(daynum_withoutoat):
    closest = daynum_withoat[np.abs(daynum_withoat  - daynum_tmp).argmin()]
    temperature = df.loc[df['DayNum']==closest,'OAT[DegC]'].values[0]
    daynum_oat_dict[daynum_tmp] = temperature

100%|████████████████████████████████████████████████████████| 14716/14716 [01:40<00:00, 146.20it/s]


In [159]:
df['OAT[DegC]'] = df.apply(lambda x: daynum_oat_dict[x['DayNum']] if x['DayNum'] in daynum_oat_dict else x['OAT[DegC]'],axis=1)

In [162]:
def process_trip(group):
    if group['HV Battery SOC[%]'].max() == 0:
        return pd.DataFrame(columns=group.columns)

    first_nonzero_idx = group[group['HV Battery SOC[%]'] != 0].index.min()

    group = group.loc[first_nonzero_idx:].reset_index(drop=True)

    group['Timestamp(ms)'] -= group['Timestamp(ms)'].iloc[0]

    return group

def check_and_fix_timeidx(df):
    for trip, group in df.groupby('Trip'):
        time_indices = group['Timeidx'].sort_values().values
        
        if time_indices[0] != 0 or any(time_indices[i+1] - time_indices[i] != 1 
                                     for i in range(len(time_indices)-1)):
            
#             print(f"Trip {trip} has discontinuous time indices - fixing...")
            
            expected_seq = range(int(time_indices[-1]) + 1)
            missing = set(expected_seq) - set(time_indices)
            
            for idx in sorted(missing):
                prev_row = group[group['Timeidx'] < idx].iloc[-1] if any(group['Timeidx'] < idx) else None
                next_row = group[group['Timeidx'] > idx].iloc[0] if any(group['Timeidx'] > idx) else None
                if prev_row is not None and next_row is not None:
                    new_row = (prev_row + next_row) / 2
                elif prev_row is not None:
                    new_row = prev_row.copy()
                else:
                    new_row = next_row.copy()
                
                new_row['Timeidx'] = idx
                df = pd.concat([df, new_row.to_frame().T])
    df = df.sort_values(['Trip', 'Timeidx']).reset_index(drop=True)
    return df

In [163]:
time_series_x = []
time_series_y = []

sequence_length = 100 # unit: second, use_defined_parameters
useful_features = ['VehId','Trip','Timestamp(ms)','Latitude[deg]','Longitude[deg]','Vehicle Speed[km/h]',
                  'OAT[DegC]','Elevation Smoothed[m]','Energy_Consumption','Class of Speed Limit']

# for phev and ev

for vehid_tmp in tqdm(phev_list+ev_list):
    df_specific = df.loc[df['VehId']==vehid_tmp,:].copy()
    df_processed = df_specific.groupby('Trip', group_keys=False).apply(process_trip) # only for electric vehicles
    df_processed.sort_values(['Trip','Timestamp(ms)'],inplace=True)
    df_processed = df_processed[useful_features]
    
    df_processed['Timeidx'] = df_processed['Timestamp(ms)'] // 1000
    df_processed.drop_duplicates(subset=['VehId','Trip','Timeidx'],keep='first',inplace=True)
    df_processed = check_and_fix_timeidx(df_processed)

    trip_list = np.unique(df_processed['Trip'].values)
    for trip_tmp in tqdm(trip_list):
        df_tmp = df_processed.loc[df_processed['Trip']== trip_tmp,:].copy()
        df_tmp.reset_index(drop=True,inplace=True)
        total_seconds = df_tmp.shape[0]
        if total_seconds < sequence_length: # discard this trip
            continue
        
        for idx in range(0,total_seconds-sequence_length,60): # two consecutive samples are 60 seconds apart
            df_slice = df_tmp.iloc[idx:idx+sequence_length,:]
            x_slice  = df_slice[['VehId','Vehicle Speed[km/h]','OAT[DegC]','Elevation Smoothed[m]',
                                 'Class of Speed Limit','Latitude[deg]','Longitude[deg]']].values
            y_slice  = df_slice['Energy_Consumption'].sum()*3600*(1/3600) # unit: kWh
            time_series_x.append(x_slice)
            time_series_y.append(y_slice)

  df_processed = df_specific.groupby('Trip', group_keys=False).apply(process_trip) # only for electric vehicles

100%|████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 530.75it/s][A
  df_processed = df_specific.groupby('Trip', group_keys=False).apply(process_trip) # only for electric vehicles
  df_processed = df_specific.groupby('Trip', group_keys=False).apply(process_trip) # only for electric vehicles

  0%|                                                                       | 0/148 [00:00<?, ?it/s][A
100%|████████████████████████████████████████████████████████████| 148/148 [00:00<00:00, 912.50it/s][A
  df_processed = df_specific.groupby('Trip', group_keys=False).apply(process_trip) # only for electric vehicles
  df_processed = df_specific.groupby('Trip', group_keys=False).apply(process_trip) # only for electric vehicles

  0%|                                                                       | 0/386 [00:00<?, ?it/s][A
 13%|███████▉        


100%|█████████████████████████████████████████████████████████████| 83/83 [00:00<00:00, 1071.96it/s][A
  df_processed = df_specific.groupby('Trip', group_keys=False).apply(process_trip) # only for electric vehicles
  df_processed = df_specific.groupby('Trip', group_keys=False).apply(process_trip) # only for electric vehicles

100%|█████████████████████████████████████████████████████████████| 24/24 [00:00<00:00, 1124.28it/s][A
  df_processed = df_specific.groupby('Trip', group_keys=False).apply(process_trip) # only for electric vehicles
  df_processed = df_specific.groupby('Trip', group_keys=False).apply(process_trip) # only for electric vehicles

100%|██████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 914.45it/s][A
  df_processed = df_specific.groupby('Trip', group_keys=False).apply(process_trip) # only for electric vehicles
  df_processed = df_specific.groupby('Trip', group_keys=False).apply(process_trip) # only for electric vehicles

  0%|       


100%|██████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 429.44it/s][A
  df_processed = df_specific.groupby('Trip', group_keys=False).apply(process_trip) # only for electric vehicles
  df_processed = df_specific.groupby('Trip', group_keys=False).apply(process_trip) # only for electric vehicles

100%|█████████████████████████████████████████████████████████████| 85/85 [00:00<00:00, 1510.49it/s][A
  df_processed = df_specific.groupby('Trip', group_keys=False).apply(process_trip) # only for electric vehicles
  df_processed = df_specific.groupby('Trip', group_keys=False).apply(process_trip) # only for electric vehicles

  0%|                                                                       | 0/192 [00:00<?, ?it/s][A
 46%|████████████████████████████▎                                | 89/192 [00:00<00:00, 882.08it/s][A
100%|████████████████████████████████████████████████████████████| 192/192 [00:00<00:00, 826.83it/s][A
  df_processed = df_specific.

 13%|███████▊                                                     | 46/362 [00:00<00:01, 225.83it/s][A
 21%|████████████▊                                                | 76/362 [00:00<00:01, 257.64it/s][A
 28%|████████████████▉                                           | 102/362 [00:00<00:01, 232.55it/s][A
 35%|████████████████████▉                                       | 126/362 [00:00<00:01, 234.54it/s][A
 45%|██████████████████████████▊                                 | 162/362 [00:00<00:00, 274.15it/s][A
 53%|███████████████████████████████▋                            | 191/362 [00:00<00:00, 276.29it/s][A
 60%|████████████████████████████████████▎                       | 219/362 [00:00<00:00, 268.38it/s][A
 68%|████████████████████████████████████████▉                   | 247/362 [00:00<00:00, 247.92it/s][A
 76%|█████████████████████████████████████████████▍              | 274/362 [00:01<00:00, 249.06it/s][A
 83%|█████████████████████████████████████████████████▋         

100%|███████████████████████████████████████████████████████████████| 27/27 [08:41<00:00, 19.31s/it]


In [164]:
time_series_x = np.array(time_series_x) # vehid, speed, temperature, elevation, class of speed limit, latitude, longitude
time_series_y = np.array(time_series_y) # energy consumption for this segment

In [165]:
np.savez('timeseries_dataset_phev_ev.npz', X=time_series_x, y=time_series_y)

In [166]:
time_series_x.shape

(33596, 100, 7)

In [168]:
time_series_x_gas = []
time_series_y_gas = []

sequence_length = 100 # unit: second, use_defined_parameters
useful_features = ['VehId','Trip','Timestamp(ms)','Latitude[deg]','Longitude[deg]','Vehicle Speed[km/h]',
                  'OAT[DegC]','Elevation Smoothed[m]','Energy_Consumption','Class of Speed Limit']

# for icv and hev

for vehid_tmp in tqdm(icv_list+hev_list):
    df_processed = df.loc[df['VehId']==vehid_tmp,:].copy()
    df_processed.sort_values(['Trip','Timestamp(ms)'],inplace=True)
    df_processed = df_processed[useful_features]
    
    df_processed['Timeidx'] = df_processed['Timestamp(ms)'] // 1000
    df_processed.drop_duplicates(subset=['VehId','Trip','Timeidx'],keep='first',inplace=True)
    df_processed = check_and_fix_timeidx(df_processed)

    trip_list = np.unique(df_processed['Trip'].values)
    for trip_tmp in trip_list:
        df_tmp = df_processed.loc[df_processed['Trip']== trip_tmp,:].copy()
        df_tmp.reset_index(drop=True,inplace=True)
        total_seconds = df_tmp.shape[0]
        if total_seconds < sequence_length: # discard this trip
            continue
        
        for idx in range(0,total_seconds-sequence_length,60): # two consecutive samples are 60 seconds apart
            df_slice = df_tmp.iloc[idx:idx+sequence_length,:]
            x_slice  = df_slice[['VehId','Vehicle Speed[km/h]','OAT[DegC]','Elevation Smoothed[m]',
                                 'Class of Speed Limit','Latitude[deg]','Longitude[deg]']].values
            y_slice  = df_slice['Energy_Consumption'].sum()*3600*(1/3600) # unit: kWh
            time_series_x_gas.append(x_slice)
            time_series_y_gas.append(y_slice)

100%|█████████████████████████████████████████████████████████████| 357/357 [48:29<00:00,  8.15s/it]


In [169]:
time_series_x_gas = np.array(time_series_x_gas)
time_series_y_gas = np.array(time_series_y_gas)

time_series_x_gas.shape


(216114, 100, 7)

In [170]:
np.savez('timeseries_dataset_icv_hev.npz', X=time_series_x_gas, y=time_series_y_gas)