In [1]:
"""
This file should handle the raw csv of pkg7 produced by VTD
"""
import pandas as pd
import numpy as np
import csv

In [26]:
"""
All constants independent of df shape defined here.
"""


# A -- where the raw pkg file is read from and where the result is written
DATA_DIR = "../raw_data/0810/0810_2_24/"
SCENE = "2_24"
PKG_ID = '7'
FILE_PATH = "{}{}pkg{}.csv".format(DATA_DIR, SCENE, PKG_ID)

OUT_DIR = "../processed/processed_0810/"
OUTPUT_PATH = "{}{}pkg{}_del.csv".format(OUT_DIR, SCENE, PKG_ID)

# B -- column layout of the raw file
# The fields we want per road line are id, type, color and lateralDist,
# found at raw column index 3, 16, 17 and 6.
# index_to_keep is compared against (column position % PATTERN) when pruning,
# so 16 and 17 appear here as 0 and 1.
PATTERN = 16  # number of entries of one road line, which form a pattern
index_to_keep = [0, 1, 3, 6]
# Names given to the kept fields of one road line, in left-to-right column order.
col_of_one = ["id", "lateralDist", "type", "color"]

# Value used to fill missing cells and empty slots.
PLACE_HOLDER = -99.99

In [27]:
# Scan the raw file once to find the largest number of fields on any line;
# the next cell uses it to generate enough column names for read_csv.
# newline='' is how the csv module expects file objects to be opened, so that
# line endings inside quoted fields are handled by the reader itself.
with open(FILE_PATH, 'r', encoding="utf-8", newline='') as csvfile:
    MAX_LEN = max(len(record) for record in csv.reader(csvfile))

MAX_LEN

131

In [28]:
# One placeholder name "temp.<i>" per possible field, so every line fits.
original_col_names = []
for i in range(MAX_LEN):
    original_col_names.append("temp.{}".format(i))

df7 = pd.read_csv(FILE_PATH, names=original_col_names, low_memory=False)

# Discard the row labelled 0 (presumably the file's own header line, read as
# data because explicit names were supplied) and shift the labels back to 0.
df7 = df7.drop(index=0)
df7.index = df7.index - 1
df7.columns = df7.columns.str.strip()

df7.head(10)

Unnamed: 0,temp.0,temp.1,temp.2,temp.3,temp.4,temp.5,temp.6,temp.7,temp.8,temp.9,...,temp.121,temp.122,temp.123,temp.124,temp.125,temp.126,temp.127,temp.128,temp.129,temp.130
0,0.009999999776482582,2,1,0,-1,-1,1.7746027708053589,-0.0045266817323863515,-4.167760873222489e-07,4.919840096968035e-10,...,,,,,,,,,,
1,0.019999999552965164,3,1,0,-1,-1,1.7746027708053589,-0.0045266817323863515,-4.1677608867960153e-07,4.919840266374626e-10,...,,,,,,,,,,
2,0.029999999329447743,4,1,0,-1,-1,1.7746027708053589,-0.0045266817323863515,-4.1677609044268685e-07,4.91984066342132e-10,...,,,,,,,,,,
3,0.03999999910593033,5,1,0,-1,-1,1.7746027708053589,-0.0045266817323863515,-4.167760924993877e-07,4.919841092231748e-10,...,,,,,,,,,,
4,0.04999999888241291,6,1,0,-1,-1,1.7746027708053589,-0.0045266817323863515,-4.1677609470579954e-07,4.919841743388328e-10,...,,,,,,,,,,
5,0.059999998658895486,7,1,0,-1,-1,1.7746027708053589,-0.0045266817323863515,-4.167760969023586e-07,4.919841865149314e-10,...,,,,,,,,,,
6,0.06999999843537807,8,1,0,-1,-1,1.7746027708053589,-0.0045266817323863515,-4.167760989301745e-07,4.919842521599848e-10,...,,,,,,,,,,
7,0.07999999821186066,9,1,0,-1,-1,1.7746027708053589,-0.0045266817323863515,-4.16776100645319e-07,4.919842960998188e-10,...,,,,,,,,,,
8,0.08999999798834325,10,1,0,-1,-1,1.7746027708053589,-0.0045266817323863515,-4.1677610192980733e-07,4.919843379220706e-10,...,,,,,,,,,,
9,0.09999999776482582,11,1,0,-1,-1,1.7746027708053589,-0.0045266817323863515,-4.167761026989012e-07,4.919843638624547e-10,...,,,,,,,,,,


In [29]:
# Cut off the trailing rows so that the row count is a multiple of 100.
rows = len(df7)
num_del = rows % 100
df7 = df7.drop(index=range(rows - num_del, rows))
df7.shape

(4800, 131)

In [30]:
"""
Other constants which must be defined after reading in df.


Note: when determining the number of objs, we use (#cols-2)/PATTERN.
    But quite often, the raw data has an empty column at the end,
    which pandas would automatically name "Unnamed.0"
"""
# C
# Besides the two leading columns, the frame must consist of whole
# PATTERN-wide chunks. Drop trailing columns one at a time until it does.
while (len(df7.columns) - 2) % PATTERN != 0:
    print("Found an empty col at the end")
    df7 = df7.drop(columns=df7.columns[-1])

# Number of road-line chunks actually present in the raw data.
NUM_OBJS = (len(df7.columns) - 2) // PATTERN

# Number of road-line slots the processed frame is padded to.
TARGET = 14

# Fail early with a clear message: if the data already holds more chunks than
# TARGET, the later padding/sorting cells would break with a length mismatch.
assert NUM_OBJS <= TARGET, \
    "raw data holds {} road-line chunks but TARGET is only {}; enlarge TARGET".format(NUM_OBJS, TARGET)

# Padding adds slots BEGIN_IDX .. END_IDX-1 on top of the existing ones.
BEGIN_IDX = NUM_OBJS
END_IDX = TARGET

# Check if it is a whole number
NUM_OBJS

Found an empty col at the end


8

In [31]:
"""### Keep those cols we want and check ###"""

# Collect the names of all columns to discard: a column survives when its
# position modulo PATTERN is listed in index_to_keep (position 1 always survives).
cols_to_del = []
for i, name in enumerate(df7.columns):
    wanted = (i % PATTERN in index_to_keep) or i == 1
    if not wanted:
        cols_to_del.append(name)

df7 = df7.drop(columns=cols_to_del)

df7.head(10)


Unnamed: 0,temp.0,temp.1,temp.3,temp.6,temp.16,temp.17,temp.19,temp.22,temp.32,temp.33,...,temp.96,temp.97,temp.99,temp.102,temp.112,temp.113,temp.115,temp.118,temp.128,temp.129
0,0.009999999776482582,2,0,1.7746027708053589,2,1,2,-1.6003972291946411,1,1,...,,,,,,,,,,
1,0.019999999552965164,3,0,1.7746027708053589,2,1,2,-1.6003972291946411,1,1,...,,,,,,,,,,
2,0.029999999329447743,4,0,1.7746027708053589,2,1,2,-1.6003972291946411,1,1,...,,,,,,,,,,
3,0.03999999910593033,5,0,1.7746027708053589,2,1,2,-1.6003972291946411,1,1,...,,,,,,,,,,
4,0.04999999888241291,6,0,1.7746027708053589,2,1,2,-1.6003972291946411,1,1,...,,,,,,,,,,
5,0.059999998658895486,7,0,1.7746027708053589,2,1,2,-1.6003972291946411,1,1,...,,,,,,,,,,
6,0.06999999843537807,8,0,1.7746027708053589,2,1,2,-1.6003972291946411,1,1,...,,,,,,,,,,
7,0.07999999821186066,9,0,1.7746027708053589,2,1,2,-1.6003972291946411,1,1,...,,,,,,,,,,
8,0.08999999798834325,10,0,1.7746027708053589,2,1,2,-1.6003972291946411,1,1,...,,,,,,,,,,
9,0.09999999776482582,11,0,1.7746027708053589,2,1,2,-1.6003972291946411,1,1,...,,,,,,,,,,


In [32]:
"""### do some simple pre-process and check ###"""


# Fill missing cells with the placeholder, then index the frame by the
# integer frame number, keeping only the first row of any repeated frame.
df7 = df7.fillna(PLACE_HOLDER).rename(columns={"temp.1": "simFrame"})
df7["simFrame"] = df7["simFrame"].astype(np.int64)
df7 = df7.set_index("simFrame")
is_repeat = df7.index.duplicated(keep='first')
df7 = df7[~is_repeat]

# (An earlier draft renamed the columns through rename-dicts, including
#  type.x -> roadmark_type.x; it was disabled in favour of the positional
#  relabelling below.)

# Relabel positionally: simTime first, then "<field>.<slot>" for every slot.
new_names = ['simTime']
for i in range(NUM_OBJS):
    for name in col_of_one:
        new_names.append("{}.{}".format(name, i))
df7.columns = new_names

# Turn id values that are still text into ints and show what each id column
# holds before sorting.
id_field = col_of_one[0]
for i in range(NUM_OBJS):
    id_col = "{}.{}".format(id_field, i)
    df7[id_col] = df7[id_col].apply(lambda x: int(x.strip()) if type(x)==str else x)
    print(df7[id_col].value_counts())

# Append empty columns to match the designated number of slots reserved.
#
# Note: sometimes (possibly often in complex scenarios) more distinct objs
# appear over the whole recording than any single row, and hence the df, holds.
# For example, 12 cars may appear in the scenario while at any one time there
# are never more than 10, so the df only has 10 chunks of columns.
#
# So the frame is enlarged to TARGET capacity before sorting, sacrificing
# some runtime for standardization.
for i in range(BEGIN_IDX, END_IDX):
    for name in col_of_one:
        df7["{}.{}".format(name, i)] = PLACE_HOLDER

NUM_OBJS = END_IDX

print(df7.shape)
df7.head(10)

0    4800
Name: id.0, dtype: int64
2    2954
1    1846
Name: id.1, dtype: int64
3    1972
2    1846
4     861
6     121
Name: id.2, dtype: int64
 6.00     1861
 3.00     1846
-99.99     982
 4.00      111
Name: id.3, dtype: int64
 4.00     1846
-99.99    1843
 7.00     1111
Name: id.4, dtype: int64
-99.99    4048
 6.00      539
 5.00      213
Name: id.5, dtype: int64
-99.99    4048
 7.00      539
 8.00      213
Name: id.6, dtype: int64
-99.99    4728
 9.00       72
Name: id.7, dtype: int64
(4800, 57)


Unnamed: 0_level_0,simTime,id.0,lateralDist.0,type.0,color.0,id.1,lateralDist.1,type.1,color.1,id.2,...,type.11,color.11,id.12,lateralDist.12,type.12,color.12,id.13,lateralDist.13,type.13,color.13
simFrame,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.009999999776482582,0,1.7746027708053589,2,1,2,-1.6003972291946411,1,1,4,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
3,0.019999999552965164,0,1.7746027708053589,2,1,2,-1.6003972291946411,1,1,4,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
4,0.029999999329447743,0,1.7746027708053589,2,1,2,-1.6003972291946411,1,1,4,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
5,0.03999999910593033,0,1.7746027708053589,2,1,2,-1.6003972291946411,1,1,4,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
6,0.04999999888241291,0,1.7746027708053589,2,1,2,-1.6003972291946411,1,1,4,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
7,0.059999998658895486,0,1.7746027708053589,2,1,2,-1.6003972291946411,1,1,4,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
8,0.06999999843537807,0,1.7746027708053589,2,1,2,-1.6003972291946411,1,1,4,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
9,0.07999999821186066,0,1.7746027708053589,2,1,2,-1.6003972291946411,1,1,4,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
10,0.08999999798834325,0,1.7746027708053589,2,1,2,-1.6003972291946411,1,1,4,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
11,0.09999999776482582,0,1.7746027708053589,2,1,2,-1.6003972291946411,1,1,4,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99


In [33]:
"""
Use sorting algorithm in pkg9 again

Special thing this time:
    Because df7 has simTime col at the start, we need to skip it when doing search and sort
    added [1:] when defining old_row and when put new_row back to df7
"""


# Re-order every row so that the chunk whose id is k ends up in slot k;
# slots with no matching id are filled with PLACE_HOLDER.
row_nums = df7.shape[0]   # number of rows to process
jump = len(col_of_one)    # number of fields per road line
dropped_ids = set()       # ids too large to fit in any slot

for row in range(row_nums):
    # Plain list of the row's values without the leading simTime column, so
    # every access below is purely positional (integer keys on a
    # string-labelled Series are deprecated in pandas).
    old_row = df7.iloc[row, 1:].tolist()
    # Independent placeholder chunk per slot (no shared inner list).
    new_row = [[PLACE_HOLDER] * jump for slot in range(NUM_OBJS)]

    for idx in range(0, NUM_OBJS * jump, jump):
        objId = old_row[idx]
        if objId == PLACE_HOLDER or objId < 0:
            # Empty chunk (id is the placeholder) or a negative id: nothing to place.
            continue
        if objId >= NUM_OBJS:
            # No slot exists for this id; remember it so it can be reported.
            dropped_ids.add(objId)
            continue
        # id k goes to slot k.
        new_row[int(objId)] = old_row[idx:idx + jump]

    # Flatten the per-slot chunks and write them back after simTime.
    new_row = [item for chunk in new_row for item in chunk]
    df7.iloc[row, 1:] = new_row

if dropped_ids:
    print("Warning: ids {} are >= NUM_OBJS ({}) and were left out; "
          "TARGET may need to be enlarged".format(sorted(dropped_ids), NUM_OBJS))

df7

Unnamed: 0_level_0,simTime,id.0,lateralDist.0,type.0,color.0,id.1,lateralDist.1,type.1,color.1,id.2,...,type.11,color.11,id.12,lateralDist.12,type.12,color.12,id.13,lateralDist.13,type.13,color.13
simFrame,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,+9.9999997764825821e-03,0,+1.7746027708053589e+00,2,1,-99.99,-99.99,-99.99,-99.99,2.0,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
3,+1.9999999552965164e-02,0,+1.7746027708053589e+00,2,1,-99.99,-99.99,-99.99,-99.99,2.0,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
4,+2.9999999329447746e-02,0,+1.7746027708053589e+00,2,1,-99.99,-99.99,-99.99,-99.99,2.0,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
5,+3.9999999105930328e-02,0,+1.7746027708053589e+00,2,1,-99.99,-99.99,-99.99,-99.99,2.0,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
6,+4.9999998882412910e-02,0,+1.7746027708053589e+00,2,1,-99.99,-99.99,-99.99,-99.99,2.0,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4797,+4.7959998928010464e+01,0,+2.0382518768310547e+00,2,1,1.00,+2.0048882961273193e+00,2,1,2.0,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
4798,+4.7969998927786946e+01,0,+2.0379824638366699e+00,2,1,1.00,+2.0045416355133057e+00,2,1,2.0,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
4799,+4.7979998927563429e+01,0,+2.0377101898193359e+00,2,1,1.00,+2.0041928291320801e+00,2,1,2.0,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
4800,+4.7989998927339911e+01,0,+2.0374360084533691e+00,2,1,1.00,+2.0038423538208008e+00,2,1,2.0,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99


In [34]:
# Visual check after the sort: print the value counts of every slot's id
# column. Each is expected to show at most one id plus the placeholder.
id_field = col_of_one[0]
for i in range(END_IDX):
    id_col = "{}.{}".format(id_field, i)
    print(df7[id_col].value_counts())

0    4800
Name: id.0, dtype: int64
-99.99    2954
 1.00     1846
Name: id.1, dtype: int64
2.0    4800
Name: id.2, dtype: int64
 3.00     3818
-99.99     982
Name: id.3, dtype: int64
 4.00     2818
-99.99    1982
Name: id.4, dtype: int64
-99.99    4587
 5.00      213
Name: id.5, dtype: int64
 6.00     2521
-99.99    2279
Name: id.6, dtype: int64
-99.99    3150
 7.00     1650
Name: id.7, dtype: int64
-99.99    4587
 8.00      213
Name: id.8, dtype: int64
-99.99    4728
 9.00       72
Name: id.9, dtype: int64
-99.99    4800
Name: id.10, dtype: int64
-99.99    4800
Name: id.11, dtype: int64
-99.99    4800
Name: id.12, dtype: int64
-99.99    4800
Name: id.13, dtype: int64


In [35]:
# Automated check after the sort; there is no output if it is correct.
# Slot i may hold only id i and/or the placeholder. Counting distinct values
# alone would also accept two different real ids, or a single wrong one.
for i in range(END_IDX):
    id_col = "{}.{}".format(col_of_one[0], i)
    stray_ids = [v for v in df7[id_col].unique() if v != PLACE_HOLDER and v != i]
    assert not stray_ids, "id.{} col still has mixed id, need to further enlarge TARGET {}".format(i, TARGET)

In [36]:
# Final labels use "<field>_<slot>" (underscore) instead of "<field>.<slot>".
new_names = ['simTime']
for i in range(NUM_OBJS):
    new_names.extend("{}_{}".format(name, i) for name in col_of_one)
df7.columns = new_names
df7.columns

Index(['simTime', 'id_0', 'lateralDist_0', 'type_0', 'color_0', 'id_1',
       'lateralDist_1', 'type_1', 'color_1', 'id_2', 'lateralDist_2', 'type_2',
       'color_2', 'id_3', 'lateralDist_3', 'type_3', 'color_3', 'id_4',
       'lateralDist_4', 'type_4', 'color_4', 'id_5', 'lateralDist_5', 'type_5',
       'color_5', 'id_6', 'lateralDist_6', 'type_6', 'color_6', 'id_7',
       'lateralDist_7', 'type_7', 'color_7', 'id_8', 'lateralDist_8', 'type_8',
       'color_8', 'id_9', 'lateralDist_9', 'type_9', 'color_9', 'id_10',
       'lateralDist_10', 'type_10', 'color_10', 'id_11', 'lateralDist_11',
       'type_11', 'color_11', 'id_12', 'lateralDist_12', 'type_12', 'color_12',
       'id_13', 'lateralDist_13', 'type_13', 'color_13'],
      dtype='object')

In [37]:
# Write the processed frame to OUTPUT_PATH with to_csv's default settings.
df7.to_csv(path_or_buf=OUTPUT_PATH)

In [None]:
# Select the input files from the file system
# Fully automate the intermediate steps
# Pass final_csv to Windows for evaluation