In [1]:
"""
This file should handle the raw csv of pkg7 produced by VTD
"""
import pandas as pd
import numpy as np
import csv

In [2]:
"""
All constants defined here, and we only need to modify here in the future, hopefully.
"""


# A
DATA_DIR = "../0809/2_24/"
SCENE = "2_24"
PKG_ID = '20'
FILE_PATH = DATA_DIR + SCENE + "pkg{}.csv".format(PKG_ID)

OUT_DIR = "../processed0809/"
OUTPUT_PATH = OUT_DIR+SCENE+"pkg{}_del.csv".format(PKG_ID)

# B
# We want id, roadDist, value
# which are in col index 2,4,16
index_to_keep = [2,4,16]
PATTERN = 21  # number of entries of one road line, which form a pattern
col_of_one = ["signId", "roadDist", "value"]


# C
BEGIN_IDX = 0+1 # current df has 7 roadlines, 0-6
END_IDX = 14 # we want 14
PLACE_HOLDER = -99.99

In [3]:
with open(FILE_PATH, 'r', encoding="utf-8") as csvfile:
    reader = csv.reader(csvfile)
    MAX_LEN = max(len(_) for _ in reader)
    
MAX_LEN

45

In [4]:
original_col_names = ["temp.{}".format(i) for i in range(MAX_LEN)]
df20 = pd.read_csv(FILE_PATH, names=original_col_names, low_memory=False)        

df20.drop(index=0, axis=0, inplace=True)
df20.index -= 1        
      
df20.tail(10)

Unnamed: 0,temp.0,temp.1,temp.2,temp.3,temp.4,temp.5,temp.6,temp.7,temp.8,temp.9,...,temp.35,temp.36,temp.37,temp.38,temp.39,temp.40,temp.41,temp.42,temp.43,temp.44
5816,58.16999869979919,5818,39,1,59.955657958984375,319.67477426183893,-1.3156363708248011,0.0,-0.0045266817323863515,0.0,...,,,,,,,,,,
5817,58.17999869957566,5819,39,1,60.086238861083984,319.67477426183893,-1.3156363708248011,0.0,-0.0045266817323863515,0.0,...,,,,,,,,,,
5818,58.18999869935215,5820,39,1,60.216827392578125,319.67477426183893,-1.3156363708248011,0.0,-0.0045266817323863515,0.0,...,,,,,,,,,,
5819,58.199998699128635,5821,39,1,60.3474235534668,319.67477426183893,-1.3156363708248011,0.0,-0.0045266817323863515,0.0,...,,,,,,,,,,
5820,58.20999869890511,5822,39,1,60.478023529052734,319.67477426183893,-1.3156363708248011,0.0,-0.0045266817323863515,0.0,...,,,,,,,,,,
5821,58.21999869868159,5823,39,1,60.60863494873047,319.67477426183893,-1.3156363708248011,0.0,-0.0045266817323863515,0.0,...,,,,,,,,,,
5822,58.22999869845808,5824,39,1,60.739253997802734,319.67477426183893,-1.3156363708248011,0.0,-0.0045266817323863515,0.0,...,,,,,,,,,,
5823,58.23999869823456,5825,39,1,60.869876861572266,319.67477426183893,-1.3156363708248011,0.0,-0.0045266817323863515,0.0,...,,,,,,,,,,
5824,58.24999869801104,5826,39,1,61.0005111694336,319.67477426183893,-1.3156363708248011,0.0,-0.0045266817323863515,0.0,...,,,,,,,,,,
5825,58.25999869778752,5827,39,1,61.13115692138672,319.67477426183893,-1.3156363708248011,0.0,-0.0045266817323863515,0.0,...,,,,,,,,,,


In [5]:
rows = df20.shape[0]
num_del = rows%100
df20.drop(labels=range(rows-num_del, rows), axis=0, inplace=True)
df20.shape

(5800, 45)

In [6]:
"""
Other constants which must be defined after reading in df.


Note: when determing number of objs, we use (#cols-2)/PATTERN.
    But quite often, the raw data has an empty column in the end,
    with pandas automatically giving a name "Unnamed.0"
"""
# C
while (len(df20.columns)-2)%PATTERN != 0:
    # del that col
    print("Found an empty col at the end")
    df20.drop(df20.columns[-1], axis=1, inplace=True)
    
    
NUM_OBJS = (len(df20.columns)-2)//PATTERN
BEGIN_IDX = NUM_OBJS # current df has 10 roadlines, 0-9
TARGET = 10 # we want 14
END_IDX = 10

# Check if it is a whole number
NUM_OBJS

Found an empty col at the end


2

In [7]:
"""### Keep those cols we want and check ###"""

# a list of column names to delete
cols_to_del = [name for i, name in enumerate(df20.columns) if i%PATTERN not in index_to_keep and i!=1]
df20 = df20.drop(columns=cols_to_del)

df20.head(10)

Unnamed: 0,temp.1,temp.2,temp.4,temp.16,temp.23,temp.25,temp.37
0,2,38,84.1938247680664,3.5,,,
1,3,38,84.1938247680664,3.5,,,
2,4,38,84.1938247680664,3.5,,,
3,5,38,84.1938247680664,3.5,,,
4,6,38,84.19383239746092,3.5,,,
5,7,38,84.19383239746092,3.5,,,
6,8,38,84.19384002685547,3.5,,,
7,9,38,84.19384002685547,3.5,,,
8,10,38,84.19384765625,3.5,,,
9,11,38,84.19384765625,3.5,,,


In [8]:
"""### do some simple pre-process and check ###"""
df20.fillna(PLACE_HOLDER, inplace=True)
df20.rename(columns={"temp.1":"simFrame"}, inplace=True)
df20.set_index("simFrame", inplace=True)
df20 = df20[~df20.index.duplicated(keep='first')]

"""
# rename col names of first chunk: id to id.0
rename_dict = {name:name+'.0' for name in col_of_one}
df20.rename(columns=rename_dict, inplace=True)

# rename type.x to obj_type.x to distinguish (pkg 7 roadmark also has type)
type_dict = {"type.{}".format(i): "obj_type.{}".format(i) for i in range(NUM_OBJS)}
df20.rename(columns=type_dict, inplace=True)
"""
new_names = ["{}.{}".format(name,i) for i in range(NUM_OBJS) for name in col_of_one]
df20.columns = new_names


print(df20.shape)
df20.head(10)

(5800, 6)


Unnamed: 0_level_0,signId.0,roadDist.0,value.0,signId.1,roadDist.1,value.1
simFrame,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,38,84.1938247680664,3.5,-99.99,-99.99,-99.99
3,38,84.1938247680664,3.5,-99.99,-99.99,-99.99
4,38,84.1938247680664,3.5,-99.99,-99.99,-99.99
5,38,84.1938247680664,3.5,-99.99,-99.99,-99.99
6,38,84.19383239746092,3.5,-99.99,-99.99,-99.99
7,38,84.19383239746092,3.5,-99.99,-99.99,-99.99
8,38,84.19384002685547,3.5,-99.99,-99.99,-99.99
9,38,84.19384002685547,3.5,-99.99,-99.99,-99.99
10,38,84.19384765625,3.5,-99.99,-99.99,-99.99
11,38,84.19384765625,3.5,-99.99,-99.99,-99.99


In [9]:
if list(df20.columns):
    print("HAVE")

HAVE


In [10]:
# Prepare, define in B
id_col = []
roadDist_col = []
value_col = []
row_num = df20.shape[0]    # Should be 4664 this time

if not list(df20.columns):
    # For pkg 20, there could be no valid data at all (empty df with no cols)
    print("Got an empty df")
    id_col = [PLACE_HOLDER]*row_num
    roadDist_col = [PLACE_HOLDER]*row_num
    value_col = [PLACE_HOLDER]*row_num
    
    
else:
    got = False
    # Start to loop
    for row in range(row_num):
        this_row = df20.iloc[row]
        got = False
        for i in range(NUM_OBJS):
            if this_row["signId.{}".format(i)] == 0:
                id_col.append(this_row["signId.{}".format(i)])
                roadDist_col.append(this_row["roadDist.{}".format(i)])
                value_col.append(this_row["value.{}".format(i)])
                got = True
                break
        if not got:
            # Strange thing found: some rows (<100 out of 8000+) have no data of ego car
            id_col.append(PLACE_HOLDER)
            roadDist_col.append(PLACE_HOLDER)
            value_col.append(PLACE_HOLDER)
            # print("row num: ", row)

    # IMPORTANT: since each row must have ego car data, they should match
    print(len(roadDist_col), len(value_col), row_num, len(id_col))

5800 5800 5800 5800


In [11]:
df20_new = pd.DataFrame({
    'simFrame': df20.index,
    "signId": id_col,
    "roadDist": roadDist_col,
    "value": value_col
})
df20_new.set_index('simFrame', inplace=True)
df20_new

Unnamed: 0_level_0,signId,roadDist,value
simFrame,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,-99.99,-99.99,-99.99
3,-99.99,-99.99,-99.99
4,-99.99,-99.99,-99.99
5,-99.99,-99.99,-99.99
6,-99.99,-99.99,-99.99
7,-99.99,-99.99,-99.99
8,-99.99,-99.99,-99.99
9,-99.99,-99.99,-99.99
10,-99.99,-99.99,-99.99
11,-99.99,-99.99,-99.99


In [12]:
# do the check after the sort, there should be no output if it's correct
assert len(df20_new["signId"].value_counts()) <=2, "still has id other than 0 and place_holder"

In [13]:
df20_new.to_csv(OUTPUT_PATH)