In [1]:
"""
This file should handle the raw csv of pkg10 produced by VTD
"""
import pandas as pd
import numpy as np
import csv

In [2]:
"""
All constants defined here, and we only need to modify here in the future, hopefully.
"""


# A -- where the raw file lives and where the processed file goes
DATA_DIR = "../0809/2_24/"
SCENE = "2_24"
PKG_ID = '10'
FILE_PATH = "{}{}pkg{}.csv".format(DATA_DIR, SCENE, PKG_ID)

OUT_DIR = "../processed0809/"
OUTPUT_PATH = "{}{}pkg{}_del.csv".format(OUT_DIR, SCENE, PKG_ID)

# B -- what to keep from the raw file
# Only lightMask and steering of the ego car (main car, id=1) are really needed,
# plus simFrame, which serves as index and merge key.
# NUM_CARS is only referenced by the disabled cells at the end of the notebook.
NUM_CARS = 4 + 1

# Each object occupies PATTERN consecutive raw columns. A raw column i (i >= 2) is
# kept when i % PATTERN is listed in index_to_keep; with the values below the three
# kept columns of every group are, in order, playerId, lightMask and steering.
PATTERN = 4  # number of entries of one obj, which form a pattern
index_to_keep = [2, 3, 0]
col_of_one = ["playerId", "lightMask", "steering"]

# Sentinel written wherever a value is missing or an object slot is unused.
PLACE_HOLDER = -99.99

In [3]:
# Scan the raw file once to find the number of fields in its widest record.
# MAX_LEN is used in the next cell to generate one column name per field.
# newline='' is what the csv module documents for files handed to csv.reader,
# so that line endings inside quoted fields are interpreted by the reader itself.
with open(FILE_PATH, 'r', encoding="utf-8", newline='') as csvfile:
    reader = csv.reader(csvfile)
    MAX_LEN = max(len(record) for record in reader)

MAX_LEN

15

In [4]:
# One generic name ("temp.0", "temp.1", ...) per field of the widest record.
original_col_names = ["temp.%d" % i for i in range(MAX_LEN)]
df10 = pd.read_csv(FILE_PATH, names=original_col_names, low_memory=False)

# Discard row 0 (presumably the file's own header line, which is read as data
# because explicit names are supplied -- TODO confirm) and shift the labels so
# that the remaining rows are numbered from 0 again.
df10 = df10.drop(index=0)
df10.index = df10.index - 1
df10.columns = [col.strip() for col in df10.columns]

df10.tail(10)

Unnamed: 0,temp.0,temp.1,temp.2,temp.3,temp.4,temp.5,temp.6,temp.7,temp.8,temp.9,temp.10,temp.11,temp.12,temp.13,temp.14
5811,58.11999870091677,5813,2,0x400,-5.64500514883548e-05,0.0,3,0x400,0.0005064042052254081,0.0,1,0,0.0,0.0,
5812,58.12999870069325,5814,2,0x400,-5.6468099501216784e-05,0.0,3,0x400,0.0005536981043405831,0.0,1,0,0.0,0.0,
5813,58.139998700469725,5815,2,0x400,-5.648614387609996e-05,0.0,3,0x400,0.0005992371588945389,0.0,1,0,0.0,0.0,
5814,58.149998700246215,5816,2,0x400,-5.6504188250983134e-05,0.0,3,0x400,0.0006430678186006844,0.0,1,0,0.0,0.0,
5815,58.1599987000227,5817,2,0x400,-5.652223626384512e-05,0.0,3,0x400,0.000685235601849854,0.0,1,0,0.0,0.0,
5816,58.16999869979919,5818,2,0x400,-5.654028063872829e-05,0.0,3,0x400,0.0007257848628796637,0.0,1,0,0.0,0.0,
5817,58.17999869957566,5819,2,0x400,-5.655832865159027e-05,0.0,3,0x400,0.0007647589663974941,0.0,1,0,0.0,0.0,
5818,58.18999869935215,5820,2,0x400,-5.657637302647345e-05,0.0,3,0x400,0.0008022002293728291,0.0,1,0,0.0,0.0,
5819,58.199998699128635,5821,2,0x400,-5.659442103933543e-05,0.0,3,0x400,0.0008381499210372566,0.0,1,0,0.0,0.0,
5820,58.20999869890511,5822,2,0x400,-5.661246541421861e-05,0.0,3,0x400,0.0008726483210921286,0.0,1,0,0.0,,


In [5]:
# Cut off the tail so that the number of rows is an exact multiple of 100.
# Rows are removed by label, which works because the labels run 0..rows-1 here.
rows = len(df10)
num_del = rows % 100
df10 = df10.drop(index=range(rows - num_del, rows))
df10.shape

(5800, 15)

In [6]:
"""
Other constants which must be defined after reading in df.


Note: when determining the number of objs, we use (#cols-2)/PATTERN.
    But quite often the raw data has an empty column at the end
    (the kind pandas would automatically name like "Unnamed: N"), so it is dropped first.
"""
# C
# Drop trailing columns until the columns after the first two split evenly
# into groups of PATTERN (one group per object).
while (len(df10.columns) - 2) % PATTERN != 0:
    print("Found an empty col at the end")
    df10 = df10.drop(columns=df10.columns[-1])


NUM_OBJS = (len(df10.columns) - 2) // PATTERN   # object groups present in the raw file
BEGIN_IDX = NUM_OBJS                            # first object slot that has to be padded
TARGET = 10 # we want 10
END_IDX = TARGET                                # padding stops here; follows TARGET so both stay in sync

# Later cells pad the frame up to END_IDX objects and then treat END_IDX as the
# object count, so a raw file with more objects than TARGET cannot be handled.
assert NUM_OBJS <= TARGET, "raw file holds {} objects per row, need to enlarge TARGET {}".format(NUM_OBJS, TARGET)

# Show how many objects were detected per row
NUM_OBJS

Found an empty col at the end


3

In [7]:
def map_light(x):
    """Collapse a raw lightMask value into a small integer code.

    Only the second-to-last character of a string value is inspected:
    "2" gives 1 and "4" gives 2. Anything else -- a different character,
    a string shorter than two characters, or a non-string value -- gives 0.
    """
    if type(x) is not str or len(x) < 2:
        return 0
    return {"2": 1, "4": 2}.get(x[-2], 0)


In [8]:
"""### Keep those cols we want and check ###"""

# Collect the names of the raw columns to throw away:
#   * column 0 always goes;
#   * column 1 always stays (it is renamed to simFrame in the next cell);
#   * any other column stays only if its position inside the PATTERN-sized
#     group is listed in index_to_keep.
cols_to_del = []
for i, name in enumerate(df10.columns):
    is_frame_col = (i == 1)
    is_wanted_field = (i % PATTERN in index_to_keep)
    if i == 0 or not (is_wanted_field or is_frame_col):
        cols_to_del.append(name)

df10 = df10.drop(columns=cols_to_del)
df10.head(10)

Unnamed: 0,temp.1,temp.2,temp.3,temp.4,temp.6,temp.7,temp.8,temp.10,temp.11,temp.12
0,2,2,0x400,3.273796392022632e-05,3,0x400,6.280963134486228e-06,1,0,0.0
1,3,2,0x400,9.472153760725632e-05,3,0x400,1.7716480215312913e-05,1,0,0.0
2,4,2,0x400,9.470872464589775e-05,3,0x400,1.7714050045469776e-05,1,0,0.0
3,5,2,0x400,9.46905929595232e-05,3,0x400,1.7710608517518267e-05,1,0,0.0
4,6,2,0x400,9.466714254813267e-05,3,0x400,1.770615745044779e-05,1,0,0.0
5,7,2,0x400,9.46383734117262e-05,3,0x400,1.7700696844258346e-05,1,0,0.0
6,8,2,0x400,9.460430010221897e-05,3,0x400,1.7694228517939337e-05,1,0,0.0
7,9,2,0x400,9.456491534365341e-05,3,0x400,1.7686754290480167e-05,1,0,0.0
8,10,2,0x400,9.452024823985994e-05,3,0x400,1.7678274161880836e-05,1,0,0.0
9,11,2,0x400,9.447029151488096e-05,3,0x400,1.766879177012015e-05,1,0,0.0


In [9]:
"""### do some simple pre-process and check ###"""
# Fill gaps with the sentinel, then promote the frame counter (temp.1) to an
# integer index.
df10 = df10.fillna(PLACE_HOLDER).rename(columns={"temp.1": "simFrame"})
df10["simFrame"] = df10["simFrame"].astype(np.int64)
df10 = df10.set_index("simFrame")
# Keep only the first record of every frame. The explicit copy() makes the
# column assignments below act on an independent frame instead of a slice
# (avoids SettingWithCopyWarning).
df10 = df10[~df10.index.duplicated(keep='first')].copy()

# The remaining columns are NUM_OBJS groups of (playerId, lightMask, steering).
new_names = ["{}.{}".format(name, i) for i in range(NUM_OBJS) for name in col_of_one]
df10.columns = new_names


def _parse_player_id(raw):
    """Turn one raw playerId cell into an int.

    Strings are stripped first; a blank string becomes PLACE_HOLDER instead of
    crashing int(''). Non-string values are returned unchanged.
    """
    if type(raw) == str:
        raw = raw.strip()
        return int(raw) if raw else PLACE_HOLDER
    return raw


for i in range(NUM_OBJS):
    id_col = "{}.{}".format(col_of_one[0], i)
    df10[id_col] = df10[id_col].apply(_parse_player_id)
    print(df10[id_col].value_counts())

# map lightMask
for i in range(NUM_OBJS):
    light_col = "lightMask.{}".format(i)
    df10[light_col] = df10[light_col].apply(map_light)

# Append placeholder-only column groups so that every row has END_IDX object slots.
# Same work, detail explained in pkg7 file
for i in range(BEGIN_IDX, END_IDX):
    for name in col_of_one:
        df10[name + '.' + str(i)] = PLACE_HOLDER


# From here on the frame holds END_IDX object slots per row.
NUM_OBJS = END_IDX

print(df10.shape)
df10.head(10)

2    5800
Name: playerId.0, dtype: int64
3    5800
Name: playerId.1, dtype: int64
1    5800
Name: playerId.2, dtype: int64
(5800, 30)


Unnamed: 0_level_0,playerId.0,lightMask.0,steering.0,playerId.1,lightMask.1,steering.1,playerId.2,lightMask.2,steering.2,playerId.3,...,steering.6,playerId.7,lightMask.7,steering.7,playerId.8,lightMask.8,steering.8,playerId.9,lightMask.9,steering.9
simFrame,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,2,0,3.273796392022632e-05,3,0,6.280963134486228e-06,1,0,0.0,-99.99,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
3,2,0,9.472153760725632e-05,3,0,1.7716480215312913e-05,1,0,0.0,-99.99,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
4,2,0,9.470872464589775e-05,3,0,1.7714050045469776e-05,1,0,0.0,-99.99,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
5,2,0,9.46905929595232e-05,3,0,1.7710608517518267e-05,1,0,0.0,-99.99,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
6,2,0,9.466714254813267e-05,3,0,1.770615745044779e-05,1,0,0.0,-99.99,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
7,2,0,9.46383734117262e-05,3,0,1.7700696844258346e-05,1,0,0.0,-99.99,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
8,2,0,9.460430010221897e-05,3,0,1.7694228517939337e-05,1,0,0.0,-99.99,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
9,2,0,9.456491534365341e-05,3,0,1.7686754290480167e-05,1,0,0.0,-99.99,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
10,2,0,9.452024823985994e-05,3,0,1.7678274161880836e-05,1,0,0.0,-99.99,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
11,2,0,9.447029151488096e-05,3,0,1.766879177012015e-05,1,0,0.0,-99.99,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99


In [10]:
def _clean_player_id(value):
    """Strip and convert one playerId cell; blank strings become PLACE_HOLDER.

    Non-string values are returned unchanged, so a mixed object column no
    longer fails on `.strip()`.
    """
    if isinstance(value, str):
        value = value.strip()
        return int(value) if value else PLACE_HOLDER
    return value


# Safety net: re-parse any playerId column that is still stored with object dtype.
for i in range(END_IDX):
    id_col = "{}.{}".format(col_of_one[0], i)
    # Compare against the builtin `object`: the `np.object` alias was removed in
    # NumPy 1.24 and raises AttributeError there.
    if df10[id_col].dtype == object:
        df10[id_col] = df10[id_col].apply(_clean_player_id)

In [11]:
"""
MOST important task: Standardization.

Details in handling pkg9 code
    
    
!!! Takes quite a long time
"""
row_nums = df10.shape[0]
jump = len(col_of_one)   # fields per object: playerId, lightMask, steering

for row in range(row_nums):
    old_row = df10.iloc[row]
    # One placeholder-filled slot per object; slot k receives the data of playerId k
    # (so id 1 lands in the *.1 columns, and slot 0 stays empty unless an id 0 exists).
    new_row = [[PLACE_HOLDER] * jump for _ in range(NUM_OBJS)]

    # Visit the first column of every object group: 0, jump, 2*jump, ...
    for idx in range(0, NUM_OBJS * jump, jump):
        # .iloc makes the positional access explicit; plain integer keys on a
        # label-indexed Series are deprecated in recent pandas.
        objId = old_row.iloc[idx]
        if objId == PLACE_HOLDER:
            # Unused slot (missing data or padding): nothing to move.
            continue

        objId = int(objId)
        if not 0 <= objId < NUM_OBJS:
            # Without this check a too-large id gives a cryptic list error and a
            # negative id would silently wrap around to the last slots.
            raise IndexError(
                "row {} (simFrame {}): playerId {} does not fit into {} object slots, "
                "need to enlarge TARGET".format(row, df10.index[row], objId, NUM_OBJS)
            )
        # If the same id shows up in several groups of one row, the last one wins.
        new_row[objId] = list(old_row.iloc[idx:idx + jump])

    # flatten the list of slots back into one row
    new_row = [item for sublist in new_row for item in sublist]
    df10.iloc[row] = new_row


df10.head(10)

Unnamed: 0_level_0,playerId.0,lightMask.0,steering.0,playerId.1,lightMask.1,steering.1,playerId.2,lightMask.2,steering.2,playerId.3,...,steering.6,playerId.7,lightMask.7,steering.7,playerId.8,lightMask.8,steering.8,playerId.9,lightMask.9,steering.9
simFrame,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,-99.99,-99.99,-99.99,1.0,0.0,0.0,2.0,0.0,3.273796392022632e-05,3.0,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
3,-99.99,-99.99,-99.99,1.0,0.0,0.0,2.0,0.0,9.472153760725632e-05,3.0,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
4,-99.99,-99.99,-99.99,1.0,0.0,0.0,2.0,0.0,9.470872464589775e-05,3.0,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
5,-99.99,-99.99,-99.99,1.0,0.0,0.0,2.0,0.0,9.46905929595232e-05,3.0,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
6,-99.99,-99.99,-99.99,1.0,0.0,0.0,2.0,0.0,9.466714254813267e-05,3.0,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
7,-99.99,-99.99,-99.99,1.0,0.0,0.0,2.0,0.0,9.46383734117262e-05,3.0,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
8,-99.99,-99.99,-99.99,1.0,0.0,0.0,2.0,0.0,9.460430010221897e-05,3.0,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
9,-99.99,-99.99,-99.99,1.0,0.0,0.0,2.0,0.0,9.456491534365341e-05,3.0,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
10,-99.99,-99.99,-99.99,1.0,0.0,0.0,2.0,0.0,9.452024823985994e-05,3.0,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
11,-99.99,-99.99,-99.99,1.0,0.0,0.0,2.0,0.0,9.447029151488096e-05,3.0,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99


In [12]:
# Show, for every object slot, which playerId values it holds and how often.
id_field = col_of_one[0]
for i in range(NUM_OBJS):
    counts = df10["%s.%d" % (id_field, i)].value_counts()
    print(counts)

-99.99    5800
Name: playerId.0, dtype: int64
1.0    5800
Name: playerId.1, dtype: int64
2.0    5800
Name: playerId.2, dtype: int64
3.0    5800
Name: playerId.3, dtype: int64
-99.99    5800
Name: playerId.4, dtype: int64
-99.99    5800
Name: playerId.5, dtype: int64
-99.99    5800
Name: playerId.6, dtype: int64
-99.99    5800
Name: playerId.7, dtype: int64
-99.99    5800
Name: playerId.8, dtype: int64
-99.99    5800
Name: playerId.9, dtype: int64


In [13]:
# Sanity check after the reordering: silent when everything is fine.
# A slot may hold at most two distinct values (presumably one real id plus PLACE_HOLDER).
for i in range(END_IDX):
    id_col = "{}.{}".format(col_of_one[0], i)
    distinct_ids = len(df10[id_col].value_counts())
    assert distinct_ids <= 2, "playerId.{} col still has mixed id, need to further enlarge TARGET {}".format(i, TARGET)

In [14]:
# Final column names use "_" instead of "." between field name and slot number.
new_names = []
for i in range(NUM_OBJS):
    for name in col_of_one:
        new_names.append(name + "_" + str(i))

df10.columns = new_names
df10.columns

Index(['playerId_0', 'lightMask_0', 'steering_0', 'playerId_1', 'lightMask_1',
       'steering_1', 'playerId_2', 'lightMask_2', 'steering_2', 'playerId_3',
       'lightMask_3', 'steering_3', 'playerId_4', 'lightMask_4', 'steering_4',
       'playerId_5', 'lightMask_5', 'steering_5', 'playerId_6', 'lightMask_6',
       'steering_6', 'playerId_7', 'lightMask_7', 'steering_7', 'playerId_8',
       'lightMask_8', 'steering_8', 'playerId_9', 'lightMask_9', 'steering_9'],
      dtype='object')

In [15]:
# Persist the standardized frame (simFrame index included) to OUTPUT_PATH.
df10.to_csv(path_or_buf=OUTPUT_PATH)
print("Normal work for pkg 10 ends here.")

Normal work for pkg 10 ends here.


In [None]:
# Banner separating the main pipeline above from the optional cells below,
# whose code is currently disabled (wrapped in string literals / commented out).
# NOTE(review): the message mentions "id==0" while the notes below talk about the car with id=1 -- confirm which is meant.
print("The following 3 cells is used only when we just need the road sign whose id==0. \n")

In [None]:
"""
Formalize for general case.

We are sure that:
1. We only need simFrame as index col, plus (steering and lightmask) of the car whose id=1
2. The lightmask col is in a fixed place relative to the id col (right after id), and so is steering

But in a general scenario, what we aren't sure is that:
1. which cols of playerID.x those id=1 go to
2. In a single playerID.x col, there could be different id, i.e. those id=1 in diff rows go to diff cols

Idea:
Check each row,
    Check each col of playerID.x
        if id==1, get the lightmask and steering next to it, store
        
"""

"""
# Prepare, define in B
lightMask_col = []
steering_col = []
row_num = df10.shape[0]    # Should be 8335 this time
df10.rename(columns={
    "playerId":"playerId.0",
    "lightMask":"lightMask.0",
    "steering": "steering.0",
    "steeringWheelTorque": "steeringWheelTorque"

}, inplace=True)   # format col name for loop
NUM_CARS   # number of cars currently in csv, which decides how many and which playerId.x cols to check

got = False
# Start to loop
for row in range(row_num):
    this_row = df10.iloc[row]
    got = False
    for i in range(NUM_CARS):
        if this_row["playerId.{}".format(i)] == 1:
            lightMask_col.append(this_row["lightMask.{}".format(i)])
            steering_col.append(this_row["steering.{}".format(i)])
            got = True
            break
    if not got:
        # Strange thing found: some rows (<100 out of 8000+) have no data of ego car
        lightMask_col.append(PLACE_HOLDER)
        steering_col.append(PLACE_HOLDER)
        # print("row num: ", row)

# IMPORTANT: since each row must have ego car data, they should match
print(len(lightMask_col), len(steering_col), row_num)
"""

In [None]:
"""
df10_new = pd.DataFrame({
    'simFrame': df10['simFrame'],
    'light': lightMask_col,
    'steering': steering_col
})
df10_new.set_index('simFrame', inplace=True)
df10_new
"""

In [None]:
# df10_new.to_csv(OUTPUT_PATH)