In [1]:
"""
FILE NAME: big_app.ipynb

This is the integrated, web-based app designed to process the raw data (in csv format)
generated by Virtual Test Drive (by VIRES company) RDB Sniffer runtime tool.

Structure:
1. Setup, get working dir by interacting with the user
2. do data process for each raw csv (representing data from one package)
3. ask for an output dir and join those csv (currently 5) to a big csv
4. setup socket config and send the big csv to a remote server (another computer)
"""
# Print a startup banner.
print("Application Starting")

Application Starting


In [2]:
# import necessary modules
print("importing necessary python packages.")
# These are built-in packages in std lib
import sys
import csv
import os
import time
import threading
import socket
try:
    import pandas as pd
    import numpy as np
    import easygui as eg   
except Exception as e:
    print(e)
    print("At least one of the python packages of [pandas, numpy, easygui] is not installed.")
    print("Please use command 'pip install <package-name>' to install your missing package.")
    sys.exit()
else:
    print("sucessfully importing packages")

importing necessary python packages.
sucessfully importing packages


In [3]:
"""
Set up source dir (storing raw csv) on user's choice
"""

# Ask the user for the folder that holds the raw csv files.
DATA_DIR = eg.diropenbox(title="Choose the folder with the 5 raw csv.", default="../")
if DATA_DIR is None:
    # diropenbox returns None when the dialog is cancelled. Without this guard
    # os.listdir(None) would silently list the current working directory and the
    # later os.path.join(DATA_DIR, f) calls would fail with a TypeError.
    print("No folder was chosen.")
    print("The app will exit. Please reopen the app and choose the correct folder.")
    time.sleep(3)
    sys.exit()

print("Make sure that the following files have 5 .csv files ending in pkg_5, pkg_7, pkg_9, pkg_10, pkg_20 accordingly.")
print(os.listdir(DATA_DIR))

Make sure that the following files have 5 .csv files ending in pkg_5, pkg_7, pkg_9, pkg_10, pkg_20 accordingly.
['.DS_Store', '2_24pkg20.csv', '2_24pkg7.csv', '2_24pkg5.csv', '2_24pkg9.csv', '2_24pkg10.csv']


In [4]:
""" Global const """
# Sentinel value written into cells that carry no data: it is used as the fillna value
# and as the content of the padded object slots in the pre-processing cells.
# NOTE(review): the pkg20 cell reassigns PLACE_HOLDER to 99.99, so re-running an earlier
# cell after that one would pick up the new value - confirm this is intended.
PLACE_HOLDER = -99.99

In [5]:
"""
The following chunks of code process pkg5 data
"""
# Locate the raw csv of package 5 (file name ending in "pkg5.csv") in DATA_DIR.
pkg_suffix = "pkg5.csv"
file_found = False
for f in os.listdir(DATA_DIR):
    if f.endswith(pkg_suffix):
        FILE_PATH = os.path.join(DATA_DIR, f)
        file_found = True
        break

# Explicit check instead of try/assert/except: assert statements are removed when Python
# runs with -O, in which case a missing file would go unnoticed and a FILE_PATH left over
# from an earlier cell could be reused. The reason is now also shown to the user.
if not file_found:
    print(f"There is no file ending with {pkg_suffix} in the folder you choose")
    print("The app will exit. Please reopen the app and choose the correct folder.")
    time.sleep(3)
    sys.exit()

# Variables for pkg5
# index_to_keep holds remainders of (column index % PATTERN): the pre-processing cell
# keeps a raw column only if its remainder is listed here. The kept columns of one
# object are named, in order, by col_of_one.
index_to_keep = [2,4,6,7,9]
PATTERN = 12  # number of entries of one road line, which form a pattern
col_of_one = ["playerId", "laneId","roadS", "roadT", "hdgRel"]

print(f"Found the csv file {f} in your folder. The process will start now.")

Found the csv file 2_24pkg5.csv in your folder. The process will start now.


In [6]:
""" read in the csv and preview."""
# First pass over the file: find the widest row so that every field gets its own column.
with open(FILE_PATH, mode='r', encoding="utf-8") as raw_file:
    MAX_LEN = max(map(len, csv.reader(raw_file)))

# Second pass: load everything under placeholder headers temp.0 .. temp.(MAX_LEN-1).
original_col_names = [f"temp.{col}" for col in range(MAX_LEN)]
df5 = pd.read_csv(FILE_PATH, names=original_col_names, low_memory=False)

# Row 0 (presumably the file's own, possibly too short, header line) was read as data:
# remove it and shift the row labels so they start at 0 again.
df5 = df5.drop(index=0)
df5.index = df5.index - 1
df5.columns = [label.strip() for label in df5.columns]

print("Preview the raw csv. The correct column names are not read in because they may not be complete (long enough). Will be fixed later")
print(df5.head(5))

Preview the raw csv. The correct column names are not read in because they may not be complete (long enough). Will be fixed later
                    temp.0                   temp.1                   temp.2  \
0  +9.9999997764825821e-03                        2                        2   
1  +1.9999999552965164e-02                        3                        2   
2  +2.9999999329447746e-02                        4                        2   
3  +3.9999999105930328e-02                        5                        2   
4  +4.9999998882412910e-02                        6                        2   

                    temp.3                   temp.4                   temp.5  \
0                        1                       -2                      0x1   
1                        1                       -2                      0x1   
2                        1                       -2                      0x1   
3                        1                       -2                  

In [7]:
""" Start pre-processing """
# Keep only whole hundreds of rows. The trailing rows are safe to delete because
# 100 rows <=> 1s in the simulation, and the ending row is often incomplete.
rows = df5.shape[0]
num_del = rows%100
df5.drop(labels=range(rows-num_del, rows), axis=0, inplace=True)

# Drop trailing columns until the columns after the 2 leading ones form a whole
# number of PATTERN-wide groups.
while (len(df5.columns)-2)%PATTERN != 0:
    print("Found an empty col at the end")
    df5.drop(df5.columns[-1], axis=1, inplace=True)

NUM_OBJS = (len(df5.columns)-2)//PATTERN   # object groups present in the raw file
BEGIN_IDX = NUM_OBJS
TARGET = 10   # not used in this cell; kept so the name stays defined
END_IDX = 10  # number of object slots the frame is padded to below
# NOTE(review): if the raw file ever holds more than END_IDX groups, NUM_OBJS is lowered
# below the real group count at the end of this cell - confirm that cannot happen.

# delete unused columns: keep column 1 and every column whose (index % PATTERN) is wanted
cols_to_del = [name for i, name in enumerate(df5.columns) if (i%PATTERN not in index_to_keep and i!=1)]
df5 = df5.drop(columns=cols_to_del)

# Fill gaps, then use the frame number (raw column 1) as a unique integer index.
df5.fillna(PLACE_HOLDER, inplace=True)
df5.rename(columns={"temp.1":"simFrame"}, inplace=True)
df5["simFrame"] = df5["simFrame"].astype(np.int64)
df5.set_index("simFrame", inplace=True)
# .copy(): the boolean-mask selection may be a view of the old frame; the column
# assignments below must write to an independent frame (avoids SettingWithCopyWarning).
df5 = df5[~df5.index.duplicated(keep='first')].copy()

# replace temp column names with column names we want
new_names = ["{}_{}".format(name,i) for i in range(NUM_OBJS) for name in col_of_one]
df5.columns = new_names

# format all id entries because they serve an important use later
print(f"Please check the following value count of id in each {col_of_one[0]} column")
print(f"All id values should either be a non-negative whole number (in int or float) or a place holder {PLACE_HOLDER}")
for i in range(NUM_OBJS):
    id_col = "{}_{}".format(col_of_one[0], i)
    # ids read from the csv as text become ints; non-string values (e.g. PLACE_HOLDER) stay as they are
    df5[id_col] = df5[id_col].apply(lambda x: int(x.strip()) if isinstance(x, str) else x)
    print(df5[id_col].value_counts())

# append placeholder columns so that the frame has END_IDX object slots
for i in range(BEGIN_IDX, END_IDX):
    for name in col_of_one:
        df5[name+'_'+str(i)] = PLACE_HOLDER

NUM_OBJS = END_IDX

Found an empty col at the end
Please check the following value count of id in each playerId column
All id values should either be a non-negative whole number (in int or float) or a place holder -99.99
2    4700
Name: playerId_0, dtype: int64
3    3153
1     866
2     681
Name: playerId_1, dtype: int64
 1.00     1926
 2.00     1908
-99.99     866
Name: playerId_2, dtype: int64
-99.99    2792
 1.00     1533
 2.00      375
Name: playerId_3, dtype: int64
-99.99    4325
 1.00      375
Name: playerId_4, dtype: int64


In [8]:
# Display the first 10 rows of the pre-processed pkg5 frame.
df5.head(10)

Unnamed: 0_level_0,playerId_0,laneId_0,roadS_0,roadT_0,hdgRel_0,playerId_1,laneId_1,roadS_1,roadT_1,hdgRel_1,...,playerId_8,laneId_8,roadS_8,roadT_8,hdgRel_8,playerId_9,laneId_9,roadS_9,roadT_9,hdgRel_9
simFrame,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,2,-2,220.0,-5.134033203125,-0.004028314724564552,3,-1,260.0,-1.6483154296875,-0.0023071265313774347,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
3,2,-2,220.00015258789065,-5.134032726287842,-0.0040283212438225755,3,-1,260.0001525878906,-1.6483153104782104,-0.0023071274627000093,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
4,2,-2,220.0003967285156,-5.134032249450684,-0.004028331022709608,3,-1,260.0003967285156,-1.6483149528503418,-0.002307129092514515,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
5,2,-2,220.0007476806641,-5.1340317726135245,-0.00402834452688694,3,-1,260.0007629394531,-1.6483145952224731,-0.002307131187990308,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
6,2,-2,220.00120544433597,-5.134030818939209,-0.004028361290693283,3,-1,260.0011901855469,-1.6483139991760254,-0.002307133749127388,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
7,2,-2,220.00175476074216,-5.134029865264893,-0.0040283813141286364,3,-1,260.0017395019531,-1.6483134031295776,-0.0023071367759257555,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
8,2,-2,220.0023956298828,-5.134028434753418,-0.00402840506285429,3,-1,260.0024108886719,-1.6483126878738403,-0.0023071409668773413,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
9,2,-2,220.00314331054688,-5.134027004241944,-0.004028432071208954,3,-1,260.0031433105469,-1.6483118534088137,-0.0023071449249982834,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
10,2,-2,220.0039978027344,-5.134025096893311,-0.004028462804853916,3,-1,260.0039978027344,-1.6483107805252075,-0.002307150047272444,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
11,2,-2,220.00494384765625,-5.134023189544678,-0.00402849679812789,3,-1,260.00494384765625,-1.6483097076416016,-0.0023071556352078915,...,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99


In [9]:
"""
MOST important task: Standardization.

Details explained in the file doc
  
!!! May take quite a long time
"""
print("Sorting started. This may take several seconds up to several minutes, please be patient.")

row_nums = df5.shape[0]
jump = len(col_of_one)   # number of columns that make up one object chunk

for row in range(row_nums):
    old_row = df5.iloc[row]
    # Every slot starts as placeholders. Sharing one inner list is safe here because
    # slots are only ever replaced as a whole, never modified in place.
    new_row = [ [PLACE_HOLDER]*jump ]*NUM_OBJS

    # Walk over the first column (the id) of every chunk in the row.
    for idx in range(0, NUM_OBJS*jump, jump):
        # .iloc gives explicit positional access; old_row[idx] with an integer key on a
        # string-labelled Series is deprecated in pandas 2.x.
        objId = old_row.iloc[idx]
        if objId == PLACE_HOLDER or objId >= NUM_OBJS or objId < 0:
            # Special case where id is placeholder -99.99 (also ids outside the slot range)
            continue
        objId = int(objId)
        # The chunk with id k is moved to slot k (checked below). If an id occurs in
        # several chunks of the same row, the last chunk wins.
        new_row[objId] = old_row.iloc[idx:idx+jump]

    # flatten the list of chunks back into one row
    new_row = [item for sublist in new_row for item in sublist]
    df5.iloc[row] = new_row

# do the check after the sort, there should be no output if it's correct
for i in range(END_IDX):
    row_count = dict(df5["{}_{}".format(col_of_one[0], i)].value_counts())
    for key in row_count:
        assert key==PLACE_HOLDER or key==i, f"column {col_of_one[0]}.{i} \
                has wrong id value other than {PLACE_HOLDER} and {i}"
    assert len(row_count) <= 2


print("Work complete. Proceed to csv of pkg7.")

Sorting started. This may take several seconds up to several minutes, please be patient.
Work complete. Proceed to csv of pkg7.


In [10]:
"""
The following chunks of code process pkg7 data
"""

# Locate the raw csv of package 7 (file name ending in "pkg7.csv") in DATA_DIR.
pkg_suffix = "pkg7.csv"
file_found = False
for f in os.listdir(DATA_DIR):
    if f.endswith(pkg_suffix):
        FILE_PATH = os.path.join(DATA_DIR, f)
        file_found = True
        break

# Explicit check instead of try/assert/except: assert statements are removed when Python
# runs with -O, in which case a missing file would go unnoticed and the FILE_PATH of the
# previous package would be reused. The reason is now also shown to the user.
if not file_found:
    print(f"There is no file ending with {pkg_suffix} in the folder you choose")
    print("The app will exit. Please reopen the app and choose the correct folder.")
    time.sleep(3)
    sys.exit()

# Variables for pkg7
# We want id, type, color, and lateralDist, which are in col index 3, 16, 17, 6.
# index_to_keep holds those indices as remainders of (column index % PATTERN); the
# remainders 0 and 1 also match the two leading raw columns 0 and 1.
index_to_keep = [0,1,3,6]
PATTERN = 16  # number of entries of one road line, which form a pattern
col_of_one = ["id", "lateralDist", "type", "color"]

print(f"Found the csv file {f} in your folder. The process will start now.")

Found the csv file 2_24pkg7.csv in your folder. The process will start now.


In [11]:
""" read in the csv and preview."""
# First pass over the file: find the widest row so that every field gets its own column.
with open(FILE_PATH, mode='r', encoding="utf-8") as raw_file:
    MAX_LEN = max(map(len, csv.reader(raw_file)))

# Second pass: load everything under placeholder headers temp.0 .. temp.(MAX_LEN-1).
original_col_names = [f"temp.{col}" for col in range(MAX_LEN)]
df7 = pd.read_csv(FILE_PATH, names=original_col_names, low_memory=False)

# Row 0 (presumably the file's own, possibly too short, header line) was read as data:
# remove it and shift the row labels so they start at 0 again.
df7 = df7.drop(index=0)
df7.index = df7.index - 1
df7.columns = [label.strip() for label in df7.columns]

print("Preview the raw csv. The correct column names are not read in because they may not be complete (long enough). Will be fixed later")
print(df7.head(5))

Preview the raw csv. The correct column names are not read in because they may not be complete (long enough). Will be fixed later
                    temp.0                   temp.1                   temp.2  \
0  +9.9999997764825821e-03                        2                        1   
1  +1.9999999552965164e-02                        3                        1   
2  +2.9999999329447746e-02                        4                        1   
3  +3.9999999105930328e-02                        5                        1   
4  +4.9999998882412910e-02                        6                        1   

                    temp.3                   temp.4                   temp.5  \
0                        0                       -1                       -1   
1                        0                       -1                       -1   
2                        0                       -1                       -1   
3                        0                       -1                  

In [12]:
""" Start pre-processing """
# Keep only whole hundreds of rows. The trailing rows are safe to delete because
# 100 rows <=> 1s in the simulation, and the ending row is often incomplete.
rows = df7.shape[0]
num_del = rows%100
df7.drop(labels=range(rows-num_del, rows), axis=0, inplace=True)

# Drop trailing columns until the columns after the 2 leading ones form a whole
# number of PATTERN-wide groups.
while (len(df7.columns)-2)%PATTERN != 0:
    print("Found an empty col at the end")
    df7.drop(df7.columns[-1], axis=1, inplace=True)

NUM_OBJS = (len(df7.columns)-2)//PATTERN   # object groups present in the raw file
BEGIN_IDX = NUM_OBJS
TARGET = 10   # not used in this cell; kept so the name stays defined
END_IDX = 10  # number of object slots the frame is padded to below
# NOTE(review): if the raw file ever holds more than END_IDX groups, NUM_OBJS is lowered
# below the real group count at the end of this cell - confirm that cannot happen.

# delete unused columns: keep column 1 and every column whose (index % PATTERN) is wanted
# (for pkg7 this also keeps raw column 0, named simTime below)
cols_to_del = [name for i, name in enumerate(df7.columns) if (i%PATTERN not in index_to_keep and i!=1)]
df7 = df7.drop(columns=cols_to_del)

# Fill gaps, then use the frame number (raw column 1) as a unique integer index.
df7.fillna(PLACE_HOLDER, inplace=True)
df7.rename(columns={"temp.1":"simFrame"}, inplace=True)
df7["simFrame"] = df7["simFrame"].astype(np.int64)
df7.set_index("simFrame", inplace=True)
# .copy(): the boolean-mask selection may be a view of the old frame; the column
# assignments below must write to an independent frame (avoids SettingWithCopyWarning).
df7 = df7[~df7.index.duplicated(keep='first')].copy()

# replace temp column names with column names we want
new_names = ['simTime'] + ["{}_{}".format(name,i) for i in range(NUM_OBJS) for name in col_of_one]
df7.columns = new_names

# format all id entries because they serve an important use later
print(f"Please check the following value count of id in each {col_of_one[0]} column")
print(f"All id values should either be a non-negative whole number (in int or float) or a place holder {PLACE_HOLDER}")
for i in range(NUM_OBJS):
    id_col = "{}_{}".format(col_of_one[0], i)
    # ids read from the csv as text become ints; non-string values (e.g. PLACE_HOLDER) stay as they are
    df7[id_col] = df7[id_col].apply(lambda x: int(x.strip()) if isinstance(x, str) else x)
    print(df7[id_col].value_counts())

# append placeholder columns so that the frame has END_IDX object slots
for i in range(BEGIN_IDX, END_IDX):
    for name in col_of_one:
        df7[name+'_'+str(i)] = PLACE_HOLDER

NUM_OBJS = END_IDX

Found an empty col at the end
Please check the following value count of id in each id column
All id values should either be a non-negative whole number (in int or float) or a place holder -99.99
0    4700
Name: id_0, dtype: int64
2    2932
1    1768
Name: id_1, dtype: int64
4    2019
2    1768
3     753
6     160
Name: id_2, dtype: int64
-99.99    2179
 3.00     1684
 6.00      608
 4.00      229
Name: id_3, dtype: int64
-99.99    2704
 4.00     1550
 7.00      446
Name: id_4, dtype: int64
-99.99    4341
 10.00     359
Name: id_5, dtype: int64
-99.99    4341
 11.00     359
Name: id_6, dtype: int64


In [13]:
"""
MOST important task: Standardization.

Details explained in the file doc
  
!!! May take quite a long time
"""
print("Sorting started. This may take several seconds up to several minutes, please be patient.")

row_nums = df7.shape[0]
jump = len(col_of_one)   # number of columns that make up one object chunk

for row in range(row_nums):
    # Skip the first column (simTime); only the object chunks are re-ordered.
    old_row = df7.iloc[row].iloc[1:]
    # Every slot starts as placeholders. Sharing one inner list is safe here because
    # slots are only ever replaced as a whole, never modified in place.
    new_row = [ [PLACE_HOLDER]*jump ]*NUM_OBJS

    # Walk over the first column (the id) of every chunk in the row.
    for idx in range(0, NUM_OBJS*jump, jump):
        # .iloc gives explicit positional access; old_row[idx] with an integer key on a
        # string-labelled Series is deprecated in pandas 2.x.
        objId = old_row.iloc[idx]
        if objId == PLACE_HOLDER or objId >= NUM_OBJS or objId < 0:
            # Special case where id is placeholder -99.99 (also ids outside the slot range)
            continue
        objId = int(objId)
        # The chunk with id k is moved to slot k (checked below). If an id occurs in
        # several chunks of the same row, the last chunk wins.
        new_row[objId] = old_row.iloc[idx:idx+jump]

    # flatten the list of chunks back into one row
    new_row = [item for sublist in new_row for item in sublist]
    df7.iloc[row, 1:] = new_row

# do the check after the sort, there should be no output if it's correct
for i in range(END_IDX):
    row_count = dict(df7["{}_{}".format(col_of_one[0], i)].value_counts())
    for key in row_count:
        assert key==PLACE_HOLDER or key==i, f"column {col_of_one[0]}.{i} \
                has wrong id value other than {PLACE_HOLDER} and {i}"
    assert len(row_count) <= 2


print("Work complete. Proceed to csv of pkg9.")

Sorting started. This may take several seconds up to several minutes, please be patient.
Work complete. Proceed to csv of pkg9.


In [14]:
"""
The following chunks of code process pkg9 data
"""

# Locate the raw csv of package 9 (file name ending in "pkg9.csv") in DATA_DIR.
pkg_suffix = "pkg9.csv"
file_found = False
for f in os.listdir(DATA_DIR):
    if f.endswith(pkg_suffix):
        FILE_PATH = os.path.join(DATA_DIR, f)
        file_found = True
        break

# Explicit check instead of try/assert/except: assert statements are removed when Python
# runs with -O, in which case a missing file would go unnoticed and the FILE_PATH of the
# previous package would be reused. The reason is now also shown to the user.
if not file_found:
    print(f"There is no file ending with {pkg_suffix} in the folder you choose")
    print("The app will exit. Please reopen the app and choose the correct folder.")
    time.sleep(3)
    sys.exit()

# Variables for pkg9
# We want objectId, X, Y, yaw_angle(h after X), Vx, Vy, Ax, Ay as well as dimXYZ, offXYZ for all cars
# col index see below (*range(7,13) is 7~12, dimXYZ and offXYZ)
# index_to_keep holds remainders of (column index % PATTERN).
index_to_keep = [2, 4, *range(7,13), 13, 14, 16, 22, 23, 31, 32]
PATTERN = 39  # number of entries of one road line, which form a pattern
col_of_one = [
    "objectId", 'obj_type', 'dimX', 'dimY', 'dimZ', 'offX', 'offY', 'offZ',
    "X", "Y", "yaw_angle", "Vx", "Vy", "Ax", "Ay",
]

print(f"Found the csv file {f} in your folder. The process will start now.")

Found the csv file 2_24pkg9.csv in your folder. The process will start now.


In [15]:
""" read in the csv and preview."""
# First pass over the file: find the widest row so that every field gets its own column.
with open(FILE_PATH, mode='r', encoding="utf-8") as raw_file:
    MAX_LEN = max(map(len, csv.reader(raw_file)))

# Second pass: load everything under placeholder headers temp.0 .. temp.(MAX_LEN-1).
original_col_names = [f"temp.{col}" for col in range(MAX_LEN)]
df9 = pd.read_csv(FILE_PATH, names=original_col_names, low_memory=False)

# Row 0 (presumably the file's own, possibly too short, header line) was read as data:
# remove it and shift the row labels so they start at 0 again.
df9 = df9.drop(index=0)
df9.index = df9.index - 1
df9.columns = [label.strip() for label in df9.columns]

print("Preview the raw csv. The correct column names are not read in because they may not be complete (long enough). Will be fixed later")
print(df9.head(5))

Preview the raw csv. The correct column names are not read in because they may not be complete (long enough). Will be fixed later
                    temp.0                   temp.1                   temp.2  \
0  +9.9999997764825821e-03                        2                       +2   
1  +1.9999999552965164e-02                        3                       +2   
2  +2.9999999329447746e-02                        4                       +2   
3  +3.9999999105930328e-02                        5                       +2   
4  +4.9999998882412910e-02                        6                       +2   

                    temp.3                   temp.4                   temp.5  \
0                        1                        1                      0x7   
1                        1                        1                      0x7   
2                        1                        1                      0x7   
3                        1                        1                  

In [16]:
""" Start pre-processing """
# Keep only whole hundreds of rows. The trailing rows are safe to delete because
# 100 rows <=> 1s in the simulation, and the ending row is often incomplete.
rows = df9.shape[0]
num_del = rows%100
df9.drop(labels=range(rows-num_del, rows), axis=0, inplace=True)

# Drop trailing columns until the columns after the 2 leading ones form a whole
# number of PATTERN-wide groups.
while (len(df9.columns)-2)%PATTERN != 0:
    print("Found an empty col at the end")
    df9.drop(df9.columns[-1], axis=1, inplace=True)

NUM_OBJS = (len(df9.columns)-2)//PATTERN   # object groups present in the raw file
BEGIN_IDX = NUM_OBJS
TARGET = 10   # not used in this cell; kept so the name stays defined
END_IDX = 10  # number of object slots the frame is padded to below
# NOTE(review): if the raw file ever holds more than END_IDX groups, NUM_OBJS is lowered
# below the real group count at the end of this cell - confirm that cannot happen.

# delete unused columns: keep column 1 and every column whose (index % PATTERN) is wanted
cols_to_del = [name for i, name in enumerate(df9.columns) if (i%PATTERN not in index_to_keep and i!=1)]
df9 = df9.drop(columns=cols_to_del)

# Fill gaps, then use the frame number (raw column 1) as a unique integer index.
df9.fillna(PLACE_HOLDER, inplace=True)
df9.rename(columns={"temp.1":"simFrame"}, inplace=True)
df9["simFrame"] = df9["simFrame"].astype(np.int64)
df9.set_index("simFrame", inplace=True)
# .copy(): the boolean-mask selection may be a view of the old frame; the column
# assignments below must write to an independent frame (avoids SettingWithCopyWarning).
df9 = df9[~df9.index.duplicated(keep='first')].copy()

# replace temp column names with column names we want
new_names = ["{}_{}".format(name,i) for i in range(NUM_OBJS) for name in col_of_one]
df9.columns = new_names

# format all id entries because they serve an important use later
print(f"Please check the following value count of id in each {col_of_one[0]} column")
print(f"All id values should either be a non-negative whole number (in int or float) or a place holder {PLACE_HOLDER}")
for i in range(NUM_OBJS):
    id_col = "{}_{}".format(col_of_one[0], i)
    # ids read from the csv as text become ints; non-string values (e.g. PLACE_HOLDER) stay as they are
    df9[id_col] = df9[id_col].apply(lambda x: int(x.strip()) if isinstance(x, str) else x)
    print(df9[id_col].value_counts())

# append placeholder columns so that the frame has END_IDX object slots
for i in range(BEGIN_IDX, END_IDX):
    for name in col_of_one:
        df9[name+'_'+str(i)] = PLACE_HOLDER

NUM_OBJS = END_IDX

Found an empty col at the end
Found an empty col at the end
Found an empty col at the end
Found an empty col at the end
Found an empty col at the end
Found an empty col at the end
Found an empty col at the end
Found an empty col at the end
Found an empty col at the end
Found an empty col at the end
Found an empty col at the end
Found an empty col at the end
Found an empty col at the end
Found an empty col at the end
Found an empty col at the end
Found an empty col at the end
Found an empty col at the end
Found an empty col at the end
Found an empty col at the end
Found an empty col at the end
Found an empty col at the end
Found an empty col at the end
Please check the following value count of id in each objectId column
All id values should either be a non-negative whole number (in int or float) or a place holder -99.99
2    4700
Name: objectId_0, dtype: int64
3    2757
1    1943
Name: objectId_1, dtype: int64
1     2757
0     1179
38     631
39     133
Name: objectId_2, dtype: int64
0 

In [17]:
"""
MOST important task: Standardization.

Details explained in the file doc
  
!!! May take quite a long time
"""
print("Sorting started. This may take several seconds up to several minutes, please be patient.")

row_nums = df9.shape[0]
jump = len(col_of_one)   # number of columns that make up one object chunk

for row in range(row_nums):
    old_row = df9.iloc[row]
    # Every slot starts as placeholders. Sharing one inner list is safe here because
    # slots are only ever replaced as a whole, never modified in place.
    new_row = [ [PLACE_HOLDER]*jump ]*NUM_OBJS

    # Walk over the first column (the id) of every chunk in the row.
    for idx in range(0, NUM_OBJS*jump, jump):
        # .iloc gives explicit positional access; old_row[idx] with an integer key on a
        # string-labelled Series is deprecated in pandas 2.x.
        objId = old_row.iloc[idx]
        if objId == PLACE_HOLDER or objId >= NUM_OBJS or objId < 0:
            # Special case where id is placeholder -99.99 (also ids outside the slot range)
            continue
        objId = int(objId)
        # The chunk with id k is moved to slot k (checked below). If an id occurs in
        # several chunks of the same row, the last chunk wins.
        new_row[objId] = old_row.iloc[idx:idx+jump]

    # flatten the list of chunks back into one row
    new_row = [item for sublist in new_row for item in sublist]
    df9.iloc[row] = new_row

# do the check after the sort, there should be no output if it's correct
for i in range(END_IDX):
    row_count = dict(df9["{}_{}".format(col_of_one[0], i)].value_counts())
    for key in row_count:
        assert key==PLACE_HOLDER or key==i, f"column {col_of_one[0]}.{i} \
                has wrong id value other than {PLACE_HOLDER} and {i}"
    assert len(row_count) <= 2


print("Work complete. Proceed to csv of pkg10.")

Sorting started. This may take several seconds up to several minutes, please be patient.
Work complete. Proceed to csv of pkg10.


In [18]:
"""
The following chunks of code process pkg10 data
"""

# Locate the raw csv of package 10 (file name ending in "pkg10.csv") in DATA_DIR.
pkg_suffix = "pkg10.csv"
file_found = False
for f in os.listdir(DATA_DIR):
    if f.endswith(pkg_suffix):
        FILE_PATH = os.path.join(DATA_DIR, f)
        file_found = True
        break

# Explicit check instead of try/assert/except: assert statements are removed when Python
# runs with -O, in which case a missing file would go unnoticed and the FILE_PATH of the
# previous package would be reused. The reason is now also shown to the user.
if not file_found:
    print(f"There is no file ending with {pkg_suffix} in the folder you choose")
    print("The app will exit. Please reopen the app and choose the correct folder.")
    time.sleep(3)
    sys.exit()

# Variables for pkg10
# We want playerId, lightMask, steering.
# index_to_keep holds remainders of (column index % PATTERN); the kept columns of one
# object are named, in order, by col_of_one.
index_to_keep = [2,3,0]
PATTERN = 4  # number of entries of one obj, which form a pattern
col_of_one = ["playerId", "lightMask", "steering"]

print(f"Found the csv file {f} in your folder. The process will start now.")

Found the csv file 2_24pkg10.csv in your folder. The process will start now.


In [19]:
""" read in the csv and preview."""
# First pass over the file: find the widest row so that every field gets its own column.
with open(FILE_PATH, mode='r', encoding="utf-8") as raw_file:
    MAX_LEN = max(map(len, csv.reader(raw_file)))

# Second pass: load everything under placeholder headers temp.0 .. temp.(MAX_LEN-1).
original_col_names = [f"temp.{col}" for col in range(MAX_LEN)]
df10 = pd.read_csv(FILE_PATH, names=original_col_names, low_memory=False)

# Row 0 (presumably the file's own, possibly too short, header line) was read as data:
# remove it and shift the row labels so they start at 0 again.
df10 = df10.drop(index=0)
df10.index = df10.index - 1
df10.columns = [label.strip() for label in df10.columns]

print("Preview the raw csv. The correct column names are not read in because they may not be complete (long enough). Will be fixed later")
print(df10.head(5))

Preview the raw csv. The correct column names are not read in because they may not be complete (long enough). Will be fixed later
                    temp.0                   temp.1                   temp.2  \
0  +9.9999997764825821e-03                        2                        2   
1  +1.9999999552965164e-02                        3                        2   
2  +2.9999999329447746e-02                        4                        2   
3  +3.9999999105930328e-02                        5                        2   
4  +4.9999998882412910e-02                        6                        2   

                    temp.3                   temp.4                   temp.5  \
0                    0x400  +2.3265420168172568e-05  +0.0000000000000000e+00   
1                    0x400  +6.7314351326785982e-05  +0.0000000000000000e+00   
2                    0x400  +6.7305227275937796e-05  +0.0000000000000000e+00   
3                    0x400  +6.7292312451172620e-05  +0.0000000000000

In [20]:
""" Start pre-processing """
# Keep only whole hundreds of rows. The trailing rows are safe to delete because
# 100 rows <=> 1s in the simulation, and the ending row is often incomplete.
rows = df10.shape[0]
num_del = rows%100
df10.drop(labels=range(rows-num_del, rows), axis=0, inplace=True)

# Drop trailing columns until the columns after the 2 leading ones form a whole
# number of PATTERN-wide groups.
while (len(df10.columns)-2)%PATTERN != 0:
    print("Found an empty col at the end")
    df10.drop(df10.columns[-1], axis=1, inplace=True)

NUM_OBJS = (len(df10.columns)-2)//PATTERN   # object groups present in the raw file
BEGIN_IDX = NUM_OBJS
TARGET = 10   # not used in this cell; kept so the name stays defined
END_IDX = 10  # number of object slots the frame is padded to below
# NOTE(review): if the raw file ever holds more than END_IDX groups, NUM_OBJS is lowered
# below the real group count at the end of this cell - confirm that cannot happen.

# delete unused columns: keep column 1 and every column whose (index % PATTERN) is wanted.
# Column 0 is removed explicitly because remainder 0 is in index_to_keep for pkg10.
cols_to_del = [name for i, name in enumerate(df10.columns) if i==0 or (i%PATTERN not in index_to_keep and i!=1)]
df10 = df10.drop(columns=cols_to_del)

# Fill gaps, then use the frame number (raw column 1) as a unique integer index.
df10.fillna(PLACE_HOLDER, inplace=True)
df10.rename(columns={"temp.1":"simFrame"}, inplace=True)
df10["simFrame"] = df10["simFrame"].astype(np.int64)
df10.set_index("simFrame", inplace=True)
# .copy(): the boolean-mask selection may be a view of the old frame; the column
# assignments below must write to an independent frame (avoids SettingWithCopyWarning).
df10 = df10[~df10.index.duplicated(keep='first')].copy()

# replace temp column names with column names we want
new_names = ["{}_{}".format(name,i) for i in range(NUM_OBJS) for name in col_of_one]
df10.columns = new_names

# format all id entries because they serve an important use later
print(f"Please check the following value count of id in each {col_of_one[0]} column")
print(f"All id values should either be a non-negative whole number (in int or float) or a place holder {PLACE_HOLDER}")
for i in range(NUM_OBJS):
    id_col = "{}_{}".format(col_of_one[0], i)
    # ids read from the csv as text become ints; non-string values (e.g. PLACE_HOLDER) stay as they are
    df10[id_col] = df10[id_col].apply(lambda x: int(x.strip()) if isinstance(x, str) else x)
    print(df10[id_col].value_counts())

# append placeholder columns so that the frame has END_IDX object slots
for i in range(BEGIN_IDX, END_IDX):
    for name in col_of_one:
        df10[name+'_'+str(i)] = PLACE_HOLDER

NUM_OBJS = END_IDX

Found an empty col at the end
Please check the following value count of id in each playerId column
All id values should either be a non-negative whole number (in int or float) or a place holder -99.99
2    4700
Name: playerId_0, dtype: int64
3    3153
1     866
2     681
Name: playerId_1, dtype: int64
 1.00     1926
 2.00     1908
-99.99     866
Name: playerId_2, dtype: int64
-99.99    2792
 1.00     1533
 2.00      375
Name: playerId_3, dtype: int64
-99.99    4325
 1.00      375
Name: playerId_4, dtype: int64


In [21]:
"""
MOST important task: Standardization.

Details explained in the file doc
  
!!! May take quite a long time
"""
print("Sorting started. This may take several seconds up to several minutes, please be patient.")

row_nums = df10.shape[0]
jump = len(col_of_one)   # number of columns that make up one object chunk

for row in range(row_nums):
    old_row = df10.iloc[row]
    # Every slot starts as placeholders. Sharing one inner list is safe here because
    # slots are only ever replaced as a whole, never modified in place.
    new_row = [ [PLACE_HOLDER]*jump ]*NUM_OBJS

    # Walk over the first column (the id) of every chunk in the row.
    for idx in range(0, NUM_OBJS*jump, jump):
        # .iloc gives explicit positional access; old_row[idx] with an integer key on a
        # string-labelled Series is deprecated in pandas 2.x.
        objId = old_row.iloc[idx]
        if objId == PLACE_HOLDER or objId >= NUM_OBJS or objId < 0:
            # Special case where id is placeholder -99.99 (also ids outside the slot range)
            continue
        objId = int(objId)
        # The chunk with id k is moved to slot k (checked below). If an id occurs in
        # several chunks of the same row, the last chunk wins.
        new_row[objId] = old_row.iloc[idx:idx+jump]

    # flatten the list of chunks back into one row
    new_row = [item for sublist in new_row for item in sublist]
    df10.iloc[row] = new_row

# do the check after the sort, there should be no output if it's correct
for i in range(END_IDX):
    row_count = dict(df10["{}_{}".format(col_of_one[0], i)].value_counts())
    for key in row_count:
        assert key==PLACE_HOLDER or key==i, f"column {col_of_one[0]}.{i} \
                has wrong id value other than {PLACE_HOLDER} and {i}"
    assert len(row_count) <= 2


print("Work complete. Proceed to csv of pkg20.")

Sorting started. This may take several seconds up to several minutes, please be patient.
Work complete. Proceed to csv of pkg20.


In [22]:
"""
The following chunks of code process pkg20 data
"""

# Define constants

# Get ___pkg20.csv full path
file_found = False
for f in os.listdir(DATA_DIR):
    if len(f) >= 9 and f[-9:]=="pkg20.csv":
        FILE_PATH = os.path.join(DATA_DIR, f)
        file_found = True
        break
try:
    assert file_found==True, "There is no file ending with pkg20.csv in the folder you choose"
except AssertionError as e:
    print("The app will exit. Please reopen the app and choose the correct folder.")
    time.sleep(3)
    sys.exit()

""" Variables for pkg20 """
# We want id, playerId, roadDist, x,y type, value
# which are in col index 2,3,4,5,6,14,16
index_to_keep = [2,3,4,5,6,14,16]
PATTERN = 21  # number of entries of one road line, which form a pattern
col_of_one = ["signId", "playerId", "roadDist","sign_X", "sign_Y", "type", "value"]

PLACE_HOLDER = 99.99

print(f"Found the csv file {f} in your folder. The process will start now.")

Found the csv file 2_24pkg20.csv in your folder. The process will start now.


In [23]:
""" read in the csv and preview."""
with open(FILE_PATH, 'r', encoding="utf-8") as csvfile:
    reader = csv.reader(csvfile)
    MAX_LEN = max(len(_) for _ in reader)

    
original_col_names = ["temp.{}".format(i) for i in range(MAX_LEN)]
df20 = pd.read_csv(FILE_PATH, names=original_col_names, low_memory=False)        

df20.drop(index=0, axis=0, inplace=True)
df20.index -= 1
df20.columns = [name.strip() for name in df20.columns]
      
print("Preview the raw csv. The correct column names are not read in because they may not be complete (long enough). Will be fixed later")
print(df20.head(5))

Preview the raw csv. The correct column names are not read in because they may not be complete (long enough). Will be fixed later
                    temp.0                   temp.1                   temp.2  \
0  +9.9999997764825821e-03                        2                       38   
1  +1.9999999552965164e-02                        3                       38   
2  +2.9999999329447746e-02                        4                       38   
3  +3.9999999105930328e-02                        5                       38   
4  +4.9999998882412910e-02                        6                       38   

                    temp.3                   temp.4                   temp.5  \
0                        1  +6.6892501831054688e+01  +6.8376832473271335e+01   
1                        1  +6.6892501831054688e+01  +6.8376832473271335e+01   
2                        1  +6.6892501831054688e+01  +6.8376832473271335e+01   
3                        1  +6.6892501831054688e+01  +6.8376832473271

In [24]:
""" Start pre-processing """
# only keep till the last hundreds of rows
# They are safe to and should be deleted because 100 rows <=> 1s in the simulation, and ending row is often incomplete
rows = df20.shape[0]
num_del = rows%100
df20.drop(labels=range(rows-num_del, rows), axis=0, inplace=True)

# Automatically define other const dependent on the dataframe
while (len(df20.columns)-2)%PATTERN != 0:
    # del that col
    print("Found an empty col at the end")
    df20.drop(df20.columns[-1], axis=1, inplace=True)
    
NUM_OBJS = (len(df20.columns)-2)//PATTERN
BEGIN_IDX = NUM_OBJS 
TARGET = 5 # we want 14
END_IDX = 5

# delete unused columns
cols_to_del = [name for i, name in enumerate(df20.columns) if (i%PATTERN not in index_to_keep and i!=1)]
df20 = df20.drop(columns=cols_to_del)


# Some other process
df20.fillna(PLACE_HOLDER, inplace=True)
df20.rename(columns={"temp.1":"simFrame"}, inplace=True)
df20["simFrame"] = df20["simFrame"].astype(np.int64)
df20.set_index("simFrame", inplace=True)
df20 = df20[~df20.index.duplicated(keep='first')]

# replace temp column names with column names we want
new_names = ["{}_{}".format(name,i) for i in range(NUM_OBJS) for name in col_of_one]
df20.columns = new_names


# format all id entries because they server important use later
print(f"Please check the following value count of id in each {col_of_one[0]} column")
print(f"All id values should either be a non-negative whole number (in int or float) or a place holder {PLACE_HOLDER}")
for i in range(NUM_OBJS):
    df20["{}_{}".format(col_of_one[0], i)] = df20["{}_{}".format(col_of_one[0], i)].apply(lambda x: int(x.strip()) \
                                                                                        if type(x)==str else x)
    df20["{}_{}".format(col_of_one[5], i)] = df20["{}_{}".format(col_of_one[5], i)].apply(lambda x: int(x.strip()) \
                                                                                        if type(x)==str else x)
    print( df20["{}_{}".format(col_of_one[5], i)].value_counts())
    
# append empty colunms to match designated number of lines reserved: 14
for i in range(BEGIN_IDX, END_IDX):
    for name in col_of_one:
        df20[name+'_'+str(i)] = PLACE_HOLDER

        
        
NUM_OBJS = END_IDX

print(df20.shape)
df20.tail(10)

Found an empty col at the end
Please check the following value count of id in each signId column
All id values should either be a non-negative whole number (in int or float) or a place holder 99.99
293    3040
350    1660
Name: type_0, dtype: int64
350    3040
293    1660
Name: type_1, dtype: int64
99.99     3851
293.00     466
350.00     383
Name: type_2, dtype: int64
99.99     4304
350.00     264
293.00     132
Name: type_3, dtype: int64
(4700, 35)


Unnamed: 0_level_0,signId_0,playerId_0,roadDist_0,sign_X_0,sign_Y_0,type_0,value_0,signId_1,playerId_1,roadDist_1,...,sign_Y_3,type_3,value_3,signId_4,playerId_4,roadDist_4,sign_X_4,sign_Y_4,type_4,value_4
simFrame,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4692,39,1,48.98404312133789,302.3744350482484,-1.0446907499186846,293,3.5,0,1,53.174140930175774,...,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99
4693,39,1,48.86061096191406,302.3744350482484,-1.0446907499186846,293,3.5,0,1,53.05087661743164,...,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99
4694,39,1,48.737201690673835,302.3744350482484,-1.0446907499186846,293,3.5,0,1,52.92763900756836,...,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99
4695,39,1,48.61382293701172,302.3744350482484,-1.0446907499186846,293,3.5,0,1,52.8044319152832,...,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99
4696,39,1,48.49047470092773,302.3744350482484,-1.0446907499186846,293,3.5,0,1,52.681255340576165,...,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99
4697,39,1,48.36717224121094,302.3744350482484,-1.0446907499186846,293,3.5,0,1,52.558120727539055,...,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99
4698,39,1,48.24391555786133,302.3744350482484,-1.0446907499186846,293,3.5,0,1,52.435035705566406,...,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99
4699,39,1,48.12071990966797,302.3744350482484,-1.0446907499186846,293,3.5,0,1,52.312007904052734,...,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99
4700,39,1,47.9975814819336,302.3744350482484,-1.0446907499186846,293,3.5,0,1,52.18904113769531,...,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99
4701,39,1,47.874507904052734,302.3744350482484,-1.0446907499186846,293,3.5,0,1,52.06613159179688,...,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99


In [25]:
"""
We have special processing requirement for pkg20.

"""
print("Sorting started. This may take several seconds up to several minutes, please be patient.")

row_nums = df20.shape[0]    # 4760 this time
jump = len(col_of_one)   # 2

for row in range(row_nums):
    old_row = df20.iloc[row]
    new_row = [ [PLACE_HOLDER]*jump ]*NUM_OBJS
    curr_spot = 1  # for ped cross to determine place
    
    """
    In each row, we want [Speed sign]+[ped cross]*n format
    """
    for idx in range(0, 0 + NUM_OBJS*jump, jump):   # 0, 14, 28, ...
        signId = old_row[idx]
        signType = int(old_row[idx+5])
        
        if signType == 293:
            # ped crossing case, several of them can be sensored at one given frame:
            new_row[curr_spot] = list(old_row[idx:idx+jump])
            curr_spot += 1
            
        elif signId == 0:
            # speed limit sign case
            # signId = int(signId)
            # align sign with signId=0 at first
            new_row[0] = list(old_row[idx:idx+jump])
            
    try:
    # sort the list inplace; we use first num of sublist to sort (signId), so don't need key=xxx
        new_row[1:] = sorted(new_row[1:])
    except TypeError as te:
        print(new_row)
        break
    # flat the list
    new_row = [item for sublist in new_row for item in sublist]
    df20.iloc[row] = new_row

# Check
for name in df20.columns:
    if "type" in name:
        assert len( dict(df20[name].value_counts()))<=2
    

# Change place holder back
PLACE_HOLDER = -99.99
print("Work complete. Proceed to the join of 5 processed csv.")

Sorting started. This may take several seconds up to several minutes, please be patient.
Work complete. Proceed to the join of 5 processed csv.


In [26]:
"""
Join all 5 processed csv together into a final_csv

The order is [df7, df9, df10, df5, df20]
"""

final_df = df7.copy(deep=True)

PKG_ID = [9,10,5,20]
to_join = [df9, df10, df5, df20]
dic = dict(zip(PKG_ID, to_join))
for key, df in dic.items():
    final_df = final_df.join(df, how='inner', rsuffix="_pkg{}".format(key))
    
# Double check, remove duplicate rows (duplicate simTime)
final_df = final_df[~final_df.index.duplicated(keep='first')]
print(f"Final csv complete, with {final_df.shape[0]} row and {final_df.shape[1]} col. Have a preview", end='\n')

print("\n Data process done, ready to send to server with socket.")
final_df.head(10)

Final csv complete, with 4700 row and 306 col. Have a preview

 Data process done, ready to send to server with socket.


Unnamed: 0_level_0,simTime,id_0,lateralDist_0,type_0,color_0,id_1,lateralDist_1,type_1,color_1,id_2,...,sign_Y_3,type_3_pkg20,value_3,signId_4,playerId_4_pkg20,roadDist_4,sign_X_4,sign_Y_4,type_4_pkg20,value_4
simFrame,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.009999999776482582,0,1.7746027708053589,2,1,-99.99,-99.99,-99.99,-99.99,2.0,...,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99
3,0.019999999552965164,0,1.7746027708053589,2,1,-99.99,-99.99,-99.99,-99.99,2.0,...,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99
4,0.029999999329447743,0,1.7746027708053589,2,1,-99.99,-99.99,-99.99,-99.99,2.0,...,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99
5,0.03999999910593033,0,1.7746027708053589,2,1,-99.99,-99.99,-99.99,-99.99,2.0,...,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99
6,0.04999999888241291,0,1.7746027708053589,2,1,-99.99,-99.99,-99.99,-99.99,2.0,...,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99
7,0.059999998658895486,0,1.7746027708053589,2,1,-99.99,-99.99,-99.99,-99.99,2.0,...,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99
8,0.06999999843537807,0,1.7746027708053589,2,1,-99.99,-99.99,-99.99,-99.99,2.0,...,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99
9,0.07999999821186066,0,1.7746027708053589,2,1,-99.99,-99.99,-99.99,-99.99,2.0,...,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99
10,0.08999999798834325,0,1.7746027708053589,2,1,-99.99,-99.99,-99.99,-99.99,2.0,...,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99
11,0.09999999776482582,0,1.7746027708053589,2,1,-99.99,-99.99,-99.99,-99.99,2.0,...,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99,99.99


In [27]:
"""
Use socket to send the final csv from client (this computer) to server (another computer)
"""
# save the csv locally first
filepath = eg.filesavebox(title="Choose a folder to save the final csv locally.", default="../")
final_df.to_csv(filepath)
print("final csv has been saved as {}".format(filepath))

final csv has been saved as /Users/thomas/Desktop/data_process/result/result_0819/2-24_final.csv


In [None]:
    
print("Client starting......")

# Set up the client
ip = eg.enterbox(
    msg="Please acquire the ip address of server. \n Check with command 'ifconfig' in terminal.",
    title= "Input IP Address of Server",
    default= "169.254.4.254"
)
# ip = "169.254.4.254"
port = 3636
client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

# build the connection
try:
    client.connect((ip, port))
    
    msg = client.recv(1024)
    
    print(msg.decode('utf-8'))
except Exception as e:
    print("Connection failed: ", e)
else:
    
    # get file size and name
    filename = filepath.split("/")[-1]
    filesize = os.path.getsize(filepath)
    print("Send {} with filesize {} MBs".format(filename, filesize/1024/1024))
    
    # send size and name first
    client.send(filename.encode())
    time.sleep(1)
    client.send(filesize.to_bytes(filesize.bit_length(), byteorder='big'))
    time.sleep(1)
    
    # Start to send data
    print("Start to send data")
    try:
        start_t = time.time()
        curr_t = time.time()
        with open(filepath, 'rb') as f:
            size = 0
            while 1:
                f_data = f.read(1024)
                
                if f_data:
                    # data transfer not finished yet
                    client.send(f_data)
                    size += len(f_data)
                    if time.time() - start_t == 0:
                        time.sleep(0.5)
                    speed = size/(time.time() - start_t)
                    if time.time() - curr_t >= 0.5:
                        curr_t = time.time()
                        print("Uploading: {}% complete, speed: {} MB/s". format(size/filesize*100, float(size/1024/1024)))
                else:
                    # data transfer complete
                    print("Uploading {} complete".format(filename))
                    break
    except Exception as e:
        print("reading or sending file caused error: \n", e)
finally:
    client.close()
    print("App finished running.")