In [None]:
"""
FILE NAME: big_app.ipynb

This is the integrated, web-based app designed to process the raw data (in csv format)
generated by the Virtual Test Drive (by VIRES company) RDB Sniffer runtime tool.

Structure:
1. Setup, get working dir by interacting with the user
2. do data process for each raw csv (representing data from one package)
3. ask for an output dir and join those csv (currently 5) to a big csv
4. setup socket config and send the big csv to a remote server (another computer)
"""
# Startup banner so the user can see that the app has begun running.
print("Application Starting")

In [None]:
# Setup voila, which turns code into app
# The two shell commands below enable the widgetsnbextension notebook extension and the
# voila server extension. The surrounding prints tell the user that the console output
# produced by those commands can be ignored.
print("Please ignore following lines, from here...")

!jupyter nbextension enable --py widgetsnbextension --sys-prefix
!jupyter serverextension enable voila --sys-prefix

print("to here. The app is setting up its configuration")

In [None]:
# import necessary modules
print("importing necessary python packages.")
# These are built-in packages in std lib
import sys
import csv
import os
import time
import threading
import socket
# Third-party packages: imported inside a try block so that a missing package produces
# an installation hint and a clean exit.
try:
    import pandas as pd
    import numpy as np
    import easygui as eg
except Exception as e:
    # Show the underlying error first, then explain how to install what is missing
    print(e)
    print("At least one of the python packages of [pandas, numpy, easygui] is not installed.")
    print("Please use command 'pip install <package-name>' to install your missing package.")
    sys.exit()
else:
    print("sucessfully importing packages")

In [None]:
"""
Set up source dir (storing raw csv) on user's choice
"""

# Ask the user for the folder holding the raw csv files. Later cells build every
# file path from DATA_DIR.
DATA_DIR = eg.diropenbox(title="Choose the folder with the 5 raw csv.", default="../")

# easygui returns None when the dialog is cancelled or closed. os.listdir(None) would then
# silently list the current directory and the path joins in later cells would fail, so
# stop here with the same exit sequence used by the file lookups.
if DATA_DIR is None:
    print("No folder was chosen.")
    print("The app will exit. Please reopen the app and choose the correct folder.")
    time.sleep(3)
    sys.exit()

# The lookups in the following cells match names ending in exactly "pkg5.csv", "pkg7.csv",
# "pkg9.csv", "pkg10.csv" and "pkg20.csv" (no underscore), so the hint spells them that way.
print("Make sure that the following files have 5 .csv files ending in pkg5, pkg7, pkg9, pkg10, pkg20 accordingly.")
print(os.listdir(DATA_DIR))

In [None]:
""" Global const """
# Sentinel written wherever data is missing: used to fill NaN cells, to pad the reserved
# but unused object slots, and compared against when ids are checked.
PLACE_HOLDER = -99.99

In [None]:
"""
The following chunks of code process pkg5 data
"""
# Locate the raw pkg5 export: the first entry of DATA_DIR whose name ends with "pkg5.csv"
file_found = False
for f in os.listdir(DATA_DIR):
    if f.endswith("pkg5.csv"):
        FILE_PATH = os.path.join(DATA_DIR, f)
        file_found = True
        break

# Explicit check instead of `assert`: assertions are stripped when Python runs with -O,
# and the reason for exiting should actually be shown to the user.
if not file_found:
    print("There is no file ending with pkg5.csv in the folder you choose")
    print("The app will exit. Please reopen the app and choose the correct folder.")
    time.sleep(3)
    sys.exit()

# Variables for pkg5
# The pre-processing cell keeps a column when (column index % PATTERN) is listed in
# index_to_keep. The kept columns of each object are then named, in column order,
# by col_of_one.
index_to_keep = [2,4,6,7,9]
PATTERN = 12  # number of raw csv columns per object, which repeat as a pattern
col_of_one = ["playerId", "laneId","roadS", "roadT", "hdgRel"]

print(f"Found the csv file {f} in your folder. The process will start now.")

In [None]:
""" read in the csv and preview."""
# Scan the file once with the csv module to learn how many fields its widest row has.
# pandas is then given that many placeholder column names so every row fits.
with open(FILE_PATH, 'r', encoding="utf-8") as raw_file:
    MAX_LEN = max(map(len, csv.reader(raw_file)))

original_col_names = [f"temp.{position}" for position in range(MAX_LEN)]
df5 = pd.read_csv(FILE_PATH, names=original_col_names, low_memory=False)

# Because explicit names were supplied, the file's own header line was read as data row 0:
# discard it and shift the row labels down so the data starts at label 0 again.
df5 = df5.drop(index=0)
df5.index = df5.index - 1
df5 = df5.rename(columns=str.strip)

print("Preview the raw csv. The correct column names are not read in because they may not be complete (long enough). Will be fixed later")
print(df5.head(5))

In [None]:
""" Start pre-processing """
# Trim the tail so that the row count is a whole multiple of 100 (100 rows <=> 1s in the
# simulation). The ending rows are often incomplete, so they are safe to discard.
rows = df5.shape[0]
num_del = rows % 100
df5 = df5.drop(labels=range(rows - num_del, rows), axis=0)

# Apart from the two leading columns, the columns must split into whole groups of PATTERN.
# Peel surplus columns off the right-hand side until they do.
while (len(df5.columns) - 2) % PATTERN:
    print("Found an empty col at the end")
    df5 = df5.drop(columns=df5.columns[-1])

NUM_OBJS = (len(df5.columns) - 2) // PATTERN   # object chunks present in the raw file
BEGIN_IDX = NUM_OBJS                           # first slot that has to be padded
TARGET = 10
END_IDX = 10                                   # number of slots reserved per row

# Column 1 always stays. Any other column stays only when its position modulo PATTERN
# is listed in index_to_keep.
cols_to_del = []
for position, label in enumerate(df5.columns):
    if position != 1 and position % PATTERN not in index_to_keep:
        cols_to_del.append(label)
df5 = df5.drop(columns=cols_to_del)

# Fill the gaps with the placeholder, turn column 1 into an integer "simFrame" index and
# keep only the first row of every duplicated frame number.
df5 = df5.fillna(PLACE_HOLDER).rename(columns={"temp.1": "simFrame"})
df5["simFrame"] = df5["simFrame"].astype(np.int64)
df5 = df5.set_index("simFrame")
df5 = df5[~df5.index.duplicated(keep='first')]

# Swap the temporary column names for "<field>_<slot>" names
new_names = [f"{field}_{slot}" for slot in range(NUM_OBJS) for field in col_of_one]
df5.columns = new_names

# Ids drive the later re-ordering, so textual ids are converted to int here
print(f"Please check the following value count of id in each {col_of_one[0]} column")
print(f"All id values should either be a non-negative whole number (in int or float) or a place holder {PLACE_HOLDER}")
for slot in range(NUM_OBJS):
    id_label = f"{col_of_one[0]}_{slot}"
    df5[id_label] = df5[id_label].apply(lambda x: int(x.strip()) if type(x) is str else x)
    print( df5[id_label].value_counts())

# Pad with placeholder-only columns until END_IDX slots exist
for slot in range(BEGIN_IDX, END_IDX):
    for field in col_of_one:
        df5[f"{field}_{slot}"] = PLACE_HOLDER

NUM_OBJS = END_IDX

In [None]:
# Show the first 10 rows of the pre-processed pkg5 frame for a visual check
df5.head(10)

In [None]:
"""
MOST important task: Standardization.

Details explained in the file doc
  
!!! May take quite a long time
"""
print("Sorting started. This may take several seconds up to several minutes, please be patient.")

# Standardization: within every row, move the chunk of the object with id k into slot k
# (columns <name>_k). Slots without a matching object hold PLACE_HOLDER values.
row_nums = df5.shape[0]    # number of rows to re-arrange
jump = len(col_of_one)     # number of columns that make up one object chunk

for row in range(row_nums):
    old_row = df5.iloc[row]
    # Every slot starts as the same placeholder chunk. Slots are only ever replaced as a
    # whole (never mutated), so sharing one inner list between them is harmless.
    new_row = [ [PLACE_HOLDER]*jump ]*NUM_OBJS

    # Visit the first column (the id) of every chunk. When several chunks carry the same
    # id, the last one in the row wins.
    for idx in range(0, NUM_OBJS*jump, jump):
        # .iloc makes the access explicitly positional. A bare old_row[idx] on this
        # string-labelled Series relies on pandas' deprecated integer-position fallback.
        objId = old_row.iloc[idx]
        if objId == PLACE_HOLDER or objId >= NUM_OBJS or objId < 0:
            # Placeholder id, or an id without a reserved slot: leave the chunk out
            continue
        else:
            objId = int(objId)
            # The id is used directly as the slot number (id 0 -> slot 0, id 4 -> slot 4)
            new_row[objId] = old_row.iloc[idx:idx+jump]

    # flatten the list of chunks back into a single row
    new_row = [item for sublist in new_row for item in sublist]
    df5.iloc[row] = new_row

# Check after the sort: every id column may only hold its own slot number or the placeholder
for i in range(END_IDX):
    row_count = dict(df5["{}_{}".format(col_of_one[0], i)].value_counts())
    for key in row_count:
        assert key==PLACE_HOLDER or key==i, f"column {col_of_one[0]}.{i} \
                has wrong id value other than {PLACE_HOLDER} and {i}"
    assert len(row_count) <= 2


print("Work complete. Proceed to csv of pkg7.")

In [None]:
"""
The following chunks of code process pkg7 data
"""

# Locate the raw pkg7 export: the first entry of DATA_DIR whose name ends with "pkg7.csv"
file_found = False
for f in os.listdir(DATA_DIR):
    if f.endswith("pkg7.csv"):
        FILE_PATH = os.path.join(DATA_DIR, f)
        file_found = True
        break

# Explicit check instead of `assert`: assertions are stripped when Python runs with -O,
# and the reason for exiting should actually be shown to the user.
if not file_found:
    print("There is no file ending with pkg7.csv in the folder you choose")
    print("The app will exit. Please reopen the app and choose the correct folder.")
    time.sleep(3)
    sys.exit()

# Variables for pkg7
# The pre-processing cell keeps a column when (column index % PATTERN) is listed in
# index_to_keep. With PATTERN = 16 that selects columns 3, 6, 16, 17 of the first chunk,
# which are named id, lateralDist, type, color in that order. Column 0 also matches
# (0 % 16 == 0) and is kept; the pre-processing cell names it simTime.
index_to_keep = [0,1,3,6]
PATTERN = 16  # number of entries of one road line, which form a pattern
col_of_one = ["id", "lateralDist", "type", "color"]

print(f"Found the csv file {f} in your folder. The process will start now.")

In [None]:
""" read in the csv and preview."""
# Scan the file once with the csv module to learn how many fields its widest row has.
# pandas is then given that many placeholder column names so every row fits.
with open(FILE_PATH, 'r', encoding="utf-8") as raw_file:
    MAX_LEN = max(map(len, csv.reader(raw_file)))

original_col_names = [f"temp.{position}" for position in range(MAX_LEN)]
df7 = pd.read_csv(FILE_PATH, names=original_col_names, low_memory=False)

# Because explicit names were supplied, the file's own header line was read as data row 0:
# discard it and shift the row labels down so the data starts at label 0 again.
df7 = df7.drop(index=0)
df7.index = df7.index - 1
df7 = df7.rename(columns=str.strip)

print("Preview the raw csv. The correct column names are not read in because they may not be complete (long enough). Will be fixed later")
print(df7.head(5))

In [None]:
""" Start pre-processing """
# Trim the tail so that the row count is a whole multiple of 100 (100 rows <=> 1s in the
# simulation). The ending rows are often incomplete, so they are safe to discard.
rows = df7.shape[0]
num_del = rows % 100
df7 = df7.drop(labels=range(rows - num_del, rows), axis=0)

# Apart from the two leading columns, the columns must split into whole groups of PATTERN.
# Peel surplus columns off the right-hand side until they do.
while (len(df7.columns) - 2) % PATTERN:
    print("Found an empty col at the end")
    df7 = df7.drop(columns=df7.columns[-1])

NUM_OBJS = (len(df7.columns) - 2) // PATTERN   # object chunks present in the raw file
BEGIN_IDX = NUM_OBJS                           # first slot that has to be padded
TARGET = 10
END_IDX = 10                                   # number of slots reserved per row

# Column 1 always stays. Any other column stays only when its position modulo PATTERN
# is listed in index_to_keep.
cols_to_del = []
for position, label in enumerate(df7.columns):
    if position != 1 and position % PATTERN not in index_to_keep:
        cols_to_del.append(label)
df7 = df7.drop(columns=cols_to_del)

# Fill the gaps with the placeholder, turn column 1 into an integer "simFrame" index and
# keep only the first row of every duplicated frame number.
df7 = df7.fillna(PLACE_HOLDER).rename(columns={"temp.1": "simFrame"})
df7["simFrame"] = df7["simFrame"].astype(np.int64)
df7 = df7.set_index("simFrame")
df7 = df7[~df7.index.duplicated(keep='first')]

# Swap the temporary names for 'simTime' followed by "<field>_<slot>" names
new_names = ['simTime'] + [f"{field}_{slot}" for slot in range(NUM_OBJS) for field in col_of_one]
df7.columns = new_names

# Ids drive the later re-ordering, so textual ids are converted to int here
print(f"Please check the following value count of id in each {col_of_one[0]} column")
print(f"All id values should either be a non-negative whole number (in int or float) or a place holder {PLACE_HOLDER}")
for slot in range(NUM_OBJS):
    id_label = f"{col_of_one[0]}_{slot}"
    df7[id_label] = df7[id_label].apply(lambda x: int(x.strip()) if type(x) is str else x)
    print( df7[id_label].value_counts())

# Pad with placeholder-only columns until END_IDX slots exist
for slot in range(BEGIN_IDX, END_IDX):
    for field in col_of_one:
        df7[f"{field}_{slot}"] = PLACE_HOLDER

NUM_OBJS = END_IDX

In [None]:
"""
MOST important task: Standardization.

Details explained in the file doc
  
!!! May take quite a long time
"""
print("Sorting started. This may take several seconds up to several minutes, please be patient.")

# Standardization: within every row, move the chunk of the object with id k into slot k
# (columns <name>_k). Slots without a matching object hold PLACE_HOLDER values.
# Column 0 of df7 (simTime) is not part of any chunk and is left untouched.
row_nums = df7.shape[0]    # number of rows to re-arrange
jump = len(col_of_one)     # number of columns that make up one object chunk

for row in range(row_nums):
    # Skip the leading simTime value so that position 0 is the first chunk's id
    old_row = df7.iloc[row].iloc[1:]
    # Every slot starts as the same placeholder chunk. Slots are only ever replaced as a
    # whole (never mutated), so sharing one inner list between them is harmless.
    new_row = [ [PLACE_HOLDER]*jump ]*NUM_OBJS

    # Visit the first column (the id) of every chunk. When several chunks carry the same
    # id, the last one in the row wins.
    for idx in range(0, NUM_OBJS*jump, jump):
        # .iloc makes the access explicitly positional. A bare old_row[idx] on this
        # string-labelled Series relies on pandas' deprecated integer-position fallback.
        objId = old_row.iloc[idx]
        if objId == PLACE_HOLDER or objId >= NUM_OBJS or objId < 0:
            # Placeholder id, or an id without a reserved slot: leave the chunk out
            continue
        else:
            objId = int(objId)
            # The id is used directly as the slot number (id 0 -> slot 0, id 4 -> slot 4)
            new_row[objId] = old_row.iloc[idx:idx+jump]

    # flatten the list of chunks back into a single row and write it after simTime
    new_row = [item for sublist in new_row for item in sublist]
    df7.iloc[row, 1:] = new_row

# Check after the sort: every id column may only hold its own slot number or the placeholder
for i in range(END_IDX):
    row_count = dict(df7["{}_{}".format(col_of_one[0], i)].value_counts())
    for key in row_count:
        assert key==PLACE_HOLDER or key==i, f"column {col_of_one[0]}.{i} \
                has wrong id value other than {PLACE_HOLDER} and {i}"
    assert len(row_count) <= 2


print("Work complete. Proceed to csv of pkg9.")

In [None]:
"""
The following chunks of code process pkg9 data
"""

# Locate the raw pkg9 export: the first entry of DATA_DIR whose name ends with "pkg9.csv"
file_found = False
for f in os.listdir(DATA_DIR):
    if f.endswith("pkg9.csv"):
        FILE_PATH = os.path.join(DATA_DIR, f)
        file_found = True
        break

# Explicit check instead of `assert`: assertions are stripped when Python runs with -O,
# and the reason for exiting should actually be shown to the user.
if not file_found:
    print("There is no file ending with pkg9.csv in the folder you choose")
    print("The app will exit. Please reopen the app and choose the correct folder.")
    time.sleep(3)
    sys.exit()

# Variables for pkg9
# We want objectId, X, Y, yaw_angle(h after X), Vx, Vy, Ax, Ay as well as dimXYZ, offXYZ for all cars
# col index see below (*range(7,13) is 7~12, dimXYZ and offXYZ)
# The pre-processing cell keeps a column when (column index % PATTERN) is listed here.
index_to_keep = [2, 4, *range(7,13), 13, 14, 16, 22, 23, 31, 32] # remainders of column index % PATTERN
PATTERN = 39  # number of raw csv columns per object, which repeat as a pattern
col_of_one = ["objectId", 'obj_type','dimX','dimY','dimZ','offX','offY','offZ', \
              "X", "Y", "yaw_angle", "Vx","Vy", "Ax", "Ay"]

print(f"Found the csv file {f} in your folder. The process will start now.")

In [None]:
""" read in the csv and preview."""
# Scan the file once with the csv module to learn how many fields its widest row has.
# pandas is then given that many placeholder column names so every row fits.
with open(FILE_PATH, 'r', encoding="utf-8") as raw_file:
    MAX_LEN = max(map(len, csv.reader(raw_file)))

original_col_names = [f"temp.{position}" for position in range(MAX_LEN)]
df9 = pd.read_csv(FILE_PATH, names=original_col_names, low_memory=False)

# Because explicit names were supplied, the file's own header line was read as data row 0:
# discard it and shift the row labels down so the data starts at label 0 again.
df9 = df9.drop(index=0)
df9.index = df9.index - 1
df9 = df9.rename(columns=str.strip)

print("Preview the raw csv. The correct column names are not read in because they may not be complete (long enough). Will be fixed later")
print(df9.head(5))

In [None]:
""" Start pre-processing """
# Trim the tail so that the row count is a whole multiple of 100 (100 rows <=> 1s in the
# simulation). The ending rows are often incomplete, so they are safe to discard.
rows = df9.shape[0]
num_del = rows % 100
df9 = df9.drop(labels=range(rows - num_del, rows), axis=0)

# Apart from the two leading columns, the columns must split into whole groups of PATTERN.
# Peel surplus columns off the right-hand side until they do.
while (len(df9.columns) - 2) % PATTERN:
    print("Found an empty col at the end")
    df9 = df9.drop(columns=df9.columns[-1])

NUM_OBJS = (len(df9.columns) - 2) // PATTERN   # object chunks present in the raw file
BEGIN_IDX = NUM_OBJS                           # first slot that has to be padded
TARGET = 10
END_IDX = 10                                   # number of slots reserved per row

# Column 1 always stays. Any other column stays only when its position modulo PATTERN
# is listed in index_to_keep.
cols_to_del = []
for position, label in enumerate(df9.columns):
    if position != 1 and position % PATTERN not in index_to_keep:
        cols_to_del.append(label)
df9 = df9.drop(columns=cols_to_del)

# Fill the gaps with the placeholder, turn column 1 into an integer "simFrame" index and
# keep only the first row of every duplicated frame number.
df9 = df9.fillna(PLACE_HOLDER).rename(columns={"temp.1": "simFrame"})
df9["simFrame"] = df9["simFrame"].astype(np.int64)
df9 = df9.set_index("simFrame")
df9 = df9[~df9.index.duplicated(keep='first')]

# Swap the temporary column names for "<field>_<slot>" names
new_names = [f"{field}_{slot}" for slot in range(NUM_OBJS) for field in col_of_one]
df9.columns = new_names

# Ids drive the later re-ordering, so textual ids are converted to int here
print(f"Please check the following value count of id in each {col_of_one[0]} column")
print(f"All id values should either be a non-negative whole number (in int or float) or a place holder {PLACE_HOLDER}")
for slot in range(NUM_OBJS):
    id_label = f"{col_of_one[0]}_{slot}"
    df9[id_label] = df9[id_label].apply(lambda x: int(x.strip()) if type(x) is str else x)
    print( df9[id_label].value_counts())

# Pad with placeholder-only columns until END_IDX slots exist
for slot in range(BEGIN_IDX, END_IDX):
    for field in col_of_one:
        df9[f"{field}_{slot}"] = PLACE_HOLDER

NUM_OBJS = END_IDX

In [None]:
"""
MOST important task: Standardization.

Details explained in the file doc
  
!!! May take quite a long time
"""
print("Sorting started. This may take several seconds up to several minutes, please be patient.")

# Standardization: within every row, move the chunk of the object with id k into slot k
# (columns <name>_k). Slots without a matching object hold PLACE_HOLDER values.
row_nums = df9.shape[0]    # number of rows to re-arrange
jump = len(col_of_one)     # number of columns that make up one object chunk

for row in range(row_nums):
    old_row = df9.iloc[row]
    # Every slot starts as the same placeholder chunk. Slots are only ever replaced as a
    # whole (never mutated), so sharing one inner list between them is harmless.
    new_row = [ [PLACE_HOLDER]*jump ]*NUM_OBJS

    # Visit the first column (the id) of every chunk. When several chunks carry the same
    # id, the last one in the row wins.
    for idx in range(0, NUM_OBJS*jump, jump):
        # .iloc makes the access explicitly positional. A bare old_row[idx] on this
        # string-labelled Series relies on pandas' deprecated integer-position fallback.
        objId = old_row.iloc[idx]
        if objId == PLACE_HOLDER or objId >= NUM_OBJS or objId < 0:
            # Placeholder id, or an id without a reserved slot: leave the chunk out
            continue
        else:
            objId = int(objId)
            # The id is used directly as the slot number (id 0 -> slot 0, id 4 -> slot 4)
            new_row[objId] = old_row.iloc[idx:idx+jump]

    # flatten the list of chunks back into a single row
    new_row = [item for sublist in new_row for item in sublist]
    df9.iloc[row] = new_row

# Check after the sort: every id column may only hold its own slot number or the placeholder
for i in range(END_IDX):
    row_count = dict(df9["{}_{}".format(col_of_one[0], i)].value_counts())
    for key in row_count:
        assert key==PLACE_HOLDER or key==i, f"column {col_of_one[0]}.{i} \
                has wrong id value other than {PLACE_HOLDER} and {i}"
    assert len(row_count) <= 2


print("Work complete. Proceed to csv of pkg10.")

In [None]:
"""
The following chunks of code process pkg10 data
"""

# Locate the raw pkg10 export: the first entry of DATA_DIR whose name ends with "pkg10.csv"
file_found = False
for f in os.listdir(DATA_DIR):
    if f.endswith("pkg10.csv"):
        FILE_PATH = os.path.join(DATA_DIR, f)
        file_found = True
        break

# Explicit check instead of `assert`: assertions are stripped when Python runs with -O,
# and the reason for exiting should actually be shown to the user.
if not file_found:
    print("There is no file ending with pkg10.csv in the folder you choose")
    print("The app will exit. Please reopen the app and choose the correct folder.")
    time.sleep(3)
    sys.exit()

# Variables for pkg10
# We want playerId, lightMask, steering.
# The pre-processing cell keeps a column when (column index % PATTERN) is listed in
# index_to_keep. With PATTERN = 4 that selects columns 2, 3, 4 of the first chunk
# (column 0 is dropped explicitly there).
index_to_keep = [2,3,0]
PATTERN = 4  # number of entries of one obj, which form a pattern
col_of_one = ["playerId", "lightMask", "steering"]

print(f"Found the csv file {f} in your folder. The process will start now.")

In [None]:
""" read in the csv and preview."""
# Scan the file once with the csv module to learn how many fields its widest row has.
# pandas is then given that many placeholder column names so every row fits.
with open(FILE_PATH, 'r', encoding="utf-8") as raw_file:
    MAX_LEN = max(map(len, csv.reader(raw_file)))

original_col_names = [f"temp.{position}" for position in range(MAX_LEN)]
df10 = pd.read_csv(FILE_PATH, names=original_col_names, low_memory=False)

# Because explicit names were supplied, the file's own header line was read as data row 0:
# discard it and shift the row labels down so the data starts at label 0 again.
df10 = df10.drop(index=0)
df10.index = df10.index - 1
df10 = df10.rename(columns=str.strip)

print("Preview the raw csv. The correct column names are not read in because they may not be complete (long enough). Will be fixed later")
print(df10.head(5))

In [None]:
""" Start pre-processing """
# Trim the tail so that the row count is a whole multiple of 100 (100 rows <=> 1s in the
# simulation). The ending rows are often incomplete, so they are safe to discard.
rows = df10.shape[0]
num_del = rows % 100
df10 = df10.drop(labels=range(rows - num_del, rows), axis=0)

# Apart from the two leading columns, the columns must split into whole groups of PATTERN.
# Peel surplus columns off the right-hand side until they do.
while (len(df10.columns) - 2) % PATTERN:
    print("Found an empty col at the end")
    df10 = df10.drop(columns=df10.columns[-1])

NUM_OBJS = (len(df10.columns) - 2) // PATTERN   # object chunks present in the raw file
BEGIN_IDX = NUM_OBJS                            # first slot that has to be padded
TARGET = 10
END_IDX = 10                                    # number of slots reserved per row

# Column 0 always goes and column 1 always stays. Any other column stays only when its
# position modulo PATTERN is listed in index_to_keep.
cols_to_del = []
for position, label in enumerate(df10.columns):
    if position == 0 or (position != 1 and position % PATTERN not in index_to_keep):
        cols_to_del.append(label)
df10 = df10.drop(columns=cols_to_del)

# Fill the gaps with the placeholder, turn column 1 into an integer "simFrame" index and
# keep only the first row of every duplicated frame number.
df10 = df10.fillna(PLACE_HOLDER).rename(columns={"temp.1": "simFrame"})
df10["simFrame"] = df10["simFrame"].astype(np.int64)
df10 = df10.set_index("simFrame")
df10 = df10[~df10.index.duplicated(keep='first')]

# Swap the temporary column names for "<field>_<slot>" names
new_names = [f"{field}_{slot}" for slot in range(NUM_OBJS) for field in col_of_one]
df10.columns = new_names

# Ids drive the later re-ordering, so textual ids are converted to int here
print(f"Please check the following value count of id in each {col_of_one[0]} column")
print(f"All id values should either be a non-negative whole number (in int or float) or a place holder {PLACE_HOLDER}")
for slot in range(NUM_OBJS):
    id_label = f"{col_of_one[0]}_{slot}"
    df10[id_label] = df10[id_label].apply(lambda x: int(x.strip()) if type(x) is str else x)
    print( df10[id_label].value_counts())

# Pad with placeholder-only columns until END_IDX slots exist
for slot in range(BEGIN_IDX, END_IDX):
    for field in col_of_one:
        df10[f"{field}_{slot}"] = PLACE_HOLDER

NUM_OBJS = END_IDX

In [None]:
# Show only the first 10 rows of the pre-processed pkg10 frame (same preview size as the
# pkg5 cell) instead of rendering the whole frame.
df10.head(10)

In [None]:
"""
MOST important task: Standardization.

Details explained in the file doc
  
!!! May take quite a long time
"""
print("Sorting started. This may take several seconds up to several minutes, please be patient.")

# Standardization: within every row, move the chunk of the object with id k into slot k
# (columns <name>_k). Slots without a matching object hold PLACE_HOLDER values.
row_nums = df10.shape[0]    # number of rows to re-arrange
jump = len(col_of_one)      # number of columns that make up one object chunk

for row in range(row_nums):
    old_row = df10.iloc[row]
    # Every slot starts as the same placeholder chunk. Slots are only ever replaced as a
    # whole (never mutated), so sharing one inner list between them is harmless.
    new_row = [ [PLACE_HOLDER]*jump ]*NUM_OBJS

    # Visit the first column (the id) of every chunk. When several chunks carry the same
    # id, the last one in the row wins.
    for idx in range(0, NUM_OBJS*jump, jump):
        # .iloc makes the access explicitly positional. A bare old_row[idx] on this
        # string-labelled Series relies on pandas' deprecated integer-position fallback.
        objId = old_row.iloc[idx]
        if objId == PLACE_HOLDER or objId >= NUM_OBJS or objId < 0:
            # Placeholder id, or an id without a reserved slot: leave the chunk out
            continue
        else:
            objId = int(objId)
            # The id is used directly as the slot number (id 0 -> slot 0, id 4 -> slot 4)
            new_row[objId] = old_row.iloc[idx:idx+jump]

    # flatten the list of chunks back into a single row
    new_row = [item for sublist in new_row for item in sublist]
    df10.iloc[row] = new_row

# Check after the sort: every id column may only hold its own slot number or the placeholder
for i in range(END_IDX):
    row_count = dict(df10["{}_{}".format(col_of_one[0], i)].value_counts())
    for key in row_count:
        assert key==PLACE_HOLDER or key==i, f"column {col_of_one[0]}.{i} \
                has wrong id value other than {PLACE_HOLDER} and {i}"
    assert len(row_count) <= 2


# pkg10 is finished at this point, so the next package announced is pkg20
print("Work complete. Proceed to csv of pkg20.")

In [None]:
"""
The following chunks of code process pkg20 data
"""

# Locate the raw pkg20 export: the first entry of DATA_DIR whose name ends with "pkg20.csv"
file_found = False
for f in os.listdir(DATA_DIR):
    if f.endswith("pkg20.csv"):
        FILE_PATH = os.path.join(DATA_DIR, f)
        file_found = True
        break

# Explicit check instead of `assert`: assertions are stripped when Python runs with -O,
# and the reason for exiting should actually be shown to the user.
if not file_found:
    print("There is no file ending with pkg20.csv in the folder you choose")
    print("The app will exit. Please reopen the app and choose the correct folder.")
    time.sleep(3)
    sys.exit()

# Variables for pkg20
# We want id, roadDist, value.
# The pre-processing cell keeps a column when (column index % PATTERN) is listed in
# index_to_keep, i.e. columns 2, 4, 16 of the first chunk.
index_to_keep = [2,4,16]
PATTERN = 21  # number of raw csv columns per sign entry, which repeat as a pattern
col_of_one = ["signId", "roadDist", "value"]

print(f"Found the csv file {f} in your folder. The process will start now.")

In [None]:
""" read in the csv and preview."""
# Scan the file once with the csv module to learn how many fields its widest row has.
# pandas is then given that many placeholder column names so every row fits.
with open(FILE_PATH, 'r', encoding="utf-8") as raw_file:
    MAX_LEN = max(map(len, csv.reader(raw_file)))

original_col_names = [f"temp.{position}" for position in range(MAX_LEN)]
df20 = pd.read_csv(FILE_PATH, names=original_col_names, low_memory=False)

# Because explicit names were supplied, the file's own header line was read as data row 0:
# discard it and shift the row labels down so the data starts at label 0 again.
df20 = df20.drop(index=0)
df20.index = df20.index - 1
df20 = df20.rename(columns=str.strip)

print("Preview the raw csv. The correct column names are not read in because they may not be complete (long enough). Will be fixed later")
print(df20.head(5))

In [None]:
""" Start pre-processing """
# Trim the tail so that the row count is a whole multiple of 100 (100 rows <=> 1s in the
# simulation). The ending rows are often incomplete, so they are safe to discard.
rows = df20.shape[0]
num_del = rows%100
df20.drop(labels=range(rows-num_del, rows), axis=0, inplace=True)

# Apart from the two leading columns, the columns must split into whole groups of PATTERN.
# Drop surplus columns from the right-hand side until they do.
while (len(df20.columns)-2)%PATTERN != 0:
    print("Found an empty col at the end")
    df20.drop(df20.columns[-1], axis=1, inplace=True)

NUM_OBJS = (len(df20.columns)-2)//PATTERN   # sign entries present per row in the raw file
BEGIN_IDX = NUM_OBJS
TARGET = 10
END_IDX = 10

# delete unused columns: column 1 stays, others stay only if (index % PATTERN) is wanted
cols_to_del = [name for i, name in enumerate(df20.columns) if (i%PATTERN not in index_to_keep and i!=1)]
df20 = df20.drop(columns=cols_to_del)


# Fill gaps, turn column 1 into the integer simFrame index, keep the first of duplicates
df20.fillna(PLACE_HOLDER, inplace=True)
df20.rename(columns={"temp.1":"simFrame"}, inplace=True)
df20["simFrame"] = df20["simFrame"].astype(np.int64)
df20.set_index("simFrame", inplace=True)
df20 = df20[~df20.index.duplicated(keep='first')]

# replace temp column names with column names we want
new_names = ["{}_{}".format(name,i) for i in range(NUM_OBJS) for name in col_of_one]
df20.columns = new_names

# We have a special processing requirement for pkg20.
#
# Only data with playerId == 0 should be kept, so the sorting step (which aligns all
# objects in each row based on id values) is skipped. Instead, each row is checked and the
# first section with id value == 0 is picked, or the place holder is used otherwise.
#
# Furthermore, the original 'playerId', which tells which player detected this sign, is
# renamed to signId. Traffic signs aren't given an id in the VTD design.
id_col = []
roadDist_col = []
value_col = []
row_nums = df20.shape[0]

if not list(df20.columns):
    # For pkg 20, there could be no valid data at all (empty df with no cols)
    print("Got an empty df, a normal case for pkg20")
    id_col = [PLACE_HOLDER]*row_nums
    roadDist_col = [PLACE_HOLDER]*row_nums
    value_col = [PLACE_HOLDER]*row_nums


else:
    for row in range(row_nums):
        this_row = df20.iloc[row]
        got = False
        for i in range(NUM_OBJS):
            sign_id = this_row["signId_{}".format(i)]
            # The other packages convert textual ids to int before comparing them. Do the
            # same here, otherwise an id read as text (e.g. " 0") never equals 0 and the
            # ego data of that row is silently replaced by place holders.
            if type(sign_id) == str:
                try:
                    sign_id = int(sign_id.strip())
                except ValueError:
                    # Not a whole number, so it cannot be the ego id 0
                    pass
            if sign_id == 0:
                # Store the normalised id so the check at the end of this cell only sees 0
                id_col.append(sign_id)
                roadDist_col.append(this_row["roadDist_{}".format(i)])
                value_col.append(this_row["value_{}".format(i)])
                got = True
                break
        if not got:
            # Strange thing found: some rows (<100 out of 8000+) have no data of ego car
            id_col.append(PLACE_HOLDER)
            roadDist_col.append(PLACE_HOLDER)
            value_col.append(PLACE_HOLDER)

    # IMPORTANT: exactly one entry was appended per row, so all lengths must match
    assert len(roadDist_col) == len(value_col) == row_nums == len(id_col), \
            f"the length of some row can't match with original length {row_nums}"


# Build the reduced pkg20 frame: one sign entry (or place holders) per simFrame
df20_new = pd.DataFrame({
    'simFrame': df20.index,
    "signId": id_col,
    "roadDist": roadDist_col,
    "value": value_col
})
df20_new.set_index('simFrame', inplace=True)


# Sanity check: signId may only contain 0 or the place holder
row_count = dict(df20_new["signId"].value_counts())
for key in row_count:
    assert key==PLACE_HOLDER or key==0, f"column signId \
            has wrong id value other than {PLACE_HOLDER} and 0"
assert len(row_count) <= 2

print("Work complete. Proceed to the join of 5 processed csv.")

In [None]:
"""
Join all 5 processed csv together into a final_csv

The order is [df7, df9, df10, df5, df20]
"""

# Start from a copy of the pkg7 frame (the one that still carries the simTime column),
# then inner-join the other processed frames on the shared simFrame index.
final_df = df7.copy(deep=True)

PKG_ID = [9,10,5,20]
# pkg20 contributes its processed frame df20_new (signId / roadDist / value of the ego
# car). The raw df20 only served as the input for building df20_new.
to_join = [df9, df10, df5, df20_new]
dic = dict(zip(PKG_ID, to_join))
for key, df in dic.items():
    # rsuffix disambiguates column names that appear in more than one package
    final_df = final_df.join(df, how='inner', rsuffix="_pkg{}".format(key))

# Double check, remove duplicate rows (duplicate simFrame index values)
final_df = final_df[~final_df.index.duplicated(keep='first')]
print(f"Final csv complete, with {final_df.shape[0]} row and {final_df.shape[1]} col. Have a preview", end='\n')

print(final_df.head(10))

print("\n Data process done, ready to send to server with socket.")

In [None]:
"""
Use socket to send the final csv from client (this computer) to server (another computer)
"""

print("Client starting......")

# Set up the client: ask for the server address, the port is fixed
ip = eg.enterbox(
    msg="Please acquire the ip address of server. \n Check with command 'ifconfig' in terminal.",
    title= "Input IP Address of Server",
    default= "169.254.4.254"
)
port = 3636
client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

# build the connection
try:
    client.connect((ip, port))

    # the server is expected to send a short greeting first
    msg = client.recv(1024)

    print(msg.decode('utf-8'))
except Exception as e:
    print("Connection failed: ", e)
else:
    # save the csv locally first
    filepath = eg.filesavebox(title="Choose a folder to save the final csv locally.", default="../")
    if filepath is None:
        # The save dialog was cancelled: there is no file to write or to send
        print("No output file was chosen, nothing is sent.")
    else:
        final_df.to_csv(filepath)

        # get file size and name; basename copes with the platform's path separator
        filename = os.path.basename(filepath)
        filesize = os.path.getsize(filepath)
        print("Send {} with filesize {} MBs".format(filename, filesize/1024/1024))

        # Send name and size first. The pauses separate the messages for the receiver.
        # sendall is used throughout because send may transmit only part of its argument.
        client.sendall(filename.encode())
        time.sleep(1)
        # NOTE(review): the byte length passed here is the bit length of filesize, so the
        # encoded size is left-padded with zero bytes. Kept as is because the receiving
        # side is not visible here -- confirm before changing the wire format.
        client.sendall(filesize.to_bytes(filesize.bit_length(), byteorder='big'))
        time.sleep(1)

        # Start to send data
        print("Start to send data")
        try:
            start_t = time.time()
            curr_t = time.time()
            with open(filepath, 'rb') as f:
                size = 0
                while 1:
                    f_data = f.read(1024)

                    if f_data:
                        # data transfer not finished yet
                        client.sendall(f_data)
                        size += len(f_data)
                        # average throughput so far in bytes per second (guard against
                        # a zero elapsed time on the very first chunk)
                        elapsed = time.time() - start_t
                        speed = size/elapsed if elapsed > 0 else 0.0
                        # report progress at most twice per second
                        if time.time() - curr_t >= 0.5:
                            curr_t = time.time()
                            print("Uploading: {}% complete, speed: {} MB/s". format(size/filesize*100, float(speed/1024/1024)))
                    else:
                        # data transfer complete
                        print("Uploading {} complete".format(filename))
                        break
        except Exception as e:
            print("reading or sending file caused error: \n", e)
finally:
    # always release the socket, whether or not anything was sent
    client.close()
    print("App finished running.")