In [1]:
import glob
import pandas as pd

# 1. Force a string dtype on the columns that hold mixed types, so the parser
#    does not have to guess a dtype for them.
dtype_map = {
    'Day'               :'string',
    'timezone'          :'string',
    'truck'             :'string',
    'Role'              :'string',
    'Transfer unit'     :'string',
    'bitrate_unit'      :'string',
    'cwnd_unit'         :'string',
    'Role-RX'           :'string',
    'Transfer unit-RX'  :'string',
    'bitrate_unit-RX'   :'string',
    'square_id'         :'string',
}

# 2. Load every matching file with low_memory=False and stack them into one
#    frame. glob returns paths in arbitrary order, so sort them to keep the
#    row order of the concatenated frame reproducible between runs.
file_paths = sorted(glob.glob('data/*-combined-kml.csv'))
if not file_paths:
    # Without this guard pd.concat fails with "No objects to concatenate",
    # which hides the real cause (wrong working directory or missing data).
    raise FileNotFoundError(
        "No files match 'data/*-combined-kml.csv' relative to the current "
        "working directory"
    )
df = pd.concat(
    (pd.read_csv(p, dtype=dtype_map, low_memory=False) for p in file_paths),
    ignore_index=True,
)

# 3. Coerce the measurement columns to numbers; any value that cannot be
#    parsed becomes NaN (errors='coerce') instead of raising.
numeric_cols = [
    'time','Year','Month','Date','hour','min','sec',
    'latitude','longitude','speed',
    'svr1','svr2','svr3','svr4',
    'Transfer size','Bitrate','Retransmissions','CWnd',
    'Transfer size-RX','Bitrate-RX','send_data'
]
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

print(df.dtypes)


time                         int64
Day                 string[python]
Year                       float64
Month                      float64
Date                       float64
hour                       float64
min                        float64
sec                        float64
timezone            string[python]
latitude                   float64
longitude                  float64
speed                      float64
truck               string[python]
svr1                       float64
svr2                       float64
svr3                       float64
svr4                       float64
Role                string[python]
Transfer size              float64
Transfer unit       string[python]
Bitrate                    float64
bitrate_unit        string[python]
Retransmissions            float64
CWnd                       float64
cwnd_unit           string[python]
Role-RX             string[python]
Transfer size-RX           float64
Transfer unit-RX    string[python]
Bitrate-RX          

In [3]:
# Print the column/dtype summary, then preview the first rows.
# df.info() writes to stdout; df.head() is the last expression, so it is
# what the notebook renders as the cell result.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3042495 entries, 0 to 3042494
Data columns (total 32 columns):
 #   Column            Dtype  
---  ------            -----  
 0   time              int64  
 1   Day               string 
 2   Year              float64
 3   Month             float64
 4   Date              float64
 5   hour              float64
 6   min               float64
 7   sec               float64
 8   timezone          string 
 9   latitude          float64
 10  longitude         float64
 11  speed             float64
 12  truck             string 
 13  svr1              float64
 14  svr2              float64
 15  svr3              float64
 16  svr4              float64
 17  Role              string 
 18  Transfer size     float64
 19  Transfer unit     string 
 20  Bitrate           float64
 21  bitrate_unit      string 
 22  Retransmissions   float64
 23  CWnd              float64
 24  cwnd_unit         string 
 25  Role-RX           string 
 26  Transfer size-

Unnamed: 0,time,Day,Year,Month,Date,hour,min,sec,timezone,latitude,...,Retransmissions,CWnd,cwnd_unit,Role-RX,Transfer size-RX,Transfer unit-RX,Bitrate-RX,bitrate_unit-RX,send_data,square_id
0,1656876656,,,,,,,,,,...,,,,,,,,,,
1,1656876656,,,,,,,,,,...,,,,,,,,,,
2,1656876656,,,,,,,,,,...,,,,,,,,,,
3,1656876657,Mon,2022.0,7.0,4.0,5.0,30.0,57.0,AEST,99.0,...,,,,,,,,,0.0,
4,1656876657,Mon,2022.0,7.0,4.0,5.0,30.0,57.0,AEST,99.0,...,,,,,,,,,0.0,


In [4]:
# Keep an independent snapshot of the loaded frame; a later cell restores
# `df` from it (presumably to restart cleaning without re-reading the CSVs).
df_copy = df.copy()

In [64]:
# Reset `df` to a fresh copy of the snapshot so the in-place cleaning cells
# below always start from the same state when re-run from here.
df = df_copy.copy()

In [67]:
# Build a timezone-aware timestamp from the epoch-seconds `time` column:
# parse as UTC, then convert to Melbourne local time.
df['datetime'] = (
    pd.to_datetime(df['time'], unit='s', utc=True)
      .dt.tz_convert('Australia/Melbourne')
)

# Null out placeholder GPS fixes. Testing only for == 999 misses the
# latitude placeholder 99.0 that appears in the raw data, so instead reject
# any coordinate outside its physically valid range (this still covers 999).
valid_range = {'latitude': (-90, 90), 'longitude': (-180, 180)}
for c, (lo, hi) in valid_range.items():
    # .where keeps in-range values and turns everything else into NaN.
    df[c] = df[c].where(df[c].between(lo, hi))

# Rows without a usable position are dropped.
df.dropna(subset=['latitude', 'longitude'], inplace=True)

In [68]:
# A value of exactly 1000 in any svr column is treated as missing: blank it
# out across all four columns at once, then drop every row that is left with
# a gap in at least one of them.
svr_cols = ['svr1', 'svr2', 'svr3', 'svr4']
df[svr_cols] = df[svr_cols].mask(df[svr_cols] == 1000)
df.dropna(subset=svr_cols, inplace=True)

In [69]:
# Columns removed from the working frame, grouped by kind.
to_drop = [
    # epoch / calendar / clock fields
    'time', 'Year', 'Month', 'Day', 'sec', 'timezone',
    # role labels
    'Role', 'Role-RX',
    # unit labels
    'Transfer unit', 'Transfer unit-RX',
    'bitrate_unit', 'bitrate_unit-RX',
    'cwnd_unit',
]

df.drop(to_drop, axis='columns', inplace=True)

In [70]:
# Make the timestamp the row index (in place); `datetime` stops being a
# regular column from here on.
df.set_index('datetime', inplace=True)

In [71]:
# Keep only fully populated rows: a row is removed if ANY remaining column
# is missing.
df.dropna(how='any', inplace=True)

In [72]:
# Display-only: show the frame with `datetime` moved back to a column.
# The result is not assigned, so `df` itself keeps its datetime index.
df.reset_index()

Unnamed: 0,datetime,Date,hour,min,latitude,longitude,speed,truck,svr1,svr2,svr3,svr4,Transfer size,Bitrate,Retransmissions,CWnd,Transfer size-RX,Bitrate-RX,send_data,square_id
0,2022-07-04 05:43:37+10:00,4.0,5.0,43.0,-37.737985,144.849691,76.811700,garbo02,25.5,28.9,27.8,44.5,0.864258,7.25,0.0,0.007080,0.988281,8.29,0.000000,square_94489280583
1,2022-07-04 05:43:38+10:00,4.0,5.0,43.0,-37.738127,144.849794,75.519004,garbo02,35.0,32.0,31.3,42.1,1.300000,10.90,3.0,0.010645,0.972656,8.16,0.000000,square_94489280583
2,2022-07-04 05:43:39+10:00,4.0,5.0,43.0,-37.738249,144.849904,72.846568,garbo02,34.2,30.5,29.8,46.1,1.300000,10.90,0.0,0.010645,0.957031,8.03,0.000000,square_94489280583
3,2022-07-04 05:43:40+10:00,4.0,5.0,43.0,-37.738362,144.850020,70.279696,garbo02,25.0,23.2,21.6,38.7,0.864258,7.25,0.0,0.007080,0.953125,7.99,0.014404,square_94489280583
4,2022-07-04 05:43:41+10:00,4.0,5.0,43.0,-37.738491,144.850142,70.153760,garbo02,20.2,18.7,23.4,34.2,0.926758,7.77,6.0,0.007588,0.927734,7.78,0.000000,square_94489280583
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1549826,2022-07-07 10:21:57+10:00,7.0,10.0,21.0,-37.745164,144.809922,5.393024,garbo06,304.0,308.0,301.0,324.0,1.250000,10.50,1.0,0.010254,2.080000,17.40,0.000000,square_85899346000
1549827,2022-07-07 10:21:58+10:00,7.0,10.0,21.0,-37.745170,144.809933,4.978176,garbo06,232.0,230.0,228.0,245.0,1.250000,10.50,0.0,0.010254,1.440000,12.10,0.000000,square_85899346000
1549828,2022-07-07 10:22:00+10:00,7.0,10.0,22.0,-37.745228,144.809942,8.911824,garbo06,168.0,166.0,159.0,190.0,1.250000,10.50,4.0,0.010254,1.540000,12.90,0.014404,square_85899346000
1549829,2022-07-07 10:22:01+10:00,7.0,10.0,22.0,-37.745238,144.809933,7.274656,garbo06,281.0,272.0,271.0,288.0,2.500000,21.00,0.0,0.020508,1.610000,13.50,0.000000,square_85899346000


In [73]:
# Print the index range, per-column non-null counts and dtypes of the
# cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1549831 entries, 2022-07-04 05:43:37+10:00 to 2022-07-07 10:22:02+10:00
Data columns (total 19 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   Date              1549831 non-null  float64
 1   hour              1549831 non-null  float64
 2   min               1549831 non-null  float64
 3   latitude          1549831 non-null  float64
 4   longitude         1549831 non-null  float64
 5   speed             1549831 non-null  float64
 6   truck             1549831 non-null  string 
 7   svr1              1549831 non-null  float64
 8   svr2              1549831 non-null  float64
 9   svr3              1549831 non-null  float64
 10  svr4              1549831 non-null  float64
 11  Transfer size     1549831 non-null  float64
 12  Bitrate           1549831 non-null  float64
 13  Retransmissions   1549831 non-null  float64
 14  CWnd              1549831 non-null  float64
 15  Tran

In [None]:
# Write the cleaned frame to 'Cleaned.csv' in the current working directory.
# to_csv writes the index by default, so the datetime index becomes the
# first column of the file.
df.to_csv('Cleaned.csv')

In [80]:
import pandas as pd

# One-hot encode the two categorical columns. get_dummies replaces each
# encoded column with one indicator column per distinct value, named
# "<prefix>_<value>". The square_id values already start with "square_",
# which is why the resulting names read "square_square_...".
categorical_prefixes = {'square_id': 'square', 'truck': 'truck'}
df_encoded = pd.get_dummies(
    df,
    columns=list(categorical_prefixes),
    prefix=categorical_prefixes,
    drop_first=False,   # keep an indicator for every category
    dtype='uint8',      # one byte per indicator to limit memory use
)

# List the resulting column names for inspection.
print(df_encoded.columns.tolist())


['Date', 'hour', 'min', 'latitude', 'longitude', 'speed', 'svr1', 'svr2', 'svr3', 'svr4', 'Transfer size', 'Bitrate', 'Retransmissions', 'CWnd', 'Transfer size-RX', 'Bitrate-RX', 'send_data', 'square_square_103079215126', 'square_square_103079215128', 'square_square_103079215130', 'square_square_103079215132', 'square_square_103079215134', 'square_square_103079215136', 'square_square_103079215138', 'square_square_103079215140', 'square_square_103079215142', 'square_square_103079215144', 'square_square_103079215147', 'square_square_103079215151', 'square_square_103079215153', 'square_square_103079215155', 'square_square_103079215157', 'square_square_103079215158', 'square_square_103079215180', 'square_square_103079215182', 'square_square_103079215184', 'square_square_103079215186', 'square_square_103079215188', 'square_square_111669149696', 'square_square_111669149698', 'square_square_111669149700', 'square_square_111669149702', 'square_square_111669149706', 'square_square_111669149708'

In [82]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1549831 entries, 2022-07-04 05:43:37+10:00 to 2022-07-07 10:22:02+10:00
Columns: 317 entries, Date to truck_garbo11
dtypes: float64(17), uint8(300)
memory usage: 656.2 MB
