### Setup

In [1]:
# Imports
import os
import pandas as pd
from glob import glob
from functools import reduce
from xml.etree import ElementTree as et
from shutil import move

In [4]:
# Load XML files and store them in a list
xml_files = glob("annotations_data/*.xml")
print(len(xml_files))

2321


### Data Extraction

In [5]:
# Define function to read XML files, extract file name, size (width, height), and object data (name, xmin, xmax, ymin, max)
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()

    # Get file name
    img_name = root.find("filename").text

    # Get size (width and height) of an image
    width = root.find("size").find("width").text
    height = root.find("size").find("height").text

    # Get data of object(s) within an image
    objs = root.findall("object")
    parser = []

    for obj in objs:
        name = obj.find("name").text
        bnd_box = obj.find("bndbox")
        x_min = bnd_box.find("xmin").text
        x_max = bnd_box.find("xmax").text
        y_min = bnd_box.find("ymin").text
        y_max = bnd_box.find("ymax").text
        parser.append([img_name, width, height, name, x_min, x_max, y_min, y_max])

    return parser

In [6]:
# Extract text from all XML files and flatten the resulting list
parser_all = list(map(extract_text, xml_files))
obj_data = reduce(lambda x, y: x + y, parser_all)

### DataFrame Creation & Data Cleaning

In [7]:
# Create the DataFrame, with each entry being a labeled object in an image
df = pd.DataFrame(obj_data, columns = ["file_name", "width", "height", "class", "x_min", "x_max", "y_min", "y_max"])
df

Unnamed: 0,file_name,width,height,class,x_min,x_max,y_min,y_max
0,501cac52-road-work-vertical.jpg,408,612,SpeedLimitSign,299,374,287,357
1,2020-04-04 11:10:21.323753353.jpg,1920,1080,green,624,680,66,123
2,2020-04-04 11:28:23.187401756.jpg,1920,1080,green,703,743,387,404
3,2020-04-04 11:13:23.157114334.jpg,1920,1080,red,774,834,293,322
4,2020-04-04 11:13:23.157114334.jpg,1920,1080,red,854,924,292,325
...,...,...,...,...,...,...,...,...
4024,2020-04-04 11:16:31.256753683.jpg,1920,1080,green,757,800,293,311
4025,2020-04-04 11:16:31.256753683.jpg,1920,1080,red,706,745,292,314
4026,2020-04-04 11:13:23.403026318.jpg,1920,1080,wait_on,783,912,263,284
4027,2020-04-04 11:13:23.403026318.jpg,1920,1080,red,863,922,292,317


In [8]:
# Clean the file_name column
# def clean_filename(filename):
#     return filename.split("-")[-1]

# df["file_name"] = df["file_name"].apply(clean_filename)

In [9]:
df["class"].value_counts()

class
red               1235
green              816
Car                797
Semi               351
wait_on            306
Person             131
SpeedLimitSign     107
StopSign           106
Truck               87
yellow              75
TrafficLight        10
off                  4
TrafficSign          4
Name: count, dtype: int64

### Categories of objects the Model should be able to recognize
1. Person
2. Car
3. Truck
4. Semi
5. RedLight
6. YellowLight
7. GreenLight
8. WaitLight
9. StopSign
10. SpeedLimitSign

In [12]:
# Rename some classes so that they all follow same format (PascalCase)
df["class"] = df["class"].replace({"red": "RedLight", 
                     "yellow": "YellowLight", 
                     "green": "GreenLight", 
                     "wait_on": "WaitLight"})

df["class"].value_counts()

class
RedLight          1235
GreenLight         816
Car                797
Semi               351
WaitLight          306
Person             131
SpeedLimitSign     107
StopSign           106
Truck               87
YellowLight         75
TrafficLight        10
off                  4
TrafficSign          4
Name: count, dtype: int64

In [13]:
# Filter data to get rid of unwanted classes
df = df[(df["class"] != "TrafficLight") & 
        (df["class"] != "TrafficSign") &
        (df["class"] != "off")]

df["class"].value_counts()

class
RedLight          1235
GreenLight         816
Car                797
Semi               351
WaitLight          306
Person             131
SpeedLimitSign     107
StopSign           106
Truck               87
YellowLight         75
Name: count, dtype: int64

In [15]:
df

Unnamed: 0,file_name,width,height,class,x_min,x_max,y_min,y_max
0,501cac52-road-work-vertical.jpg,408,612,SpeedLimitSign,299,374,287,357
1,2020-04-04 11:10:21.323753353.jpg,1920,1080,GreenLight,624,680,66,123
2,2020-04-04 11:28:23.187401756.jpg,1920,1080,GreenLight,703,743,387,404
3,2020-04-04 11:13:23.157114334.jpg,1920,1080,RedLight,774,834,293,322
4,2020-04-04 11:13:23.157114334.jpg,1920,1080,RedLight,854,924,292,325
...,...,...,...,...,...,...,...,...
4024,2020-04-04 11:16:31.256753683.jpg,1920,1080,GreenLight,757,800,293,311
4025,2020-04-04 11:16:31.256753683.jpg,1920,1080,RedLight,706,745,292,314
4026,2020-04-04 11:13:23.403026318.jpg,1920,1080,WaitLight,783,912,263,284
4027,2020-04-04 11:13:23.403026318.jpg,1920,1080,RedLight,863,922,292,317


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4011 entries, 0 to 4028
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   file_name  4011 non-null   object
 1   width      4011 non-null   object
 2   height     4011 non-null   object
 3   class      4011 non-null   object
 4   x_min      4011 non-null   object
 5   x_max      4011 non-null   object
 6   y_min      4011 non-null   object
 7   y_max      4011 non-null   object
dtypes: object(8)
memory usage: 282.0+ KB


In [17]:
# Convert types of width, height, x_min, x_max, y_min, y_max from object to integer
cols = ["width", "height", "x_min", "x_max", "y_min", "y_max"]
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4011 entries, 0 to 4028
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   file_name  4011 non-null   object
 1   width      4011 non-null   int64 
 2   height     4011 non-null   int64 
 3   class      4011 non-null   object
 4   x_min      4011 non-null   int64 
 5   x_max      4011 non-null   int64 
 6   y_min      4011 non-null   int64 
 7   y_max      4011 non-null   int64 
dtypes: int64(6), object(2)
memory usage: 282.0+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[cols] = df[cols].astype(int)


In [18]:
# Prepare columns for YOLO object detection
# Get center x and center y
df["center_x"] = ((df["x_min"] + df["x_max"]) / 2) / df["width"]
df["center_y"] = ((df["y_min"] + df["y_max"]) / 2) / df["height"]

# Get normalized width and height
df["w"] = (df["x_max"] - df["x_min"]) / df["width"]
df["h"] = (df["y_max"] - df["y_min"]) / df["height"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["center_x"] = ((df["x_min"] + df["x_max"]) / 2) / df["width"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["center_y"] = ((df["y_min"] + df["y_max"]) / 2) / df["height"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["w"] = (df["x_max"] - df["x_min"]) / df["width"]
A value is trying to b

In [19]:
df.head()

Unnamed: 0,file_name,width,height,class,x_min,x_max,y_min,y_max,center_x,center_y,w,h
0,501cac52-road-work-vertical.jpg,408,612,SpeedLimitSign,299,374,287,357,0.824755,0.526144,0.183824,0.114379
1,2020-04-04 11:10:21.323753353.jpg,1920,1080,GreenLight,624,680,66,123,0.339583,0.0875,0.029167,0.052778
2,2020-04-04 11:28:23.187401756.jpg,1920,1080,GreenLight,703,743,387,404,0.376563,0.366204,0.020833,0.015741
3,2020-04-04 11:13:23.157114334.jpg,1920,1080,RedLight,774,834,293,322,0.41875,0.284722,0.03125,0.026852
4,2020-04-04 11:13:23.157114334.jpg,1920,1080,RedLight,854,924,292,325,0.463021,0.285648,0.036458,0.030556


### Split data into train and test data

In [20]:
images = df["file_name"].unique()
len(images)

2318

In [21]:
# Split data into 80% train and 20% test
img_df = pd.DataFrame(images, columns=["file_name"])

# Shuffle data and choose 80% of our total amount of images for train
img_train = tuple(img_df.sample(frac=0.80)["file_name"])

# Choose remaining 20% of images for test
img_test = tuple(img_df.query(f"file_name not in {img_train}")["file_name"])

In [22]:
# Create train and test DataFrames
train_df = df.query(f"file_name in {img_train}")
test_df = df.query(f"file_name in {img_test}")

In [23]:
train_df.head()

Unnamed: 0,file_name,width,height,class,x_min,x_max,y_min,y_max,center_x,center_y,w,h
0,501cac52-road-work-vertical.jpg,408,612,SpeedLimitSign,299,374,287,357,0.824755,0.526144,0.183824,0.114379
1,2020-04-04 11:10:21.323753353.jpg,1920,1080,GreenLight,624,680,66,123,0.339583,0.0875,0.029167,0.052778
2,2020-04-04 11:28:23.187401756.jpg,1920,1080,GreenLight,703,743,387,404,0.376563,0.366204,0.020833,0.015741
3,2020-04-04 11:13:23.157114334.jpg,1920,1080,RedLight,774,834,293,322,0.41875,0.284722,0.03125,0.026852
4,2020-04-04 11:13:23.157114334.jpg,1920,1080,RedLight,854,924,292,325,0.463021,0.285648,0.036458,0.030556


In [24]:
test_df.head()

Unnamed: 0,file_name,width,height,class,x_min,x_max,y_min,y_max,center_x,center_y,w,h
17,ce40d81f-00279.jpg,576,352,Car,14,555,51,297,0.493924,0.494318,0.939236,0.698864
22,2020-04-04 11:26:34.218267111.jpg,1920,1080,RedLight,915,950,443,458,0.485677,0.41713,0.018229,0.013889
23,2020-04-04 11:26:34.218267111.jpg,1920,1080,RedLight,962,990,443,458,0.508333,0.41713,0.014583,0.013889
45,7c39676c-202105253052371.jpg,756,567,Semi,58,691,20,533,0.49537,0.487654,0.837302,0.904762
57,2020-04-04 11:27:16.972270606.jpg,1920,1080,WaitLight,1240,1445,217,236,0.699219,0.209722,0.106771,0.017593


### Label Encoding

1. Person
2. Car
3. Truck
4. Semi
5. RedLight
6. YellowLight
7. GreenLight
8. WaitLight
9. StopSign
10. SpeedLimitSign

In [27]:
# Create a "labels" dictionary, with each unique class having its own ID (Algorithmic way)
# classes = df["class"].unique()
# labels = {}

# for i in range(len(classes)):
#     labels[classes[i]] = i

labels = {"Person": 0, "Car": 1, "Truck": 2, "Semi": 3, "RedLight": 4, "YellowLight": 5, "GreenLight": 6, "WaitLight": 7, "StopSign": 8, "SpeedLimitSign": 9}

# Function to get the ID of a specific object
def label_encoding(obj):
    return labels[obj]

label_encoding("Person")

0

In [28]:
train_df["id"] = train_df["class"].apply(label_encoding)
test_df["id"] = test_df["class"].apply(label_encoding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df["id"] = train_df["class"].apply(label_encoding)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["id"] = test_df["class"].apply(label_encoding)


In [29]:
train_df.head()

Unnamed: 0,file_name,width,height,class,x_min,x_max,y_min,y_max,center_x,center_y,w,h,id
0,501cac52-road-work-vertical.jpg,408,612,SpeedLimitSign,299,374,287,357,0.824755,0.526144,0.183824,0.114379,9
1,2020-04-04 11:10:21.323753353.jpg,1920,1080,GreenLight,624,680,66,123,0.339583,0.0875,0.029167,0.052778,6
2,2020-04-04 11:28:23.187401756.jpg,1920,1080,GreenLight,703,743,387,404,0.376563,0.366204,0.020833,0.015741,6
3,2020-04-04 11:13:23.157114334.jpg,1920,1080,RedLight,774,834,293,322,0.41875,0.284722,0.03125,0.026852,4
4,2020-04-04 11:13:23.157114334.jpg,1920,1080,RedLight,854,924,292,325,0.463021,0.285648,0.036458,0.030556,4


### Save Images and Labels in Text

In [30]:
train_folder = "images_data/train"
test_folder = "images_data/test"

os.mkdir(train_folder)
os.mkdir(test_folder)

In [47]:
cols = ["file_name", "id", "center_x", "center_y", "w", "h"]
groupby_obj_train = train_df[cols].groupby("file_name")
groupby_obj_test = test_df[cols].groupby("file_name")

In [48]:
# Save each image in the train or test folder, and save respective labels in a text document
def save_data(filename, folder_path, group_obj):
    try:
        # Move image
        src = os.path.join("images_data", filename)
        dst = os.path.join(folder_path, filename)
        move(src, dst) # Move image from the source folder to the destination folder

        # Save the labels
        text_filename = os.path.join(folder_path, os.path.splitext(filename)[0] + ".txt")
        group_obj.get_group(filename).set_index("file_name").to_csv(text_filename, sep=" ", index=False, header=False)
    except FileNotFoundError:
        pass

In [49]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [50]:
filename_series.apply(save_data, args=(train_folder, groupby_obj_train))

0       None
1       None
2       None
3       None
4       None
        ... 
1849    None
1850    None
1851    None
1852    None
1853    None
Length: 1854, dtype: object

In [51]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data, args=(test_folder, groupby_obj_test))

0      None
1      None
2      None
3      None
4      None
       ... 
459    None
460    None
461    None
462    None
463    None
Length: 464, dtype: object

### Data Preparation Done!

In [53]:
df["class"].value_counts()

class
RedLight          1235
GreenLight         816
Car                797
Semi               351
WaitLight          306
Person             131
SpeedLimitSign     107
StopSign           106
Truck               87
YellowLight         75
Name: count, dtype: int64

In [54]:
list(labels.keys())

['Person',
 'Car',
 'Truck',
 'Semi',
 'RedLight',
 'YellowLight',
 'GreenLight',
 'WaitLight',
 'StopSign',
 'SpeedLimitSign']