### Setup

In [1]:
# Imports
import os
import pandas as pd
from glob import glob
from functools import reduce
from xml.etree import ElementTree as et
from shutil import move

In [2]:
# Load XML files and store them in a list
xml_files = glob("annotations_data/*.xml")
print(len(xml_files))

1094


### Data Extraction

In [3]:
# Define function to read XML files, extract file name, size (width, height), and object data (name, xmin, xmax, ymin, max)
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()

    # Get file name
    img_name = root.find("filename").text

    # Get size (width and height) of an image
    width = root.find("size").find("width").text
    height = root.find("size").find("height").text

    # Get data of object(s) within an image
    objs = root.findall("object")
    parser = []

    for obj in objs:
        name = obj.find("name").text
        bnd_box = obj.find("bndbox")
        x_min = bnd_box.find("xmin").text
        x_max = bnd_box.find("xmax").text
        y_min = bnd_box.find("ymin").text
        y_max = bnd_box.find("ymax").text
        parser.append([img_name, width, height, name, x_min, x_max, y_min, y_max])

    return parser

In [4]:
# Extract text from all XML files and flatten the resulting list
parser_all = list(map(extract_text, xml_files))
obj_data = reduce(lambda x, y: x + y, parser_all)

### DataFrame Creation & Data Cleaning

In [5]:
# Create the DataFrame, with each entry being a labeled object in an image
df = pd.DataFrame(obj_data, columns = ["file_name", "width", "height", "class", "x_min", "x_max", "y_min", "y_max"])
df

Unnamed: 0,file_name,width,height,class,x_min,x_max,y_min,y_max
0,501cac52-road-work-vertical.jpg,408,612,SpeedLimitSign,299,374,287,357
1,157c26be-00595.jpg,640,480,Truck,95,243,180,228
2,157c26be-00595.jpg,640,480,Car,52,143,188,234
3,157c26be-00595.jpg,640,480,Car,0,57,188,219
4,157c26be-00595.jpg,640,480,Truck,91,581,143,413
...,...,...,...,...,...,...,...,...
1566,076f6ecb-00525.jpg,500,375,Car,390,433,167,196
1567,076f6ecb-00525.jpg,500,375,Car,474,499,165,197
1568,076f6ecb-00525.jpg,500,375,Car,102,241,167,224
1569,076f6ecb-00525.jpg,500,375,Car,115,388,146,315


In [6]:
# Clean the file_name column
# def clean_filename(filename):
#     return filename.split("-")[-1]

# df["file_name"] = df["file_name"].apply(clean_filename)

In [7]:
df["class"].value_counts()

class
Car               790
Semi              351
Person            130
SpeedLimitSign    107
StopSign          106
Truck              87
Name: count, dtype: int64

### Categories of objects the Model should be able to recognize
1. Person
2. Car
3. Truck
4. Semi
5. RedLight
6. YellowLight
7. GreenLight
8. WaitLight
9. StopSign
10. SpeedLimitSign

In [8]:
# Rename some classes so that they all follow same format (PascalCase)

# df["class"] = df["class"].replace({"red": "RedLight", 
#                      "yellow": "YellowLight", 
#                      "green": "GreenLight", 
#                      "wait_on": "WaitLight"})

# df["class"].value_counts()

In [9]:
# Filter data to get rid of unwanted classes
df = df[(df["class"] != "TrafficLight") & 
        (df["class"] != "TrafficSign") &
        (df["class"] != "off")]

df["class"].value_counts()

class
Car               790
Semi              351
Person            130
SpeedLimitSign    107
StopSign          106
Truck              87
Name: count, dtype: int64

In [10]:
df

Unnamed: 0,file_name,width,height,class,x_min,x_max,y_min,y_max
0,501cac52-road-work-vertical.jpg,408,612,SpeedLimitSign,299,374,287,357
1,157c26be-00595.jpg,640,480,Truck,95,243,180,228
2,157c26be-00595.jpg,640,480,Car,52,143,188,234
3,157c26be-00595.jpg,640,480,Car,0,57,188,219
4,157c26be-00595.jpg,640,480,Truck,91,581,143,413
...,...,...,...,...,...,...,...,...
1566,076f6ecb-00525.jpg,500,375,Car,390,433,167,196
1567,076f6ecb-00525.jpg,500,375,Car,474,499,165,197
1568,076f6ecb-00525.jpg,500,375,Car,102,241,167,224
1569,076f6ecb-00525.jpg,500,375,Car,115,388,146,315


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1571 entries, 0 to 1570
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   file_name  1571 non-null   object
 1   width      1571 non-null   object
 2   height     1571 non-null   object
 3   class      1571 non-null   object
 4   x_min      1571 non-null   object
 5   x_max      1571 non-null   object
 6   y_min      1571 non-null   object
 7   y_max      1571 non-null   object
dtypes: object(8)
memory usage: 98.3+ KB


In [12]:
# Convert types of width, height, x_min, x_max, y_min, y_max from object to integer
cols = ["width", "height", "x_min", "x_max", "y_min", "y_max"]
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1571 entries, 0 to 1570
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   file_name  1571 non-null   object
 1   width      1571 non-null   int64 
 2   height     1571 non-null   int64 
 3   class      1571 non-null   object
 4   x_min      1571 non-null   int64 
 5   x_max      1571 non-null   int64 
 6   y_min      1571 non-null   int64 
 7   y_max      1571 non-null   int64 
dtypes: int64(6), object(2)
memory usage: 98.3+ KB


In [13]:
# Prepare columns for YOLO object detection
# Get center x and center y
df["center_x"] = ((df["x_min"] + df["x_max"]) / 2) / df["width"]
df["center_y"] = ((df["y_min"] + df["y_max"]) / 2) / df["height"]

# Get normalized width and height
df["w"] = (df["x_max"] - df["x_min"]) / df["width"]
df["h"] = (df["y_max"] - df["y_min"]) / df["height"]

In [14]:
df.head()

Unnamed: 0,file_name,width,height,class,x_min,x_max,y_min,y_max,center_x,center_y,w,h
0,501cac52-road-work-vertical.jpg,408,612,SpeedLimitSign,299,374,287,357,0.824755,0.526144,0.183824,0.114379
1,157c26be-00595.jpg,640,480,Truck,95,243,180,228,0.264062,0.425,0.23125,0.1
2,157c26be-00595.jpg,640,480,Car,52,143,188,234,0.152344,0.439583,0.142187,0.095833
3,157c26be-00595.jpg,640,480,Car,0,57,188,219,0.044531,0.423958,0.089063,0.064583
4,157c26be-00595.jpg,640,480,Truck,91,581,143,413,0.525,0.579167,0.765625,0.5625


### Split data into train and test data

In [15]:
images = df["file_name"].unique()
len(images)

1094

In [16]:
# Split data into 80% train and 20% test
img_df = pd.DataFrame(images, columns=["file_name"])

# Shuffle data and choose 80% of our total amount of images for train
img_train = tuple(img_df.sample(frac=0.80)["file_name"])

# Choose remaining 20% of images for test
img_test = tuple(img_df.query(f"file_name not in {img_train}")["file_name"])

In [17]:
# Create train and test DataFrames
train_df = df.query(f"file_name in {img_train}")
test_df = df.query(f"file_name in {img_test}")

In [18]:
train_df.head()

Unnamed: 0,file_name,width,height,class,x_min,x_max,y_min,y_max,center_x,center_y,w,h
0,501cac52-road-work-vertical.jpg,408,612,SpeedLimitSign,299,374,287,357,0.824755,0.526144,0.183824,0.114379
1,157c26be-00595.jpg,640,480,Truck,95,243,180,228,0.264062,0.425,0.23125,0.1
2,157c26be-00595.jpg,640,480,Car,52,143,188,234,0.152344,0.439583,0.142187,0.095833
3,157c26be-00595.jpg,640,480,Car,0,57,188,219,0.044531,0.423958,0.089063,0.064583
4,157c26be-00595.jpg,640,480,Truck,91,581,143,413,0.525,0.579167,0.765625,0.5625


In [19]:
test_df.head()

Unnamed: 0,file_name,width,height,class,x_min,x_max,y_min,y_max,center_x,center_y,w,h
7,d80257fd-00747.jpg,939,704,Car,206,763,306,558,0.515974,0.613636,0.593184,0.357955
11,36c4c908-00458.jpg,500,375,Car,11,481,93,263,0.492,0.474667,0.94,0.453333
13,dcc0dc62-00329.jpg,700,500,Car,0,186,109,205,0.132857,0.314,0.265714,0.192
14,dcc0dc62-00329.jpg,700,500,Car,70,270,113,208,0.242857,0.321,0.285714,0.19
15,dcc0dc62-00329.jpg,700,500,Truck,5,682,103,473,0.490714,0.576,0.967143,0.74


### Label Encoding

1. Person
2. Car
3. Truck
4. Semi
5. StopSign
6. SpeedLimitSign

In [20]:
# Create a "labels" dictionary, with each unique class having its own ID (Algorithmic way)
# classes = df["class"].unique()
# labels = {}

# for i in range(len(classes)):
#     labels[classes[i]] = i

#labels = {"Person": 0, "Car": 1, "Truck": 2, "Semi": 3, "RedLight": 4, "YellowLight": 5, "GreenLight": 6, "WaitLight": 7, "StopSign": 8, "SpeedLimitSign": 9}
labels = {"Person": 0, "Car": 1, "Truck": 2, "Semi": 3, "StopSign": 4, "SpeedLimitSign": 5}

# Function to get the ID of a specific object
def label_encoding(obj):
    return labels[obj]

label_encoding("Person")

0

In [21]:
train_df["id"] = train_df["class"].apply(label_encoding)
test_df["id"] = test_df["class"].apply(label_encoding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df["id"] = train_df["class"].apply(label_encoding)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["id"] = test_df["class"].apply(label_encoding)


In [22]:
train_df.head()

Unnamed: 0,file_name,width,height,class,x_min,x_max,y_min,y_max,center_x,center_y,w,h,id
0,501cac52-road-work-vertical.jpg,408,612,SpeedLimitSign,299,374,287,357,0.824755,0.526144,0.183824,0.114379,5
1,157c26be-00595.jpg,640,480,Truck,95,243,180,228,0.264062,0.425,0.23125,0.1,2
2,157c26be-00595.jpg,640,480,Car,52,143,188,234,0.152344,0.439583,0.142187,0.095833,1
3,157c26be-00595.jpg,640,480,Car,0,57,188,219,0.044531,0.423958,0.089063,0.064583,1
4,157c26be-00595.jpg,640,480,Truck,91,581,143,413,0.525,0.579167,0.765625,0.5625,2


### Save Images and Labels in Text

In [23]:
train_folder = "images_data/train"
test_folder = "images_data/test"

os.mkdir(train_folder)
os.mkdir(test_folder)

In [24]:
cols = ["file_name", "id", "center_x", "center_y", "w", "h"]
groupby_obj_train = train_df[cols].groupby("file_name")
groupby_obj_test = test_df[cols].groupby("file_name")

In [25]:
# Save each image in the train or test folder, and save respective labels in a text document
def save_data(filename, folder_path, group_obj):
    try:
        # Move image
        src = os.path.join("images_data", filename)
        dst = os.path.join(folder_path, filename)
        move(src, dst) # Move image from the source folder to the destination folder

        # Save the labels
        text_filename = os.path.join(folder_path, os.path.splitext(filename)[0] + ".txt")
        group_obj.get_group(filename).set_index("file_name").to_csv(text_filename, sep=" ", index=False, header=False)
    except FileNotFoundError:
        pass

In [26]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [27]:
filename_series.apply(save_data, args=(train_folder, groupby_obj_train))

0      None
1      None
2      None
3      None
4      None
       ... 
870    None
871    None
872    None
873    None
874    None
Length: 875, dtype: object

In [28]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data, args=(test_folder, groupby_obj_test))

0      None
1      None
2      None
3      None
4      None
       ... 
214    None
215    None
216    None
217    None
218    None
Length: 219, dtype: object

### Data Preparation Done!

In [29]:
df["class"].value_counts()

class
Car               790
Semi              351
Person            130
SpeedLimitSign    107
StopSign          106
Truck              87
Name: count, dtype: int64

In [30]:
list(labels.keys())

['Person', 'Car', 'Truck', 'Semi', 'StopSign', 'SpeedLimitSign']

1. Person ✅
2. Car ✅
3. Truck ✅
4. Semi ✅
5. RedLight
6. YellowLight
7. GreenLight
8. WaitLight
9. StopSign ✅
10. SpeedLimitSign ✅