# Setup and Imports

In [None]:
# Connect to Drive
# Mounts Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Folder contains all images with front-view of box and corresponding xml files
# The cells below use relative glob patterns ('*.xml', '*.jpg'), so the working
# directory has to be this folder before they are run.
# NOTE(review): the recorded output of this cell resolves to
# .../DeepVis/Data/Camfront_all (a Drive shortcut target), not the path typed
# here -- confirm the path still points at the intended data.
%cd /content/drive/MyDrive/DeepVis/ProjectSubmissionCode/DataPreparation_for_ImageClassification/Camfront_all

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1ynjnztWTKNm61JB9EqcrKvlbbIcz0MRu/DeepVis/Data/Camfront_all


In [None]:
!pip install xmltodict

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting xmltodict
  Downloading xmltodict-0.13.0-py2.py3-none-any.whl (10.0 kB)
Installing collected packages: xmltodict
Successfully installed xmltodict-0.13.0


In [None]:
import xmltodict
import cv2 as cv
import numpy as np
from google.colab.patches import cv2_imshow
import glob, os
import pandas as pd

# Crop Components

In [None]:
# Read all xml files in the current directory as python dicts and append to list.
# Result: xml_list holds one parsed dict per *.xml file.
xml_list = []

for xml_path in glob.glob('*.xml'):
  # Binary mode: the XML parser then decodes the bytes itself, using the encoding
  # declared in the file, instead of whatever the platform's default text encoding is.
  # A separate name for the handle avoids rebinding the loop variable.
  with open(xml_path, 'rb') as xml_handle:
    xml_list.append(xmltodict.parse(xml_handle.read()))

In [None]:
# Store cropped images and labels in separate lists.
# Both lists are filled in lockstep, so cropped[k] is the image patch that
# belongs to the component name labels[k].
cropped = []
labels = []

# Group the parsed annotations by the image filename they refer to, so that every
# image needs a single dict lookup instead of a scan over the whole xml_list.
annotations_by_image = {}
for xml_file in xml_list:
  annotations_by_image.setdefault(xml_file['annotation']['filename'], []).append(xml_file)

# sorted() makes the order of the crops reproducible between runs; the order
# returned by glob itself depends on the filesystem.
for filename in sorted(glob.glob('*.jpg')):
  matching_annotations = annotations_by_image.get(filename)
  if not matching_annotations:
    # No annotation refers to this image, so there is nothing to crop.
    continue

  img = cv.imread(filename)
  if img is None:
    # cv.imread signals an unreadable/corrupt file by returning None instead of raising.
    print(f'Skipping {filename}: image could not be read')
    continue

  for xml_file in matching_annotations:
    # xmltodict yields a list when <object> occurs several times, a single dict
    # when it occurs exactly once, and no key at all when it is absent.
    # Normalise all three cases to a list.
    components = xml_file['annotation'].get('object') or []
    if isinstance(components, dict):
      components = [components]

    # Loop through all hardware components of the xml-file
    for component in components:
      # Extract bounding box of component (pixel coordinates stored as strings)
      box = component['bndbox']
      xmin, xmax = int(box['xmin']), int(box['xmax'])
      ymin, ymax = int(box['ymin']), int(box['ymax'])

      # Append label and crop together so the two lists cannot get out of step.
      labels.append(component['name'])

      # Crop image to fit component. .copy() detaches the patch from the source
      # image; a plain slice is a NumPy view that would keep the whole image in
      # memory for as long as the crop is stored.
      cropped.append(img[ymin:ymax, xmin:xmax].copy())

In [None]:
# 'CamFront' is not a real component, so drop every label/crop pair carrying
# that name. Label and crop are filtered together to keep both lists aligned.
kept_pairs = [
  (component_name, patch)
  for component_name, patch in zip(labels, cropped)
  if component_name != 'CamFront'
]

labels_clean = [component_name for component_name, _ in kept_pairs]
cropped_clean = [patch for _, patch in kept_pairs]

In [None]:
# Tally how many crops exist per component label.
# Keys are inserted in order of first appearance in labels_clean.
label_dict = {}
for label in labels_clean:
  label_dict[label] = label_dict.get(label, 0) + 1

# Every distinct label is one key, so the key count is the number of unique components.
comp_count = len(label_dict)

print(f'Unique components: {comp_count}')
print(f'Total number of cropped images exported: {len(cropped_clean)}')
print('----------------------')

# One line per component: "<label>: <number of crops>"
for label, _ in label_dict.items():
  print(f'{label}: {label_dict[label]}')

Unique components: 24
Total number of cropped images exported: 4762
----------------------
Schraube: 605
Display_Port: 199
USB_2: 491
USB_3: 250
Ethernet: 305
Powerstecker_on: 49
Schraube_rund: 534
Schraube_abstand: 435
COM_male: 106
VGA: 55
LED_PWR: 199
LED_SSD: 55
Schraube_gespiegelt: 452
LOGO_Kontron_font: 106
Reset_button: 199
Powerstecker_off: 57
Schraube_halb: 115
Loch: 76
LED_HDD: 51
LOGO_Kontron_icon: 51
Power_5V: 93
Line_out: 93
LOGO_AllenBradley_icon: 93
LOGO_AllenBradley_font: 93


# Write Cropped JPGs

In [None]:
# Export cropped images of hardware features; the label is included in the new filename.
#
# --------> SET WRITE_CROPS = True TO WRITE THE JPGs <--------
# It defaults to False so that running the whole notebook writes nothing.
WRITE_CROPS = False

# --------> CHANGE OUTPUT_DIR TO WHERE YOU WANT TO WRITE CROPPED IMAGES <--------
# '.' means the current working directory.
OUTPUT_DIR = '.'

if WRITE_CROPS:
  os.makedirs(OUTPUT_DIR, exist_ok=True)

# count is a running index appended to every filename. It prevents duplicate
# filenames, as those would just be overwritten.
count = 0
for label, crop in zip(labels_clean, cropped_clean):
  if WRITE_CROPS:
    out_path = os.path.join(OUTPUT_DIR, f'{label}_{count}.jpg')
    if crop.size == 0:
      # A bounding box with zero width or height gives an empty array, which
      # cv.imwrite cannot encode.
      print(f'Skipping {out_path}: empty crop')
    elif not cv.imwrite(out_path, crop):
      # cv.imwrite reports failure through its return value, not an exception.
      print(f'Could not write {out_path}')

  count += 1