In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# !git clone https://github.com/ultralytics/yolov5

Cloning into 'yolov5'...
remote: Enumerating objects: 16026, done.[K
remote: Counting objects: 100% (59/59), done.[K
remote: Compressing objects: 100% (34/34), done.[K
remote: Total 16026 (delta 33), reused 41 (delta 25), pack-reused 15967[K
Receiving objects: 100% (16026/16026), 14.68 MiB | 2.67 MiB/s, done.
Resolving deltas: 100% (10999/10999), done.


In [3]:
# !pip install -U -r yolov5/requirements.txt

Collecting gitpython>=3.1.30 (from -r yolov5/requirements.txt (line 5))
  Downloading GitPython-3.1.40-py3-none-any.whl (190 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/190.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m184.3/190.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting matplotlib>=3.3 (from -r yolov5/requirements.txt (line 6))
  Downloading matplotlib-3.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
Collecting numpy>=1.22.2 (from -r yolov5/requirements.txt (line 7))
  Downloading numpy-1.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━

### Prepare Data

In [1]:
import os
import pandas as pd
from PIL import Image
from tqdm import tqdm
import shutil

In [2]:
# Read Data from drive
files_dir = '/content/drive/MyDrive/Capstone_GE_DSI_CV_Project/preprocessed_data/lung_cancer/data'
# test_dir = '/content/drive/MyDrive/Capstone_GE_DSI_CV_Project/preprocessed_data/lung_cancer/test'
labels_dir = '/content/drive/MyDrive/Capstone_GE_DSI_CV_Project/preprocessed_data/lung_cancer/labels/labels.csv'

In [3]:
labels = pd.read_csv('/content/drive/MyDrive/Capstone_GE_DSI_CV_Project/preprocessed_data/lung_cancer/labels/labels.csv', header=None)

In [4]:
labels.head(5)

Unnamed: 0,0,1,2,3,4,5
0,PETCT_55ae7986e1_axial_357.jpg,205,165,207,169,lung
1,PETCT_55ae7986e1_axial_358.jpg,203,163,208,170,lung
2,PETCT_55ae7986e1_axial_359.jpg,203,163,208,170,lung
3,PETCT_55ae7986e1_axial_360.jpg,203,164,208,170,lung
4,PETCT_3b26172779_axial_123.jpg,153,210,163,217,lung


In [5]:
img_names = labels.iloc[:, 0].to_list()

In [6]:
def convert_to_yolo_format(label, x_left, x_right, y_left, y_right, image_width, image_height):
    # Calculate the center coordinates
    x_center = (x_left + x_right) / 2.0
    y_center = (y_left + y_right) / 2.0

    # Calculate the width and height of the bounding box
    w = x_right - x_left
    h = y_right - y_left

    # Normalize the coordinates and dimensions relative to the image width and height
    x_center /= image_width
    y_center /= image_height
    w /= image_width
    h /= image_height

    # Return the YOLO-formatted string
    yolo_format = f"{label} {x_center} {y_center} {w} {h}"

    return yolo_format

In [7]:
convert_to_yolo_format(1,194 , 203, 174, 183, 400, 400)

'1 0.49625 0.44625 0.0225 0.0225'

In [8]:
!pip install python-magic



In [9]:
import magic
import regex as re

t = magic.from_file(os.path.join(files_dir, img_names[0]))
print(t)
t.split(",")[-2].strip().split("x")
# re.search('(\d+) x (\d+)', t).groups()

JPEG image data, JFIF standard 1.01, aspect ratio, density 1x1, segment length 16, baseline, precision 8, 408x408, components 3


['408', '408']

In [10]:
def get_dims(path_to_img):
  t = magic.from_file(path_to_img)
  dims = t.split(",")[-2].strip().split("x")
  return [float(i) for i in dims]

In [11]:
get_dims(os.path.join(files_dir, img_names[10000]))

[408.0, 408.0]

In [12]:
im_dim_dict = {}

In [13]:
# store labels in nested structure
lab_dir_write = '/content/labels'

for elem in tqdm(img_names):
  row = labels.loc[labels[0] == elem]
  row = row.values[0]

  row[2], row[3] = row[3], row[2]
  if row[1] > row[2]:
    row[1], row[2] = row[2], row[1]

  if row[3] > row[4]:
    row[3], row[4] = row[4], row[3]

  # if elem not in im_dim_dict:
  #   # im_dim_dict[elem] = Image.open(os.path.join(files_dir, elem)).size
  #   im_dim_dict[elem] = get_dims(os.path.join(files_dir, elem))

  im_width, im_height = 408, 408

  yolo_format = convert_to_yolo_format(0, row[1], row[2], row[3], row[4], im_width, im_height)

  elem = elem.replace(".jpg", ".txt")
  with open(os.path.join(lab_dir_write, elem), 'w') as f:
    f.write(yolo_format)

100%|██████████| 14328/14328 [00:39<00:00, 366.66it/s]


In [14]:
# label_folder = '/content/drive/MyDrive/Capstone_GE_DSI_CV_Project/preprocessed_data/yolov5/yolov5_lung_cancer_labels'

# for label_file in tqdm(os.listdir(label_folder)):
#   # print(label_file)
#   # print(label_file[6:])

#   if label_file[:6] == 'labels':
#     !mv {os.path.join(label_folder, label_file)} {os.path.join(label_folder, label_file[6:])}

#   # break

In [None]:
org_img_folder = '/content/drive/MyDrive/Capstone_GE_DSI_CV_Project/preprocessed_data/lung_cancer/data'
labels_dir = '/content/drive/MyDrive/Capstone_GE_DSI_CV_Project/preprocessed_data/yolov5/yolov5_lung_cancer_labels'
yolo_folder = '/content/drive/MyDrive/Capstone_GE_DSI_CV_Project/preprocessed_data/yolov5'
yolo_data_folder = os.path.join(yolo_folder, 'data')

In [16]:
import os

# Create a folder structure for YOLOv5 training
if not os.path.exists('data'):
    for folder in ['images', 'labels']:
        for split in ['train', 'val', 'test']:
            os.makedirs(f'data/{folder}/{split}')

In [18]:
def split_dataset(image_names):
  # Get unique users
  unique_users = list(set([i.split('_')[1] for i in image_names]))

  # Assign in 60 - 20 - 20 split
  total_length = len(unique_users)
  list1_length = int(total_length * 0.6)
  list2_length = int(total_length * 0.2)

  # Use list slicing to split the original list
  train = unique_users[:list1_length]
  val = unique_users[list1_length:(list1_length + list2_length)]
  test = unique_users[(list1_length + list2_length):]


  print(len(unique_users), len(train), len(val), len(test))

  for image_name in tqdm(image_names):
      # Label filename
      label_name = image_name.replace('.jpg', '.txt')

      # Get the user id
      user_id = label_name.split("_")[1]
      split = None
      # Split into train, val, or test
      if user_id in train:
          split = 'train'
      elif user_id in val:
          split = 'val'
      elif user_id in test:
          split = 'test'

      if split:
        # Source paths
        source_image_path = "/content/dataset/"+image_name
        source_label_path = "/content/labels/"+label_name

        # Destination paths
        target_image_folder = f'data/images/{split}'
        target_label_folder = f'data/labels/{split}'

        # Copy files
        # shutil.copy(source_image_path, target_image_folder)
        # shutil.copy(source_label_path, target_label_folder)
        !cp {source_image_path} {os.path.join(target_image_folder, image_name)}
        !cp {source_label_path} {os.path.join(target_label_folder, label_name)}


In [15]:
# shutil.rmtree('/content/data/')

In [29]:
# !gdown /content/drive/MyDrive/Capstone_GE_DSI_CV_Project/preprocessed_data/lung_cancer/data
! cp -r /content/drive/MyDrive/Capstone_GE_DSI_CV_Project/preprocessed_data/lung_cancer/data /content/dataset

In [19]:
split_dataset(img_names)

166 99 33 34


100%|██████████| 14328/14328 [49:43<00:00,  4.80it/s]


In [None]:
# def progress_callback(current, total):
#     # Called with the current and total number of bytes copied
#     print("Copied {} of {} bytes".format(current, total))

# # shutil.copytree(src, dst, dirs_exist_ok=True)

# shutil.copytree('/content/data', '/content/drive/MyDrive/CapstoneYoloData', dirs_exist_ok=True)

KeyboardInterrupt: ignored

In [20]:
!python yolov5/train.py --data lc_yolo2.yaml --weights yolov5s.pt --epochs 10 --batch 16 --freeze 10

2023-10-27 23:50:11.513182: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-10-27 23:50:11.513238: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-10-27 23:50:11.513285: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mtrain: [0mweights=yolov5s.pt, cfg=, data=lc_yolo2.yaml, hyp=yolov5/data/hyps/hyp.scratch-low.yaml, epochs=10, batch_size=16, imgsz=640, rect=False, resume=False, nosave=False, noval=False, noautoanchor=False, noplots=False, evolve=None, bucket=, cache=None, image_weights=False, device=, multi_scale=False, single_cls=False, optimizer=SGD, sync_bn=Fals

In [21]:
!python yolov5/val.py --data lc_yolo2.yaml --weights /content/yolov5/runs/train/exp/weights/best.pt

[34m[1mval: [0mdata=lc_yolo2.yaml, weights=['/content/yolov5/runs/train/exp/weights/best.pt'], batch_size=32, imgsz=640, conf_thres=0.001, iou_thres=0.6, max_det=300, task=val, device=, workers=8, single_cls=False, augment=False, verbose=False, save_txt=False, save_hybrid=False, save_conf=False, save_json=False, project=yolov5/runs/val, name=exp, exist_ok=False, half=False, dnn=False
YOLOv5 🚀 v7.0-230-g53efd07 Python-3.10.12 torch-2.1.0+cu118 CUDA:0 (Tesla T4, 15102MiB)

Fusing layers... 
Model summary: 157 layers, 7012822 parameters, 0 gradients, 15.8 GFLOPs
[34m[1mval: [0mScanning /content/data/labels/val.cache... 2471 images, 0 backgrounds, 0 corrupt: 100% 2471/2471 [00:00<?, ?it/s]
                 Class     Images  Instances          P          R      mAP50   mAP50-95: 100% 78/78 [00:36<00:00,  2.11it/s]
                   all       2471       2471      0.421      0.409      0.342      0.137
Speed: 0.3ms pre-process, 6.3ms inference, 2.0ms NMS per image at shape (32, 3, 640,