In [1]:
!pip install boto3 sagemaker comet_ml torch torchvision ultralytics



# Install Libraries

In [2]:
import boto3
import os
import random
import shutil
from pathlib import Path
from datetime import datetime
from ultralytics import YOLO
import comet_ml

# Connect to S3

In [3]:
# Setup boto3 clients
s3 = boto3.client('s3')
ssm = boto3.client('ssm')

In [4]:
# Get parameters from SSM
def get_parameters():
    response = ssm.get_parameters(
        Names=[
            '/edge-ai/bucket-name',
            '/edge-ai/comet-ml-api-key'
        ],
        WithDecryption=True
    )
    return {param['Name'].split('/')[-1]: param['Value'] for param in response['Parameters']}

In [5]:
params = get_parameters()
BUCKET_NAME = params['bucket-name']
COMET_ML_API_KEY = params['comet-ml-api-key']

In [6]:
# S3 paths
s3_img_prefix = 'training_data/new_data/images/'
s3_lbl_prefix = 'training_data/new_data/txt_files/'

# Local directories
base_dir = Path('/home/ec2-user/SageMaker/tmp/datasets')
train_img_dir = base_dir / 'train/images'
train_lbl_dir = base_dir / 'train/labels'
val_img_dir = base_dir / 'val/images'
val_lbl_dir = base_dir / 'val/labels'

In [7]:
# Create folders
for path in [train_img_dir, train_lbl_dir, val_img_dir, val_lbl_dir]:
    path.mkdir(parents=True, exist_ok=True)

# Load New Image Data

In [8]:
# List images
img_objs = s3.list_objects_v2(Bucket=BUCKET_NAME, Prefix=s3_img_prefix).get('Contents', [])
img_keys = [obj['Key'] for obj in img_objs if obj['Key'].endswith(('.jpg', '.png'))]
print(f"Done number of images {len(img_keys)}")

Done number of images 11


# Preprocess Images

In [9]:
# Random split
random.shuffle(img_keys)
split_idx = int(len(img_keys) * 0.1)
val_keys = img_keys[:split_idx]
train_keys = img_keys[split_idx:]
print(val_keys)
print(train_keys)

['training_data/new_data/images/10_789_20250419155551.jpg']
['training_data/new_data/images/2_VRFReMfzWFVwbyL3cu5vcaQGu5N2_20250419162802.jpg', 'training_data/new_data/images/1_789_20250419120151.jpg', 'training_data/new_data/images/3_789_20250419121525.jpg', 'training_data/new_data/images/2_789_20250419121153.jpg', 'training_data/new_data/images/1_VRFReMfzWFVwbyL3cu5vcaQGu5N2_20250419162627.jpg', 'training_data/new_data/images/5_789_20250419121631.jpg', 'training_data/new_data/images/9_789_20250419155239.jpg', 'training_data/new_data/images/8_789_20250419153055.jpg', 'training_data/new_data/images/11_789_20250419160133.jpg', 'training_data/new_data/images/4_789_20250419121603.jpg']


In [10]:
def download_and_place(keys, img_dest, lbl_dest):
    for key in keys:
        filename = os.path.basename(key)
        label_filename = filename.rsplit('.', 1)[0] + '.txt'
        label_key = s3_lbl_prefix + label_filename

        # Download image
        s3.download_file(BUCKET_NAME, key, str(img_dest / filename))
        
        # Download label if exists
        try:
            s3.download_file(BUCKET_NAME, label_key, str(lbl_dest / label_filename))
        except:
            print(f"Label file not found for {filename}, skipping.")

In [11]:
# Download images & labels
download_and_place(train_keys, train_img_dir, train_lbl_dir)
download_and_place(val_keys, val_img_dir, val_lbl_dir)

print("✅ Data prepared in /tmp/datasets/")

✅ Data prepared in /tmp/datasets/


# Load yaml file

In [12]:
yaml_key = 'training_data/new_data/data.yaml'

# Local path
local_yaml_path = Path('/home/ec2-user/SageMaker/tmp/datasets/data.yaml')
local_yaml_path.parent.mkdir(parents=True, exist_ok=True)

# Download data.yaml
s3.download_file(Bucket=BUCKET_NAME, Key=yaml_key, Filename=str(local_yaml_path))

print(f"✅ Downloaded data.yaml to {local_yaml_path}")

✅ Downloaded data.yaml to /home/ec2-user/SageMaker/tmp/datasets/data.yaml


# Load Latest Model

In [13]:
# List all model files and find the latest one
def get_latest_model_key(bucket_name):
    paginator = s3.get_paginator('list_objects_v2')
    result = paginator.paginate(Bucket=bucket_name, Prefix='models/')

    latest_key = None
    latest_time = datetime.min

    for page in result:
        for obj in page.get('Contents', []):
            key = obj['Key']
            if key.endswith('last.pt'):
                try:
                    parts = key.split('/')
                    date_str = f"{parts[1]}-{parts[2]}-{parts[3]}"
                    obj_date = datetime.strptime(date_str, '%Y-%m-%d')

                    if obj_date > latest_time:
                        latest_time = obj_date
                        latest_key = key
                except (IndexError, ValueError):
                    continue

    return latest_key

# Get the latest model key
latest_model_key = get_latest_model_key(BUCKET_NAME)

if latest_model_key:
    local_model_path = Path('./tmp/datasets/latest_model.pt')
    s3.download_file(BUCKET_NAME, latest_model_key, str(local_model_path))
    print(f"✅ Downloaded latest model to {local_model_path} from s3 location: {latest_model_key}")
else:
    print("❌ No model file found.")

✅ Downloaded latest model to tmp/datasets/latest_model.pt from s3 location: models/2025/04/19/last.pt


# Retrain Model

In [14]:
comet_ml_api_key = COMET_ML_API_KEY

In [15]:
# Set your Comet Api Key
!export COMET_API_KEY=comet_ml_api_key

In [16]:
comet_ml.login(project_name="IoT")

In [17]:
# Load a model
model = YOLO(local_model_path)  # load a pretrained model (recommended for training)

# Train the model
results = model.train(
    data=local_yaml_path,
    epochs=10,
    imgsz=640,
    batch=8,
    project="IoT",
    save_period=1,
    save_json=True,
)

Ultralytics 8.3.111 🚀 Python-3.10.16 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 14918MiB)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=tmp/datasets/latest_model.pt, data=/home/ec2-user/SageMaker/tmp/datasets/data.yaml, epochs=10, time=None, patience=100, batch=8, imgsz=640, save=True, save_period=1, cache=False, device=None, workers=8, project=IoT, name=train, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=True, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_txt=False, save_conf=False, save_crop=False, show_labels=Tr

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/ranxdug/iot/0f124309fb8141db9be2e92beb9b0d26

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/home/ec2-user/SageMaker' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Freezing layer 'model.23.dfl.conv.weight'
[34m[1mAMP: [0mrunning Automatic Mixed Precision (AMP) checks...
Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n.pt to 'yolo11n.pt'...


100%|██████████| 5.35M/5.35M [00:00<00:00, 225MB/s]


[34m[1mAMP: [0mchecks passed ✅
[34m[1mtrain: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 1721.1±451.5 MB/s, size: 53.4 KB)


[34m[1mtrain: [0mScanning /home/ec2-user/SageMaker/tmp/datasets/train/labels... 10 images, 0 backgrounds, 0 corrupt: 100%|██████████| 10/10 [00:00<00:00, 673.64it/s]

[34m[1mtrain: [0mNew cache created: /home/ec2-user/SageMaker/tmp/datasets/train/labels.cache





[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 589.8±0.0 MB/s, size: 56.4 KB)


[34m[1mval: [0mScanning /home/ec2-user/SageMaker/tmp/datasets/val/labels... 1 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1/1 [00:00<00:00, 6831.11it/s]

[34m[1mval: [0mNew cache created: /home/ec2-user/SageMaker/tmp/datasets/val/labels.cache





Plotting labels to IoT/train/labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.000714, momentum=0.9) with parameter groups 185 weight(decay=0.0), 198 weight(decay=0.0005), 197 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 4 dataloader workers
Logging results to [1mIoT/train[0m
Starting training for 10 epochs...
Closing dataloader mosaic

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       1/10      8.82G      3.889      11.29      3.814          4        640: 100%|██████████| 2/2 [00:01<00:00,  1.31it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 1/1 [00:00<00:00,  7.55it/s]

                   all          1          3          0          0          0          0






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       2/10      8.85G      5.046      10.33      4.612          4        640: 100%|██████████| 2/2 [00:00<00:00,  2.34it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 1/1 [00:00<00:00, 18.74it/s]

                   all          1          3          0          0          0          0






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       3/10      8.97G      4.707      11.64       3.88          3        640: 100%|██████████| 2/2 [00:00<00:00,  2.19it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 1/1 [00:00<00:00, 21.28it/s]

                   all          1          3          0          0          0          0






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       4/10      9.08G      4.713      10.09      4.449          3        640: 100%|██████████| 2/2 [00:00<00:00,  2.31it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 1/1 [00:00<00:00, 22.27it/s]

                   all          1          3          0          0          0          0






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       5/10      9.15G      3.681       10.2      3.538          5        640: 100%|██████████| 2/2 [00:00<00:00,  2.45it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 1/1 [00:00<00:00, 22.27it/s]

                   all          1          3          0          0          0          0






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       6/10      9.15G      4.112      10.43      3.432          4        640: 100%|██████████| 2/2 [00:00<00:00,  2.45it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 1/1 [00:00<00:00, 21.27it/s]

                   all          1          3          0          0          0          0






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       7/10      9.15G      4.228      9.928      4.037          5        640: 100%|██████████| 2/2 [00:00<00:00,  2.45it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 1/1 [00:00<00:00, 20.58it/s]

                   all          1          3          0          0          0          0






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       8/10      9.15G      4.301      9.129      3.569          5        640: 100%|██████████| 2/2 [00:00<00:00,  2.44it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 1/1 [00:00<00:00, 21.94it/s]

                   all          1          3          0          0          0          0






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       9/10      9.15G      4.561      8.945      3.559          3        640: 100%|██████████| 2/2 [00:00<00:00,  2.46it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 1/1 [00:00<00:00, 18.14it/s]

                   all          1          3          0          0          0          0






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      10/10      9.15G      3.646      8.714      3.093          4        640: 100%|██████████| 2/2 [00:00<00:00,  2.44it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 1/1 [00:00<00:00, 19.92it/s]

                   all          1          3          0          0          0          0






10 epochs completed in 0.013 hours.
Optimizer stripped from IoT/train/weights/last.pt, 64.1MB
Optimizer stripped from IoT/train/weights/best.pt, 64.1MB

Validating IoT/train/weights/best.pt...
Ultralytics 8.3.111 🚀 Python-3.10.16 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 14918MiB)
YOLOv10x summary (fused): 192 layers, 29,406,158 parameters, 0 gradients, 160.0 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 1/1 [00:00<00:00, 49.62it/s]


                   all          1          3          0          0          0          0
                   PCT          1          1          0          0          0          0
            Free_L_Max          1          1          0          0          0          0
   Not_Free_Center_Max          1          1          0          0          0          0
Speed: 0.3ms preprocess, 15.8ms inference, 0.0ms loss, 0.2ms postprocess per image
Saving IoT/train/predictions.json...
Results saved to [1mIoT/train[0m


[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : constitutional_brush_7065
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/ranxdug/iot/0f124309fb8141db9be2e92beb9b0d26
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     lr/pg0 [11]             : (7.14e-06, 3.96627e-05)
[1;38;5;39mCOMET INFO:[0m     lr/pg1 [11]             : (7.14e-06, 3.96627e-05)
[1;38;5;39mCOMET INFO:[0m     lr/pg2 [11]             : (7.14e-06, 3.96627e-05)
[1;38;5;39mCOMET INFO:[0m     metrics/mAP50(B)        : 0.0
[1;38;5;39mCOMET INFO:[0m     metrics/mAP50-9

# Test Model Accuracy

In [18]:
# Validate the model
metrics = model.val()  # no arguments needed, dataset and settings remembered
metrics.box.map  # map50-95
metrics.box.map50  # map50
metrics.box.map75  # map75
metrics.box.maps  # a list contains map50-95 of each category

Ultralytics 8.3.111 🚀 Python-3.10.16 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 14918MiB)
YOLOv10x summary (fused): 192 layers, 29,406,158 parameters, 0 gradients, 160.0 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 1807.4±0.0 MB/s, size: 56.4 KB)


[34m[1mval: [0mScanning /home/ec2-user/SageMaker/tmp/datasets/val/labels.cache... 1 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1/1 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 1/1 [00:00<00:00, 18.66it/s]


                   all          1          3          0          0          0          0
                   PCT          1          1          0          0          0          0
            Free_L_Max          1          1          0          0          0          0
   Not_Free_Center_Max          1          1          0          0          0          0
Speed: 0.6ms preprocess, 45.2ms inference, 0.0ms loss, 0.2ms postprocess per image
Saving IoT/train2/predictions.json...
Results saved to [1mIoT/train2[0m


array([          0,           0,           0,           0,           0,           0,           0,           0,           0,           0])

# Save best.pt and last.pt to S3

In [19]:
# Define the local model path (update if you saved elsewhere)
MODEL_DIR = "/home/ec2-user/SageMaker/IoT/train/weights"
BEST_MODEL = os.path.join(MODEL_DIR, "best.pt")
LAST_MODEL = os.path.join(MODEL_DIR, "last.pt")

# Create destination S3 path using current date
now = datetime.now()
s3_prefix = f"models/{now.year}/{now.month:02}/{now.day:02}"

# Upload files
def upload_model(file_path, file_name):
    s3_path = f"{s3_prefix}/{file_name}"
    s3.upload_file(file_path, BUCKET_NAME, s3_path)
    print(f"✅ Uploaded {file_name} to s3://{BUCKET_NAME}/{s3_path}")

upload_model(BEST_MODEL, "best.pt")
upload_model(LAST_MODEL, "last.pt")

✅ Uploaded best.pt to s3://edge-ai-s3/models/2025/04/19/best.pt
✅ Uploaded last.pt to s3://edge-ai-s3/models/2025/04/19/last.pt


In [20]:
def move_images_to_all_data():
    """ 
    Move all the images that have been trained to the training_data/all_data/yyyy/MM/dd
    """
    date_path = datetime.utcnow().strftime('%Y/%m/%d')
    destination_prefix = f"training_data/all_data/{date_path}/"

    folders_to_move = {
        'images': 'training_data/new_data/images/',
        'txt_files': 'training_data/new_data/txt_files/'
    }

    for file_type, prefix in folders_to_move.items():
        response = s3.list_objects_v2(Bucket=BUCKET_NAME, Prefix=prefix)

        if 'Contents' not in response:
            print(f"No files found in {prefix}")
            continue

        for obj in response['Contents']:
            source_key = obj['Key']
            if source_key.endswith('/'):
                continue  # skip folders

            file_name = source_key.split('/')[-1]
            destination_key = f"{destination_prefix}{file_name}"

            # Copy the object
            s3.copy_object(
                Bucket=BUCKET_NAME,
                CopySource={'Bucket': BUCKET_NAME, 'Key': source_key},
                Key=destination_key
            )

            # Delete the original object
            s3.delete_object(Bucket=BUCKET_NAME, Key=source_key)
            print(f"Moved: {source_key} -> {destination_key}")

In [21]:
move_images_to_all_data()

Moved: training_data/new_data/images/10_789_20250419155551.jpg -> training_data/all_data/2025/04/19/10_789_20250419155551.jpg
Moved: training_data/new_data/images/11_789_20250419160133.jpg -> training_data/all_data/2025/04/19/11_789_20250419160133.jpg
Moved: training_data/new_data/images/1_789_20250419120151.jpg -> training_data/all_data/2025/04/19/1_789_20250419120151.jpg
Moved: training_data/new_data/images/1_VRFReMfzWFVwbyL3cu5vcaQGu5N2_20250419162627.jpg -> training_data/all_data/2025/04/19/1_VRFReMfzWFVwbyL3cu5vcaQGu5N2_20250419162627.jpg
Moved: training_data/new_data/images/2_789_20250419121153.jpg -> training_data/all_data/2025/04/19/2_789_20250419121153.jpg
Moved: training_data/new_data/images/2_VRFReMfzWFVwbyL3cu5vcaQGu5N2_20250419162802.jpg -> training_data/all_data/2025/04/19/2_VRFReMfzWFVwbyL3cu5vcaQGu5N2_20250419162802.jpg
Moved: training_data/new_data/images/3_789_20250419121525.jpg -> training_data/all_data/2025/04/19/3_789_20250419121525.jpg
Moved: training_data/new_dat

In [22]:
# Clean training runs
!rm -rf /home/ec2-user/SageMaker/IoT

# Clean temporary datasets
!rm -rf /home/ec2-user/SageMaker/tmp

# Clean SageMaker Trash completely
!rm -rf /home/ec2-user/SageMaker/yolo11n.pt