# Hand keypoints pose estimation — dataset reference: https://docs.ultralytics.com/datasets/pose/hand-keypoints/#dataset-yaml

In [1]:
import IPython
import sys

def clean_notebook():
    """Clear the output of the current cell and print a short confirmation.

    ``wait=True`` is forwarded to IPython's ``clear_output``; the
    confirmation message is printed immediately afterwards.
    """
    clear = IPython.display.clear_output
    clear(wait=True)
    print("Notebook cleaned.")

# Run the installation commands
!pip install opendatasets
!pip install ultralytics

# Clean up the notebook
clean_notebook()

Notebook cleaned.


In [2]:

# Environment sanity check: per the recorded output, this prints the
# Ultralytics / Python / torch versions, the CUDA device, and CPU / RAM / disk
# figures, followed by "Setup complete".
import ultralytics
ultralytics.checks()

Ultralytics 8.3.47 🚀 Python-3.10.12 torch-2.5.1+cu121 CUDA:0 (NVIDIA A100-SXM4-80GB MIG 7g.80gb, 81051MiB)
Setup complete ✅ (256 CPUs, 2015.7 GB RAM, 288.3/3519.1 GB disk)


In [None]:
from ultralytics import YOLO
from IPython.display import Image


# Start from pretrained pose weights (recommended over training from scratch)
model = YOLO("yolo11n-pose.pt")

# All training hyper-parameters gathered in one place so they are easy to tune.
# NOTE(review): the logged config shows exist_ok=False, so re-running this cell
# may write to a different run folder than the "runs/pose/hand" paths read by
# later cells — confirm before relying on a re-run.
train_config = dict(
    data="hand-keypoints.yaml",  # Dataset YAML file
    epochs=40,                   # Number of training epochs
    imgsz=640,                   # Image size (resolution)
    batch=100,                   # Batch size
    device=0,                    # GPU index (0) or 'cpu'
    lr0=0.001,                   # Initial learning rate
    optimizer="Adam",            # Optimizer (e.g., 'SGD', 'Adam')
    workers=30,                  # Number of dataloader workers
    seed=42,                     # Random seed for reproducibility
    patience=10,                 # Early stopping patience (number of epochs)
    weight_decay=0.0005,         # Weight decay for regularization
    momentum=0.937,              # Also handed to Adam (the optimizer log prints "Adam(lr=0.001, momentum=0.937)")
    name="hand",                 # Name of the experiment folder
    verbose=False,               # Verbose flag turned off
)

# Launch training with the configuration above
results = model.train(**train_config)


Ultralytics 8.3.47 🚀 Python-3.10.12 torch-2.5.1+cu121 CUDA:0 (NVIDIA A100-SXM4-80GB MIG 7g.80gb, 81051MiB)
[34m[1mengine/trainer: [0mtask=pose, mode=train, model=yolo11n-pose.pt, data=hand-keypoints.yaml, epochs=40, time=None, patience=10, batch=100, imgsz=640, save=True, save_period=-1, cache=False, device=0, workers=30, project=None, name=hand, exist_ok=False, pretrained=True, optimizer=Adam, verbose=False, seed=42, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_txt=False, save_conf=False, save_crop=False, show_labels

[34m[1mtrain: [0mScanning /workspace/code/week07/datasets/hand-keypoints/train/labels.cache... 18776 images, 0 backgrounds, 491 corrupt: 100%|██████████| 18776/18776 [00:00<?, ?it/s]




[34m[1mval: [0mScanning /workspace/code/week07/datasets/hand-keypoints/val/labels.cache... 7992 images, 0 backgrounds, 184 corrupt: 100%|██████████| 7992/7992 [00:00<?, ?it/s]






Plotting labels to runs/pose/hand/labels.jpg... 
[34m[1moptimizer:[0m Adam(lr=0.001, momentum=0.937) with parameter groups 87 weight(decay=0.0), 97 weight(decay=0.00078125), 96 bias(decay=0.0)
[34m[1mTensorBoard: [0mmodel graph visualization added ✅
Image sizes 640 train, 640 val
Using 30 dataloader workers
Logging results to [1mruns/pose/hand[0m
Starting training for 40 epochs...

      Epoch    GPU_mem   box_loss  pose_loss  kobj_loss   cls_loss   dfl_loss  Instances       Size


       1/40        15G       1.06      10.18     0.6658     0.9737      1.398        206        640:  69%|██████▉   | 127/183 [05:25<03:24,  3.66s/it]

In [None]:
# Display results.png from the "hand" run folder, scaled to 1200 px wide
results_plot_path = "./runs/pose/hand/results.png"
Image(results_plot_path, width=1200)

In [None]:
from ultralytics import YOLO
from IPython.display import Image


# NOTE(review): this cell loads tiger-pose weights and a tiger-pose image,
# whereas the training cell above produces a "hand" run — confirm that mixing
# the two is intentional and that these files exist in a fresh environment.
weights_path = "./runs/pose/tiger/weights/best.pt"
sample_image = "./datasets/tiger-pose/val/images/Frame_235.jpg"

# Load the tiger-pose trained checkpoint
model = YOLO(weights_path)

# Run pose estimation on the sample image and show the annotated first result
results = model(sample_image)
results[0].show()

In [None]:
# Inspect the raw keypoint data attached to the first prediction result
first_result = results[0]
first_result.keypoints.data

# Pose Estimation with OpenCV

In [None]:
import cv2
import matplotlib.pyplot as plt

# Draw every predicted keypoint, labelled with its index, on a sample image.
image_path = "./datasets/tiger-pose/val/images/Frame_235.jpg"

# cv2.imread reports a missing or unreadable file by returning None rather
# than raising, so fail here with a clear message instead of inside cvtColor.
image = cv2.imread(image_path)
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")

# Ultralytics treats NumPy inputs as BGR, so predict on the array exactly as
# OpenCV loaded it. The RGB copy is only used for drawing and for Matplotlib.
results = model(image, verbose=False)
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# One entry per detected instance; each keypoint row is (x, y) or
# (x, y, confidence) depending on the model's keypoint shape.
for instance in results[0].keypoints.data.cpu().numpy():
    for idx, keypoint in enumerate(instance):
        x, y = keypoint[:2]  # keep only the coordinates, whatever the row width
        center = (int(x), int(y))
        cv2.circle(image_rgb, center, radius=10, color=(0, 255, 0), thickness=-1)
        cv2.putText(image_rgb, str(idx), center, cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)

# Display the annotated image using Matplotlib
plt.figure(figsize=(10, 10))
plt.imshow(image_rgb)
plt.axis('off')  # Hide axes
plt.show()

# Pose Estimation on a YouTube Video Stream

In [None]:
import cv2
from IPython.display import display, Image, clear_output
from pytubefix import YouTube
from ultralytics import YOLO

# Stream a YouTube video through the hand-pose model and render each annotated
# frame inline in the notebook, every new frame replacing the previous one.

# Set the desired width for resizing frames
set_width = 800

# Load the weights saved by the "hand" training run
model = YOLO("./runs/pose/hand/weights/best.pt")  # Replace with your pose estimation model

# YouTube video URL
video_url = "https://youtu.be/qm2-kiYSzSs?si=Oa4gU8Ev-wnpzEmd"

# Resolve a progressive MP4 stream for the video using pytubefix
yt = YouTube(video_url)
video_stream = yt.streams.filter(file_extension='mp4', progressive=True).first()

if not video_stream:
    print("No compatible video stream found.")
    raise RuntimeError("Failed to fetch video stream.")

# Get the stream URL
stream_url = video_stream.url

# Open the YouTube stream in OpenCV
cap = cv2.VideoCapture(stream_url)

if not cap.isOpened():
    print("Error: Could not open YouTube video stream.")
    cap.release()
    raise RuntimeError("Video initialization failed.")

try:
    while True:
        # Read a frame from the video
        ret, frame = cap.read()
        if not ret:
            print("End of video or failed to grab frame.")
            break

        # Resize the frame (keeping its aspect ratio) for faster processing
        ratio = set_width / frame.shape[1]
        frame = cv2.resize(frame, (set_width, int(frame.shape[0] * ratio)))

        # Run pose estimation on the frame (OpenCV frames are already BGR)
        results = model(frame, verbose=False)

        # Iterate over detected instances and draw their keypoints.
        # The hand-keypoints dataset declares three values per keypoint
        # (x, y, visibility), so unpacking a row into two names would raise
        # ValueError; slice off just the coordinates instead.
        for instance in results[0].keypoints.data.cpu().numpy():
            for idx, keypoint in enumerate(instance):
                x, y = keypoint[:2]
                center = (int(x), int(y))
                cv2.circle(frame, center, radius=5, color=(0, 255, 0), thickness=-1)
                cv2.putText(frame, str(idx), center, cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)

        # Encode the frame as JPEG; skip this frame if encoding fails
        encoded, buffer = cv2.imencode('.jpg', frame)
        if not encoded:
            continue
        img_bytes = buffer.tobytes()

        # Display the frame inline; the deferred clear (wait=True) removes it
        # only once the next output is ready, which avoids flicker
        display(Image(data=img_bytes))
        clear_output(wait=True)
finally:
    # Always release the capture, including on errors or a kernel interrupt
    cap.release()
