In [None]:
# Mount Google Drive at /content/drive so later cells can read the dataset stored there.
# The google.colab module is only available inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root folder of the dataset on the mounted Drive; the cells below build their paths from it.
dataset_path = "/content/drive/MyDrive/Infosys/css-data"

In [None]:
import os
# Print an indented tree of the dataset folder, listing at most five files per directory.
for current_dir, _subdirs, filenames in os.walk(dataset_path):
    # Depth = number of path separators left once the dataset root is stripped off.
    depth = current_dir.replace(dataset_path, '').count(os.sep)
    dir_pad = "  " * depth
    file_pad = "  " * (depth + 1)
    print(f"{dir_pad}{os.path.basename(current_dir)}/")
    for filename in filenames[:5]:
        print(f"{file_pad}{filename}")

css-data/
  README.dataset.txt
  README.roboflow.txt
  css-data.yaml
  valid/
    images/
      -1969-_png_jpg.rf.41dd58ed3ae83df95fb2417c679d581f.jpg
      -1079-_png_jpg.rf.19092a3937930012f9fd9c1ce57f5a7b.jpg
      -1429-_png_jpg.rf.78a7894e86c79d018d80fa86f4d000f8.jpg
      construction-2-_mp4-38_jpg.rf.0bb63aba0a9ebe5a4741a6207e2e1902.jpg
      autox3_mp4-78_jpg.rf.dc5c00104c4cf733c2c06c820b82d338.jpg
    labels/
      construction-4-_mp4-20_jpg.rf.cb627b855fa08d83357febb83f6ad4bc.txt
      004424_jpg.rf.0470713b945b08839105cde711db62d9.txt
      02646_jpg.rf.5c93ba95bdc03808bcf872c7218ac5ef.txt
      IMG_3100_mp4-1_jpg.rf.7b4a6df995ec2702dee6e7f8c5b47e14.txt
      IMG_3093_mp4-22_jpg.rf.ea118a6046b21e9246efd53599dfdc41.txt
  train/
    labels/
      image_170_jpg.rf.2be45261d9b8c9e3ad48f1d5992a81e0.txt
      image_246_jpg.rf.5b09f63561fa416c6e982b459acf98d4.txt
      image_426_jpg.rf.c89dce1cff780e9d86a5bf5f3b16459c.txt
      image_28_jpg.rf.83228814ea921754456d144a3d128bf4.txt
 

In [None]:
import glob
# Collect every image that lives in an "images" folder anywhere below the dataset root.
# Patterns are searched in this fixed order so the resulting list order is stable per pattern.
image_files = []
for image_pattern in ("*.jpg", "*.jpeg", "*.png"):
    search_path = os.path.join(dataset_path, "**", "images", image_pattern)
    image_files.extend(glob.glob(search_path, recursive=True))
print("Total image files found:", len(image_files))

# Same search for the text files kept in "labels" folders.
label_search_path = os.path.join(dataset_path, "**", "labels", "*.txt")
label_files = glob.glob(label_search_path, recursive=True)
print("Total label files found:", len(label_files))

Total image files found: 2814
Total label files found: 2801


In [None]:
import os

# Write the dataset description file next to the data.
# All paths are derived from dataset_path so the YAML cannot drift from the
# folder the rest of the notebook works on.
#
# NOTE(review): the directory listing earlier in this notebook already shows a
# css-data.yaml in this folder, so this cell appears to overwrite the file that
# came with the dataset. Confirm that is intended.
# NOTE(review): nc/names are hand-written here; verify them against the class
# ids actually used in the label files before training.
yaml_path = os.path.join(dataset_path, "css-data.yaml")
yaml_content = (
    f"train: {os.path.join(dataset_path, 'train', 'images')}\n"
    f"val: {os.path.join(dataset_path, 'valid', 'images')}\n"
    f"test: {os.path.join(dataset_path, 'test', 'images')}\n"
    "nc: 3\n"
    'names: ["person", "helmet", "vest"]\n'
)
# Explicit encoding keeps the written bytes independent of the runtime locale.
with open(yaml_path, "w", encoding="utf-8") as f:
    f.write(yaml_content)
print("YAML saved inside dataset at:", yaml_path)


YAML saved inside dataset at: /content/drive/MyDrive/Infosys/css-data/css-data.yaml


In [None]:
# Shell listing of the dataset root, used to eyeball which files and split folders are present.
!ls "/content/drive/MyDrive/Infosys/css-data"

css-data.yaml  README.dataset.txt  README.roboflow.txt	test  train  valid


In [None]:
import os
import shutil
from sklearn.model_selection import train_test_split
# --- Configuration ---------------------------------------------------------
dataset_path = "/content/drive/MyDrive/Infosys/css-data"
source_folder_name = "train"  # the Drive sub-folder that gets re-split below
local_source_base = "/tmp/css-data-local-source"
local_output_base = "/tmp/traintestsplit"
local_original_images_path = os.path.join(local_source_base, source_folder_name, "images")
local_original_labels_path = os.path.join(local_source_base, source_folder_name, "labels")
drive_target_path = os.path.join(dataset_path, "traintestsplit_FAST")
train_ratio = 0.7
test_ratio = 0.2
val_ratio = 0.1


def _first_class_id(label_path):
    """Return the integer at the start of the label file's first line, or -1.

    -1 is returned when the file is missing, empty, or its first token is not
    an integer; callers treat -1 as "unlabeled".
    """
    if not os.path.exists(label_path):
        return -1
    with open(label_path, "r") as label_fh:
        tokens = label_fh.readline().split()
    try:
        return int(tokens[0])
    except (ValueError, IndexError):
        return -1


def _split_with_fallback(items, item_labels, test_size):
    """Split items/labels with stratification, falling back to a plain split.

    train_test_split raises ValueError when stratification is impossible
    (for example a class with a single member). In that case a warning is
    printed and the split is repeated without stratify. Both attempts use
    random_state=42.
    """
    try:
        return train_test_split(
            items, item_labels, stratify=item_labels, test_size=test_size, random_state=42
        )
    except ValueError as err:
        print(f"Warning: stratified split failed ({err}); falling back to an unstratified split.")
        return train_test_split(items, item_labels, test_size=test_size, random_state=42)


# --- Stage 1: copy the source folder from Drive to local disk ---------------
print("Stage 1/3: Copying source data from Google Drive to local disk...")
drive_source_path = os.path.join(dataset_path, source_folder_name)
if os.path.exists(local_source_base):
    shutil.rmtree(local_source_base)
shutil.copytree(drive_source_path, os.path.join(local_source_base, source_folder_name))
print("Source data copied locally. Starting split (This will be fast)...")

# Start from an empty output tree. Without this, files left over from an
# earlier run stay in the split folders, so one image can end up in two
# splits and the final counts are inflated.
if os.path.exists(local_output_base):
    shutil.rmtree(local_output_base)
for split in ["train", "test", "val"]:
    for sub in ["images", "labels"]:
        split_path = os.path.join(local_output_base, split, sub)
        os.makedirs(split_path, exist_ok=True)

# os.listdir returns entries in arbitrary order; sorting makes the seeded
# split below assign the same files to the same split on every run.
image_files = sorted(
    f for f in os.listdir(local_original_images_path)
    if f.lower().endswith(('.jpg', '.jpeg', '.png'))
)

# One stratification label per image: the first class id found in its label file.
labels = [
    _first_class_id(os.path.join(local_original_labels_path, img.rsplit('.', 1)[0] + ".txt"))
    for img in image_files
]

# Images without a usable label (-1) are left out of every split.
labeled_image_files = [img for img, label in zip(image_files, labels) if label != -1]
labeled_labels = [label for label in labels if label != -1]
skipped_count = len(image_files) - len(labeled_image_files)
if skipped_count:
    print(f"Skipping {skipped_count} image(s) with a missing, empty or unparsable label file.")

if labeled_image_files:
    # First cut: train vs. everything else.
    train_imgs, temp_imgs, train_labels, temp_labels = _split_with_fallback(
        labeled_image_files, labeled_labels, test_size=(1 - train_ratio)
    )
    # Second cut: divide the remainder so test/val keep their configured proportions.
    test_size = test_ratio / (test_ratio + val_ratio)
    val_imgs, test_imgs, val_labels, test_labels = _split_with_fallback(
        temp_imgs, temp_labels, test_size=test_size
    )
else:
    train_imgs, test_imgs, val_imgs = [], [], []
    print("No labeled images found to perform the split.")
print("Stage 2/3: Performing file copy on local disk...")
def copy_files(img_list, split_name, images_dir=None, labels_dir=None, output_base=None):
    """Copy images and their matching label files into one split folder.

    Args:
        img_list: image file names (not paths) to copy.
        split_name: name of the split sub-folder under the output base.
        images_dir: folder holding the source images; defaults to the
            notebook-level local_original_images_path.
        labels_dir: folder holding the source label files; defaults to the
            notebook-level local_original_labels_path.
        output_base: root of the split tree; defaults to the notebook-level
            local_output_base.

    Images are written to <output_base>/<split_name>/images and labels to
    <output_base>/<split_name>/labels. An image whose label file does not
    exist is still copied, just without a label.
    """
    # Fall back to the notebook globals only when no explicit folder is given.
    if images_dir is None:
        images_dir = local_original_images_path
    if labels_dir is None:
        labels_dir = local_original_labels_path
    if output_base is None:
        output_base = local_output_base

    images_out = os.path.join(output_base, split_name, "images")
    labels_out = os.path.join(output_base, split_name, "labels")
    # Create the destination folders here so the function does not depend on
    # another cell having prepared them.
    os.makedirs(images_out, exist_ok=True)
    os.makedirs(labels_out, exist_ok=True)

    for img in img_list:
        # The label shares the image's name with the last extension swapped for .txt.
        label_name = img.rsplit('.', 1)[0] + ".txt"
        shutil.copy2(os.path.join(images_dir, img), os.path.join(images_out, img))
        label_src = os.path.join(labels_dir, label_name)
        if os.path.exists(label_src):
            shutil.copy2(label_src, os.path.join(labels_out, label_name))
# Fill each local split folder; a split with no images is skipped entirely.
for split_name, split_images in (("train", train_imgs), ("test", test_imgs), ("val", val_imgs)):
    if split_images:
        copy_files(split_images, split_name)
print("Local splitting complete.")

# Replace any earlier copy of the split on Drive with the freshly built local tree.
print("Stage 3/3: Copying final split result back to Google Drive...")
previous_upload_present = os.path.exists(drive_target_path)
if previous_upload_present:
    shutil.rmtree(drive_target_path)
local_result_present = os.path.exists(local_output_base)
if local_result_present:
    shutil.copytree(local_output_base, drive_target_path)
print("Final split successfully saved to Google Drive.")
def count_images_in_dir(directory):
    """Return how many entries in `directory` have an image extension.

    Extensions .jpg, .jpeg and .png are matched case-insensitively.
    Returns 0 when `directory` does not exist or is not a directory.
    """
    # isdir (rather than exists) so a path that points at a regular file
    # yields 0 instead of making os.listdir raise NotADirectoryError.
    if not os.path.isdir(directory):
        return 0
    image_suffixes = ('.jpg', '.jpeg', '.png')
    return sum(1 for entry in os.listdir(directory) if entry.lower().endswith(image_suffixes))
# Count the images that ended up in each local split folder.
final_train_count, final_test_count, final_val_count = (
    count_images_in_dir(os.path.join(local_output_base, split_dir, "images"))
    for split_dir in ("train", "test", "val")
)

# Assemble the report once and emit it with a single print call.
separator = "-" * 40
summary_lines = [
    "\nStratified dataset split completed! (Execution time drastically reduced)",
    f"Final split is located on Google Drive at: {drive_target_path}",
    separator,
    "File Counts:",
    f"Train Images: {final_train_count}",
    f"Test Images:  {final_test_count}",
    f"Val Images:   {final_val_count}",
    separator,
    f"Total Labeled Images Split: {final_train_count + final_test_count + final_val_count}",
]
print("\n".join(summary_lines))

Stage 1/3: Copying source data from Google Drive to local disk...
Source data copied locally. Starting split (This will be fast)...
Stage 2/3: Performing file copy on local disk...
Local splitting complete.
Stage 3/3: Copying final split result back to Google Drive...
Final split successfully saved to Google Drive.

🎉 Stratified dataset split completed! (Execution time drastically reduced) 🎉
Final split is located on Google Drive at: /content/drive/MyDrive/Infosys/css-data/traintestsplit_FAST
----------------------------------------
File Counts:
Train Images: 1819
Test Images:  520
Val Images:   260
----------------------------------------
Total Labeled Images Split: 2599
