In [3]:
import os
import random
import shutil
import subprocess
from pathlib import Path

In [4]:
def download_kaggle_dataset(dataset_name, output_path, kaggle_path=None):
    """Download a Kaggle dataset with the Kaggle CLI and unzip it.

    Args:
        dataset_name: Kaggle dataset identifier passed to ``-d``
            (e.g. ``"grassknoted/asl-alphabet"``).
        output_path: Directory to download into; created if it is missing.
        kaggle_path: Optional path to the ``kaggle`` executable. When omitted,
            the original hard-coded location is used if that file exists,
            otherwise the executable is looked up on ``PATH``.

    Raises:
        subprocess.CalledProcessError: If the Kaggle CLI exits with a
            non-zero status (``check=True``).
        FileNotFoundError: If the executable cannot be started.
    """
    os.makedirs(output_path, exist_ok=True)

    if kaggle_path is None:
        # Keep the previous default first so existing setups behave the same,
        # but fall back to PATH so the notebook also runs on other machines.
        default_kaggle_path = "/opt/anaconda3/envs/my-env/bin/kaggle"
        if os.path.isfile(default_kaggle_path):
            kaggle_path = default_kaggle_path
        else:
            kaggle_path = shutil.which("kaggle") or default_kaggle_path

    # Pass an argument list (no shell): names or paths containing spaces or
    # shell metacharacters are handed to the CLI verbatim instead of being
    # re-parsed by a shell.
    command = [
        os.fspath(kaggle_path),
        "datasets",
        "download",
        "-d",
        str(dataset_name),
        "-p",
        os.fspath(output_path),
        "--unzip",
    ]
    subprocess.run(command, check=True)
    print(f"Dataset downloaded to: {output_path}")

In [6]:
# First dataset: Kaggle identifier and the local folder it is unzipped into.
dataset_name, output_path = "grassknoted/asl-alphabet", "./Datasets/asl-alphabet"
download_kaggle_dataset(dataset_name=dataset_name, output_path=output_path)

Dataset URL: https://www.kaggle.com/datasets/grassknoted/asl-alphabet
License(s): GPL-2.0
Downloading asl-alphabet.zip to ./Datasets/asl-alphabet


100%|█████████▉| 1.02G/1.03G [00:47<00:00, 22.4MB/s]




100%|██████████| 1.03G/1.03G [00:48<00:00, 22.9MB/s]


Dataset downloaded to: ./Datasets/asl-alphabet


In [8]:
# Second dataset: Kaggle identifier and the local folder it is unzipped into.
dataset_name2, output_path2 = (
    "prathumarikeri/american-sign-language-09az",
    "./Datasets/american-sign-language-09az",
)
download_kaggle_dataset(dataset_name=dataset_name2, output_path=output_path2)

Dataset URL: https://www.kaggle.com/datasets/prathumarikeri/american-sign-language-09az
License(s): CC-BY-SA-4.0
Downloading american-sign-language-09az.zip to ./Datasets/american-sign-language-09az


100%|██████████| 993M/993M [00:43<00:00, 23.9MB/s] 



Dataset downloaded to: ./Datasets/american-sign-language-09az


In [8]:
# Where the raw ASL-alphabet download lives and where the processed copy goes.
# Note that the train/test paths repeat the folder name one level down.
base_path = "Datasets/asl-alphabet/"
train_path = Path(base_path, "asl_alphabet_train", "asl_alphabet_train")
test_path = Path(base_path, "asl_alphabet_test", "asl_alphabet_test")
output_path = "Datasets/Processed_asl_alphabet"

# Ensure both split directories exist (harmless when they already do).
output_train_path, output_test_path = (
    Path(output_path, split_name) for split_name in ("train", "test")
)
for _split_dir in (output_train_path, output_test_path):
    _split_dir.mkdir(parents=True, exist_ok=True)

In [9]:
# Classes to keep: only the letter folders A-Z are processed; any other
# folder under train_path is ignored.
valid_classes = [chr(i) for i in range(ord('A'), ord('Z') + 1)]

# Split configuration.
TRAIN_FRACTION = 0.8  # share of each class copied to train; the rest goes to test
SPLIT_SEED = 42       # fixed seed so re-running this cell reproduces the same split

# Dedicated generator so seeding does not disturb the global `random` state.
split_rng = random.Random(SPLIT_SEED)

# Process training data: shuffle each class, then copy it into train/test.
# NOTE: files already present in the output folders are not removed. If an
# earlier run used a different split, clear the output directory first so
# no image ends up in both train and test.
for cls in valid_classes:
    cls_train_path = train_path / cls
    cls_output_train_path = output_train_path / cls
    cls_output_test_path = output_test_path / cls

    cls_output_train_path.mkdir(parents=True, exist_ok=True)
    cls_output_test_path.mkdir(parents=True, exist_ok=True)

    # glob() yields files in arbitrary, filesystem-dependent order, so sort
    # first for a stable starting point and then shuffle with the seeded RNG.
    images = sorted(cls_train_path.glob("*.jpg"))
    if not images:
        # Surface a missing/empty class folder instead of silently
        # producing an empty class.
        print(f"Warning: no .jpg images found in {cls_train_path}")
        continue
    split_rng.shuffle(images)

    # Split into train and test sets
    split_index = int(TRAIN_FRACTION * len(images))
    train_images = images[:split_index]
    test_images = images[split_index:]

    # Copy (not move) files so the downloaded dataset stays intact
    for img in train_images:
        shutil.copy(img, cls_output_train_path / img.name)
    for img in test_images:
        shutil.copy(img, cls_output_test_path / img.name)

# Process test data: add the single per-class image from the original test
# folder to that class's test set.
# NOTE(review): the Kaggle archive appears to name these "<cls>_test.jpg"
# rather than "<cls>.jpg" -- confirm against the download; both are tried.
for cls in valid_classes:
    for candidate_name in (f"{cls}.jpg", f"{cls}_test.jpg"):
        test_img_path = test_path / candidate_name
        if test_img_path.exists():
            shutil.copy(test_img_path, output_test_path / cls / test_img_path.name)

print(f"Dataset preprocessing complete! Train and test data organized in {output_path}.")


Dataset preprocessing complete! Train and test data organized in Datasets/Processed_asl_alphabet.
