# Data Preprocessing

## Data Cealning

In [7]:
import random
from PIL import Image
import os
import shutil

In [9]:

def resize_images(input_folder, output_folder, size=(640, 640)):
    """
    Resize all images in a folder to a specific size and save them to a new folder.

    Args:
        input_folder (str): Path to the folder containing original images.
        output_folder (str): Path to the folder to save resized images.
        size (tuple): Target size (width, height) for resizing.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for filename in os.listdir(input_folder):
        input_path = os.path.join(input_folder, filename)
        
        # Check if it's a valid image file
        try:
            with Image.open(input_path) as img:
                # Resize the image using the new Resampling method
                resized_img = img.resize(size, Image.Resampling.LANCZOS)
                output_path = os.path.join(output_folder, filename)
                resized_img.save(output_path)
                print(f"Resized and saved: {output_path}")
        except Exception as e:
            print(f"Skipping file {filename}: {e}")

# Example usage
input_folder = "Data_fixed/Images"
output_folder = "Data_fixed/Resized"
resize_images(input_folder, output_folder, size=(640, 640))


Resized and saved: Data_fixed/Resized\1_menit_langsung_bisa_bahasa_isyarat_by_benakribo_mp4-0_jpg.rf.096f70ceaefa0d7878ef97dec58ffd29.jpg
Resized and saved: Data_fixed/Resized\1_menit_langsung_bisa_bahasa_isyarat_by_benakribo_mp4-11_jpg.rf.7e551c8e38f308e95492b3b87020a6cf.jpg
Resized and saved: Data_fixed/Resized\1_menit_langsung_bisa_bahasa_isyarat_by_benakribo_mp4-12_jpg.rf.8919a9f63c2d7928d102344cf8080f95.jpg
Resized and saved: Data_fixed/Resized\1_menit_langsung_bisa_bahasa_isyarat_by_benakribo_mp4-13_jpg.rf.6660d404efb02552f8327191e95fecda.jpg
Resized and saved: Data_fixed/Resized\1_menit_langsung_bisa_bahasa_isyarat_by_benakribo_mp4-14_jpg.rf.57ddeb3614f86f0ac2d8fa32c99bf6c2.jpg
Resized and saved: Data_fixed/Resized\1_menit_langsung_bisa_bahasa_isyarat_by_benakribo_mp4-15_jpg.rf.b64a26e418855c5a2ffc2c9d4e234a50.jpg
Resized and saved: Data_fixed/Resized\1_menit_langsung_bisa_bahasa_isyarat_by_benakribo_mp4-16_jpg.rf.8b0057da2884ddfaf818f47bad5b480f.jpg
Resized and saved: Data_fixe

In [10]:
def check_image_sizes(folder, target_size=(640, 640)):
    """
    Check if all images in a folder have the specified size.

    Args:
        folder (str): Path to the folder containing images.
        target_size (tuple): Expected size (width, height) for the images.

    Returns:
        list: A list of images that do not match the target size.
    """
    mismatched_files = []
    
    for filename in os.listdir(folder):
        file_path = os.path.join(folder, filename)
        try:
            with Image.open(file_path) as img:
                if img.size != target_size:
                    mismatched_files.append((filename, img.size))
        except Exception as e:
            print(f"Skipping file {filename}: {e}")
    
    if mismatched_files:
        print("The following files do not match the target size:")
        for file, size in mismatched_files:
            print(f"{file}: {size}")
    else:
        print("All images are the correct size.")

    return mismatched_files

# Example usage
folder = "Data_fixed/Resized"
check_image_sizes(folder, target_size=(640, 640))

All images are the correct size.


[]

## Data spliting

In [11]:
import os
import shutil
import random

def split_dataset(input_folder, output_folder, train_ratio=0.8, test_ratio=0.1, val_ratio=0.1):
    """
    Split the images in the input folder into train, test, and validation sets.

    Args:
        input_folder (str): Path to the folder containing images.
        output_folder (str): Path to the folder to save the split dataset.
        train_ratio (float): Proportion of the dataset to be used for training.
        test_ratio (float): Proportion of the dataset to be used for testing.
        val_ratio (float): Proportion of the dataset to be used for validation.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Create subfolders for train, test, and validation sets
    train_folder = os.path.join(output_folder, 'train')
    test_folder = os.path.join(output_folder, 'test')
    val_folder = os.path.join(output_folder, 'val')

    for folder in [train_folder, test_folder, val_folder]:
        if not os.path.exists(folder):
            os.makedirs(folder)

    # List all image files in the input folder
    all_files = [f for f in os.listdir(input_folder) if os.path.isfile(os.path.join(input_folder, f))]

    # Shuffle the file list to randomize the split
    random.shuffle(all_files)

    # Calculate the number of files for each subset
    total_files = len(all_files)
    train_size = int(total_files * train_ratio)
    test_size = int(total_files * test_ratio)
    val_size = total_files - train_size - test_size  # Remaining files go to validation

    # Split the files
    train_files = all_files[:train_size]
    test_files = all_files[train_size:train_size + test_size]
    val_files = all_files[train_size + test_size:]

    # Copy the files to the appropriate folder (instead of moving them)
    for file in train_files:
        shutil.copy(os.path.join(input_folder, file), os.path.join(train_folder, file))
    for file in test_files:
        shutil.copy(os.path.join(input_folder, file), os.path.join(test_folder, file))
    for file in val_files:
        shutil.copy(os.path.join(input_folder, file), os.path.join(val_folder, file))

    print(f"Dataset split: {train_size} train, {test_size} test, {val_size} validation")

# Example usage
input_folder = "Data_fixed/Resized"  # Folder containing the resized images
output_folder = "Data_fixed/split"  # Folder where the split dataset will be saved
split_dataset(input_folder, output_folder, train_ratio=0.8, test_ratio=0.1, val_ratio=0.1)


Dataset split: 1941 train, 242 test, 244 validation


## Data Augmentation

In [5]:
import os
import cv2
import numpy as np
import albumentations as A

# Definisikan augmentasi yang akan digunakan
augmentation = A.Compose([
    A.HorizontalFlip(p=0.5),  # Horizontal flip dengan probabilitas 50%
    A.RandomScale(scale_limit=(0.0, 0.3), p=0.3),  # Zoom hingga 30% dengan ukuran tetap
    A.Rotate(limit=15, p=0.5),  # Rotasi antara -15° hingga +15° dengan probabilitas 50%
    A.Affine(shear={"x": (-25, 25), "y": (-15, 15)}, p=0.5),  # Shear horizontal ±25° dan vertical ±15°
    A.ToGray(p=0.2),  # Ubah menjadi grayscale untuk 20% gambar
    A.HueSaturationValue(
        hue_shift_limit=25,
        sat_shift_limit=25,
        val_shift_limit=25,
        p=0.5
    ),  # Hue ±25°, Saturation ±25%, Brightness ±25%
    A.MotionBlur(blur_limit=3, p=0.2),  # Blur hingga 1.5px
    A.GaussNoise(var_limit=(0, 0.07*255), p=0.2),  # Noise hingga 7% piksel
    A.Resize(height=640, width=640)  # Resize ke ukuran tetap 640x640
])

# Folder input dan output
input_folder = './Data_fixed/split/train'  # Folder gambar yang akan di-augmentasi
output_folder = './Data_fixed/split/train_augmented'  # Folder untuk menyimpan gambar hasil augmentasi
os.makedirs(output_folder, exist_ok=True)

def augment_image(image_path, output_path, num_augmentations=2):
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error: Unable to read image {image_path}")
        return

    # Salin gambar asli ke folder output
    original_image_filename = os.path.join(output_path, f"original_{os.path.basename(image_path)}")
    cv2.imwrite(original_image_filename, image)
    print(f"Saved original image to: {original_image_filename}")
    
    # Lakukan augmentasi sebanyak num_augmentations kali
    for i in range(num_augmentations):
        augmented = augmentation(image=image)
        augmented_image = augmented['image']
        
        # Menyimpan gambar yang telah di-augmentasi
        output_filename = os.path.join(output_path, f"aug_{i}_{os.path.basename(image_path)}")
        cv2.imwrite(output_filename, augmented_image)
        print(f"Saved augmented image to: {output_filename}")

# Proses augmentasi untuk semua gambar dalam folder
for image_name in os.listdir(input_folder):
    image_path = os.path.join(input_folder, image_name)
    
    if os.path.isfile(image_path):
        augment_image(image_path, output_folder)


  A.GaussNoise(var_limit=(0, 0.07*255), p=0.2),  # Noise hingga 7% piksel


Saved original image to: ./Data_fixed/split/train_augmented\original_1_menit_langsung_bisa_bahasa_isyarat_by_benakribo_mp4-0_jpg.rf.096f70ceaefa0d7878ef97dec58ffd29.jpg
Saved augmented image to: ./Data_fixed/split/train_augmented\aug_0_1_menit_langsung_bisa_bahasa_isyarat_by_benakribo_mp4-0_jpg.rf.096f70ceaefa0d7878ef97dec58ffd29.jpg
Saved augmented image to: ./Data_fixed/split/train_augmented\aug_1_1_menit_langsung_bisa_bahasa_isyarat_by_benakribo_mp4-0_jpg.rf.096f70ceaefa0d7878ef97dec58ffd29.jpg
Saved original image to: ./Data_fixed/split/train_augmented\original_1_menit_langsung_bisa_bahasa_isyarat_by_benakribo_mp4-12_jpg.rf.8919a9f63c2d7928d102344cf8080f95.jpg
Saved augmented image to: ./Data_fixed/split/train_augmented\aug_0_1_menit_langsung_bisa_bahasa_isyarat_by_benakribo_mp4-12_jpg.rf.8919a9f63c2d7928d102344cf8080f95.jpg
Saved augmented image to: ./Data_fixed/split/train_augmented\aug_1_1_menit_langsung_bisa_bahasa_isyarat_by_benakribo_mp4-12_jpg.rf.8919a9f63c2d7928d102344cf808