In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

/kaggle/input/pesmod-data/pesmod_new/labels/frame0452.txt
/kaggle/input/pesmod-data/pesmod_new/labels/frame3797.txt
/kaggle/input/pesmod-data/pesmod_new/labels/frame0957.txt
/kaggle/input/pesmod-data/pesmod_new/labels/frame2896.txt
/kaggle/input/pesmod-data/pesmod_new/labels/frame0587.txt
/kaggle/input/pesmod-data/pesmod_new/labels/frame3973.txt
/kaggle/input/pesmod-data/pesmod_new/labels/frame1534.txt
/kaggle/input/pesmod-data/pesmod_new/labels/frame2006.txt
/kaggle/input/pesmod-data/pesmod_new/labels/frame1313.txt
/kaggle/input/pesmod-data/pesmod_new/labels/frame3757.txt
/kaggle/input/pesmod-data/pesmod_new/labels/frame2709.txt
/kaggle/input/pesmod-data/pesmod_new/labels/frame1580.txt
/kaggle/input/pesmod-data/pesmod_new/labels/frame3353.txt
/kaggle/input/pesmod-data/pesmod_new/labels/frame0869.txt
/kaggle/input/pesmod-data/pesmod_new/labels/frame2675.txt
/kaggle/input/pesmod-data/pesmod_new/labels/frame2852.txt
/kaggle/input/pesmod-data/pesmod_new/labels/frame0431.txt
/kaggle/input/

In [5]:
import os
import torch
from torch.utils.data import Dataset, random_split, DataLoader
from torchvision import transforms
from PIL import Image

# Custom Dataset Class

In [6]:
class CustomDataset(Dataset):
    """Map-style dataset pairing each ``.jpg`` image with a same-named ``.txt`` label.

    Every item is an ``(image, label, filename)`` tuple: the RGB image (after
    the optional ``transform``), the stripped text of the label file, and the
    image's file name.

    Args:
        images_dir: Directory scanned (non-recursively) for ``.jpg`` files.
        labels_dir: Directory expected to contain ``<stem>.txt`` for each image.
        transform: Optional callable applied to the loaded PIL image.
    """

    def __init__(self, images_dir, labels_dir, transform=None):
        self.images_dir = images_dir
        self.labels_dir = labels_dir
        self.transform = transform
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # index -> file mapping stable across runs and machines.
        self.image_files = sorted(f for f in os.listdir(images_dir) if f.endswith('.jpg'))
        # Swap only the trailing extension. str.replace would also rewrite a
        # '.jpg' that happens to appear earlier in the file name.
        self.label_files = [f[:-len('.jpg')] + '.txt' for f in self.image_files]

    def __len__(self):
        """Return the number of ``.jpg`` files found in ``images_dir``."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load the sample at position ``idx``.

        Returns:
            Tuple ``(image, label, filename)``; ``label`` is the label file's
            text with surrounding whitespace stripped.

        Raises:
            IndexError: If ``idx`` is outside the list of discovered images.
            FileNotFoundError: If the image or its label file is missing.
        """
        img_path = os.path.join(self.images_dir, self.image_files[idx])
        label_path = os.path.join(self.labels_dir, self.label_files[idx])

        # convert() returns a fully loaded copy, so the underlying file can be
        # closed as soon as the with-block exits.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')
        if self.transform:
            image = self.transform(image)

        with open(label_path, 'r') as f:
            label = f.read().strip()  # Adjust if labels need additional processing

        return image, label, self.image_files[idx]  # Return filename as well

# Dataset directories and image transform

In [7]:
# Source folders on the Kaggle input mount: labels and images sit side by side.
labels_dir = '/kaggle/input/pesmod-data/pesmod_new/labels'
images_dir = '/kaggle/input/pesmod-data/pesmod_new/images'

# The pipeline has a single step: convert the PIL image to a tensor.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
    ]
)

# Initialize dataset and calculate sizes

In [8]:
# Build the dataset over the complete image/label folders.
full_dataset = CustomDataset(
    images_dir=images_dir,
    labels_dir=labels_dir,
    transform=transform,
)

# 70% train and 20% test (both truncated to whole samples); validation takes
# whatever remains, so the three sizes always add up to the dataset length.
train_size, test_size = (int(share * len(full_dataset)) for share in (0.7, 0.2))
val_size = len(full_dataset) - (train_size + test_size)

# Split dataset

In [9]:
# Seed the split so the same indices are drawn for train/test/val on every run.
# Without an explicit generator, random_split draws from the global RNG and the
# partition changes after each kernel restart.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset, val_dataset = random_split(
    full_dataset,
    [train_size, test_size, val_size],
    generator=split_generator,
)

In [10]:
# Report how many samples each split received.
split_summary = f"Train size: {len(train_dataset)}, Test size: {len(test_dataset)}, Validation size: {len(val_dataset)}"
print(split_summary)

Train size: 2874, Test size: 821, Validation size: 412


# Create directories to save the split datasets

In [11]:
# Root folder that receives the exported splits.
base_dir = '/kaggle/working/split_dataset'

# One images/ and one labels/ folder per split; exist_ok makes the cell safe to re-run.
for split_subdir in (
    'train/images',
    'train/labels',
    'val/images',
    'val/labels',
    'test/images',
    'test/labels',
):
    os.makedirs(os.path.join(base_dir, split_subdir), exist_ok=True)

# Function to save datasets

In [12]:
def save_split_dataset(dataset, data_type, output_dir=None):
    """Write every sample of ``dataset`` to ``<output_dir>/<data_type>/{images,labels}``.

    Args:
        dataset: Iterable yielding ``(image_tensor, label_text, filename)`` tuples.
        data_type: Name of the split sub-folder, e.g. ``'train'``, ``'val'`` or ``'test'``.
        output_dir: Root folder for the exported splits. When omitted, the
            notebook-level ``base_dir`` is used.
    """
    if output_dir is None:
        output_dir = base_dir

    images_out = os.path.join(output_dir, data_type, 'images')
    labels_out = os.path.join(output_dir, data_type, 'labels')
    # Create the target folders here so the function does not rely on another
    # cell having been run first.
    os.makedirs(images_out, exist_ok=True)
    os.makedirs(labels_out, exist_ok=True)

    # Loop-invariant: build the tensor -> PIL converter once, not once per sample.
    to_pil = transforms.ToPILImage()

    for img, label, filename in dataset:
        # NOTE(review): the image is re-encoded on save, so the written file is
        # not a byte-for-byte copy of the source image.
        to_pil(img).save(os.path.join(images_out, filename))

        # Replace only the extension; str.replace('.jpg', '.txt') would also
        # rewrite a '.jpg' that appears inside the stem.
        label_file_name = os.path.splitext(filename)[0] + '.txt'
        with open(os.path.join(labels_out, label_file_name), 'w') as f:
            f.write(label)

# Save train, validation, and test datasets

In [13]:
# Export the three splits in the order train, val, test.
for split_name, split_subset in (
    ('train', train_dataset),
    ('val', val_dataset),
    ('test', test_dataset),
):
    save_split_dataset(split_subset, split_name)

In [14]:
# Re-report the in-memory split sizes now that the files have been written.
print(
    f"Train size: {len(train_dataset)}, Test size: {len(test_dataset)}, Validation size: {len(val_dataset)}"
)

Train size: 2874, Test size: 821, Validation size: 412


# DataLoaders for training, validation, and test sets

In [15]:
batch_size = 32

# Only the training loader shuffles; the test and validation loaders keep a fixed order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader, val_loader = (
    DataLoader(eval_subset, shuffle=False, batch_size=batch_size)
    for eval_subset in (test_dataset, val_dataset)
)

# Check sizes for verification

In [16]:
# Print the split sizes and the export location, one message per line.
print(
    f"Train size: {len(train_dataset)}, Test size: {len(test_dataset)}, Validation size: {len(val_dataset)}",
    "Datasets saved in /kaggle/working/split_dataset",
    sep="\n",
)

Train size: 2874, Test size: 821, Validation size: 412
Datasets saved in /kaggle/working/split_dataset


# Zip the split dataset for download

In [17]:
import shutil

# Folder holding the exported train/val/test splits.
split_dataset_dir = '/kaggle/working/split_dataset'

# Pack that folder into a zip archive next to it. The call stays the cell's
# last expression so the returned archive path is displayed as the output.
shutil.make_archive(
    base_name='/kaggle/working/split_dataset',
    format='zip',
    root_dir=split_dataset_dir,
)

'/kaggle/working/split_dataset.zip'

In [18]:
# Marker line shown once the archive has been created.
download_banner = "Download Split Dataset"
print(download_banner)

Download Split Dataset


In [None]:
# NOTE(review): this cell has no execution count and repeats the size/location
# report printed earlier in the notebook; consider deleting it.
for status_line in (
    f"Train size: {len(train_dataset)}, Test size: {len(test_dataset)}, Validation size: {len(val_dataset)}",
    "Datasets saved in /kaggle/working/split_dataset",
):
    print(status_line)