In [1]:
# imports
import os
import shutil
import cv2
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [13]:
# Source folders, one per class. Only the paths are defined here; the
# directory contents are listed by the loader when the images are read,
# so no separate os.listdir() pass is needed.
bike_folder = os.path.join('dataset', 'Bike')
car_folder = os.path.join('dataset', 'Car')

# Load every image in a folder and attach the given class label (Bike is 0, Car is 1)
def read_images_and_labels(path, label, size=(300, 200)):
    """Load every readable image in a folder as a resized grayscale array.

    Parameters
    ----------
    path : str
        Directory whose entries are passed to ``cv2.imread``.
    label : int
        Class label attached to every image loaded from ``path``.
    size : tuple of int, optional
        Target ``(width, height)`` handed to ``cv2.resize``.
        Defaults to ``(300, 200)``.

    Returns
    -------
    images : np.ndarray
        Array of shape ``(n, height, width)`` holding the grayscale images.
        It has shape ``(0, height, width)`` when nothing could be loaded,
        so the result can always be concatenated with other folders.
    labels : np.ndarray
        1-D array of length ``n`` filled with ``label``.
    """
    width, height = size
    images = []
    skipped = []

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # loaded order (and any seeded split done afterwards) reproducible.
    for filename in sorted(os.listdir(path)):
        img = cv2.imread(os.path.join(path, filename))
        if img is None:
            # cv2.imread signals an unreadable / non-image entry by
            # returning None instead of raising; skip it rather than
            # letting cv2.resize fail with an opaque assertion error.
            skipped.append(filename)
            continue
        img = cv2.resize(img, (width, height))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        images.append(img)

    if skipped:
        print("Skipped", len(skipped), "unreadable file(s) in", path, "- first few:", skipped[:5])

    if images:
        images = np.array(images)
    else:
        # Keep the (n, height, width) layout even when n == 0; uint8 matches
        # the 8-bit images cv2.imread produces with its default flags.
        images = np.empty((0, height, width), dtype=np.uint8)

    # One label per loaded image (skipped files get no label).
    labels = np.full(len(images), label)
    return images, labels

# Load both classes from the folders defined earlier in this cell
# (Bike -> label 0, Car -> label 1).
bike_images, bike_labels = read_images_and_labels(bike_folder, 0)
car_images, car_labels = read_images_and_labels(car_folder, 1)

# Combine the two classes into one dataset. Cars come first, then bikes;
# images and labels are concatenated in the same order so they stay aligned.
images = np.concatenate((car_images, bike_images))
labels = np.concatenate((car_labels, bike_labels))


In [14]:
# --- Scheme 1: two-way split (80% train / 20% test) ---
train_images, test_images, train_labels, test_labels = train_test_split(
    images, labels, test_size=0.2, random_state=42
)

# --- Scheme 2: three-way split (train / validation / test) ---
# Step 1: hold out 20% of the data. The arguments and seed are the same as
# in scheme 1, so this yields the same partition: the *_val train arrays
# match the scheme-1 train arrays, and temp_* matches the scheme-1 test set.
train_images_val, temp_images, train_labels_val, temp_labels = train_test_split(
    images, labels, test_size=0.2, random_state=42
)

# Step 2: cut the held-out part in half -> validation and test.
# NOTE: val_images is drawn from the same samples as scheme 1's test_images,
# so do not evaluate on test_images after tuning on val_images.
val_images, test_images_val, val_labels, test_labels_val = train_test_split(
    temp_images, temp_labels, test_size=0.5, random_state=42
)
print("Dataset split into train, val, and test completed!")

# Per-class counts (index 0 = Bike, index 1 = Car) for each label array.
for caption, split_labels in (
    ("Train labels distribution:", train_labels),
    ("Train labels in val distribution:", train_labels_val),
    ("Val labels distribution:", val_labels),
    ("Test labels distribution:", test_labels),
    ("Test labels in val distribution:", test_labels_val),
):
    print(caption, np.bincount(split_labels))


Dataset split into train, val, and test completed!
Train labels distribution: [1622 1578]
Train labels in val distribution: [1622 1578]
Val labels distribution: [191 209]
Test labels distribution: [378 422]
Test labels in val distribution: [187 213]
