In [46]:
from google.colab import drive
import os
import random
import pandas as pd
import cv2 as cv
from google.colab.patches import cv2_imshow

In [5]:
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
# Root directory of the training split on the mounted Drive.
train_folder = '/content/drive/MyDrive/Welding_Dataset/training'
# List the entries directly under the training root and show them.
files = os.listdir(train_folder)
print(files)

['NoDefect', 'Crack', 'Porosity', 'LackOfPenetration']


In [32]:
def stratified_sampling(root_dir, num_samples=500, seed=None):
    """
    Sample up to ``num_samples`` files from every class subfolder of ``root_dir``.

    Each immediate subdirectory of ``root_dir`` is treated as one class, named
    after the directory. Entries of ``root_dir`` that are not directories, and
    entries of a class folder that are not regular files, are ignored.

    Args:
        root_dir (str): The root directory containing one subfolder per class.
        num_samples (int): Maximum number of files to take from each
            subfolder. A subfolder holding ``num_samples`` files or fewer
            contributes all of its files.
        seed (int): Random seed for reproducibility. When given, sampling uses
            a private generator, so the module-level ``random`` state is left
            untouched. When ``None``, the module-level generator is used.

    Returns:
        dict: Maps each sampled file path (``root_dir/<class>/<file>``) to its
        class name.
    """
    # A private generator keeps a seeded call from reseeding the global
    # ``random`` module behind the caller's back.
    rng = random.Random(seed) if seed is not None else random

    sampled_data = {}

    # Directory listings come back in arbitrary order. Sorting both the class
    # names and the file names makes the same seed select the same files.
    for class_name in sorted(os.listdir(root_dir)):
        class_dir = os.path.join(root_dir, class_name)
        if not os.path.isdir(class_dir):
            continue

        # Keep regular files only, so nested folders are never returned as
        # if they were samples.
        with os.scandir(class_dir) as entries:
            file_list = sorted(entry.name for entry in entries if entry.is_file())

        if len(file_list) > num_samples:
            # sample() already draws without replacement; no shuffle needed.
            sampled_files = rng.sample(file_list, num_samples)
        else:
            sampled_files = file_list

        for file_name in sampled_files:
            sampled_data[os.path.join(class_dir, file_name)] = class_name

    return sampled_data


# Usage
# Draw at most 500 files per class from the training folder, seeded for repeatability.
sampled_data = stratified_sampling(train_folder, num_samples=500, seed=42)

# Tally how many sampled paths carry each class label.
count_per_class = {}
for class_label in sampled_data.values():
    count_per_class.setdefault(class_label, 0)
    count_per_class[class_label] += 1

# Report one line per class.
for class_label, count in count_per_class.items():
    print(f'{class_label}: {count} keys')


NoDefect: 500 keys
Crack: 500 keys
Porosity: 500 keys
LackOfPenetration: 500 keys


In [41]:
def load_into_dataframe(data_dict):
  """Build a two-column DataFrame from a mapping.

  Each key/value pair of ``data_dict`` becomes one row: the key is stored in
  the ``ID`` column and the value in the ``Label`` column. Rows follow the
  mapping's iteration order.

  Args:
    data_dict (dict): Mapping to convert.

  Returns:
    pandas.DataFrame: Frame with columns ``ID`` and ``Label``.
  """
  rows = [(key, value) for key, value in data_dict.items()]
  return pd.DataFrame(rows, columns=['ID', 'Label'])

# Tabulate the sampled path -> label mapping as a DataFrame with columns ID and Label.
dataframe = load_into_dataframe(sampled_data)

In [None]:
def load_image(url, show=True):
  """Read an image from disk with OpenCV and optionally display it inline.

  Args:
    url (str): Path of the image file, passed to ``cv.imread``.
    show (bool): When True (the default), render the image in the cell
      output with ``cv2_imshow``. Pass False to load without displaying.

  Returns:
    The image array returned by ``cv.imread``.

  Raises:
    OSError: If ``cv.imread`` returns ``None``, which it does instead of
      raising when the file is missing or cannot be decoded.
  """
  img = cv.imread(url)
  if img is None:
    # Fail here with the offending path rather than inside cv2_imshow.
    raise OSError(f'Could not read image: {url!r}')

  if show:
    # cv2_imshow renders into the notebook output and opens no HighGUI
    # window, so cv.waitKey / cv.destroyAllWindows are not needed.
    cv2_imshow(img)
  return img

# Load and display the first sampled image. .iloc selects by position, so this
# still picks the first row if the frame's index is ever not 0..n-1.
img = load_image(dataframe['ID'].iloc[0])