# Set Environment

In [1]:
import numpy as np
import tensorflow as tf
import os

import pickle
import numpy as np

from sklearn.utils import shuffle

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


# Set Seed for Reproducibility

In [2]:
def set_seed(seed=21019):
    """Seed the random number generators used by this notebook.

    Seeds Python's built-in ``random`` module, NumPy's legacy global RNG and
    TensorFlow's global RNG with the same value, then exports two
    environment variables related to deterministic behaviour.

    Args:
        seed: Integer seed applied to every generator. Defaults to 21019.
    """
    # Local import: keeps the helper self-contained without touching the
    # notebook's import cell.
    import random

    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # NOTE: the interpreter reads PYTHONHASHSEED at start-up, so assigning it
    # here does not change str/bytes hashing in the already-running kernel;
    # it is only inherited by child processes started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Requests deterministic op implementations from TensorFlow builds that
    # honour this variable -- TODO confirm for the installed TF version.
    os.environ['TF_DETERMINISTIC_OPS'] = '1'


set_seed()

# Load Data

In [3]:
print('Loading data ...')
# NOTE(review): unpickling can execute arbitrary code. This is acceptable only
# because the file is a dataset we trust; do not point this at untrusted input.
with open('/kaggle/input/revised-data-array/data_array.pkl', 'rb') as f:
    loaded_data = pickle.load(f)

# The pickle holds a dict with the samples under 'X' and the labels under 'y'
X = loaded_data['X']
y = loaded_data['y']

# Fail early if samples and labels are misaligned; every later cell indexes
# both with the same positions.
if len(X) != len(y):
    raise ValueError(
        f'X and y must have the same length, got {len(X)} and {len(y)}'
    )

print(f'Samples in Dataset: {len(y)}')

# Define class names (position in the list == integer label)
class_names = ['Unidentifiable', 'Type 1', 'Type 2', 'Type 3']

# Define the input shape (shape of a single sample)
input_shape = X[0].shape

# Number of classes
num_classes = len(class_names)

# Class-wise sample count in the dataset.
# minlength guarantees one entry per known class, so a class with zero
# samples is still reported instead of being silently dropped.
class_counts = np.bincount(y, minlength=num_classes)
if len(class_counts) > num_classes:
    # bincount grew past num_classes, i.e. some label has no name.
    raise ValueError(
        f'Found label {len(class_counts) - 1} but only {num_classes} '
        f'class names are defined'
    )
for class_name, count in zip(class_names, class_counts):
    print(f'Class {class_name}: {count} samples')

Loading data ...
Samples in Dataset: 8108
Class Unidentifiable: 2845 samples
Class Type 1: 1087 samples
Class Type 2: 2822 samples
Class Type 3: 1354 samples


# Create Folds

In [4]:
# Set the seed for reproducibility (re-seeds NumPy's global RNG, which
# np.random.shuffle below draws from)
set_seed()

# Define the number of splits
n_splits = 5

# Shuffle samples and labels with one shared permutation so that row i of
# X_shuffled still belongs to label i of y_shuffled
X_shuffled, y_shuffled = shuffle(X, y, random_state=21019)

# Stratify by hand: for every class, shuffle its row indices and deal them
# out as evenly as possible over the folds. Only indices are collected here.
fold_index_chunks = [[] for _ in range(n_splits)]
unique_classes = np.unique(y)
for cls in unique_classes:
    cls_indices = np.where(y_shuffled == cls)[0]
    np.random.shuffle(cls_indices)
    # array_split tolerates class sizes that are not divisible by n_splits;
    # the first folds receive the extra samples.
    for chunks, cls_chunk in zip(fold_index_chunks, np.array_split(cls_indices, n_splits)):
        chunks.append(cls_chunk)

# One index array per fold. Rows inside a fold stay grouped by class (lowest
# label first), so shuffle again before training if order matters.
fold_indices = [np.concatenate(chunks) for chunks in fold_index_chunks]

# Gather each fold with a single indexing operation instead of accumulating
# a Python list of per-sample arrays and converting it afterwards.
X_splits = [X_shuffled[idx] for idx in fold_indices]
y_splits = [y_shuffled[idx] for idx in fold_indices]

# Unpack splits for convenience (requires n_splits == 5)
X1, X2, X3, X4, X5 = X_splits
y1, y2, y3, y4, y5 = y_splits

# Print summary of splits
for i, (X_split, y_split) in enumerate(zip(X_splits, y_splits), start=1):
    print(f"Split {i}:")
    print(f"  X shape: {X_split.shape}")
    print(f"  y shape: {y_split.shape}")
    print(f"  y class distribution: {np.bincount(y_split)}")

Split 1:
  X shape: (1623, 256, 256, 3)
  y shape: (1623,)
  y class distribution: [569 218 565 271]
Split 2:
  X shape: (1623, 256, 256, 3)
  y shape: (1623,)
  y class distribution: [569 218 565 271]
Split 3:
  X shape: (1621, 256, 256, 3)
  y shape: (1621,)
  y class distribution: [569 217 564 271]
Split 4:
  X shape: (1621, 256, 256, 3)
  y shape: (1621,)
  y class distribution: [569 217 564 271]
Split 5:
  X shape: (1620, 256, 256, 3)
  y shape: (1620,)
  y class distribution: [569 217 564 270]


# Save Folds

In [5]:
# Bundle the five folds into one mapping: features under 'X1'..'X5',
# labels under 'y1'..'y5'
folds = dict(
    X1=X1, X2=X2, X3=X3, X4=X4, X5=X5,
    y1=y1, y2=y2, y3=y3, y4=y4, y5=y5,
)

# Serialise the bundle to folds.pkl in the current working directory
print("Saving data splits ...")
with open('folds.pkl', 'wb') as fold_file:
    pickle.dump(folds, fold_file)

print("Data splits saved successfully.")

Saving data splits ...
Data splits saved successfully.
