# Set Environment

In [None]:
import os
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd

import cv2
import random
import glob
from sklearn.model_selection import train_test_split
import pickle

# Set Seed for Reproducibility

In [None]:
def set_seed(seed=21019):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy and
            TensorFlow, and exported as the ``PYTHONHASHSEED`` variable.
    """
    # The stdlib generator drives `random.choice` when sample images are
    # picked, so it must be seeded too for those picks to be repeatable.
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this
    # assignment only influences processes spawned from this kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    #os.environ['TF_DETERMINISTIC_OPS'] = '1'


set_seed()

# Checking if shapes are consistent

In [None]:
train_dir = ['/kaggle/input/intel-mobileodt-cervical-cancer-screening/train/train/Type_1',
             '/kaggle/input/intel-mobileodt-cervical-cancer-screening/train/train/Type_2',
             '/kaggle/input/intel-mobileodt-cervical-cancer-screening/train/train/Type_3',
             '/kaggle/input/intel-mobileodt-cervical-cancer-screening/additional_Type_1_v2/Type_1',
             '/kaggle/input/intel-mobileodt-cervical-cancer-screening/additional_Type_2_v2/Type_2',
             '/kaggle/input/intel-mobileodt-cervical-cancer-screening/additional_Type_3_v2/Type_3']

# Summarise each directory: how many files it holds and the shape of one
# randomly chosen image. Rows are collected in a list and turned into a
# DataFrame once, because `DataFrame.append` was removed in pandas 2.0 and
# row-by-row growth is quadratic anyway.
records = []
for path in train_dir:
    files = os.listdir(path)
    random_file = random.choice(files)
    image = cv2.imread(os.path.join(path, random_file))
    records.append({
        'train_dir': path,
        'no_samples': len(files),
        'random_sample': random_file,
        # cv2.imread returns None (it does not raise) when a file cannot be
        # decoded, so guard the attribute access.
        'image_shape': image.shape if image is not None else None,
    })

df = pd.DataFrame(records, columns=['train_dir', 'no_samples', 'random_sample', 'image_shape'])

df

**Image shapes are inconsistent across samples, so every image needs to be resized to a common shape.**

# See Random Samples

In [None]:
# Show the randomly chosen sample from each directory in a grid.
plt.figure(figsize=(18, 15))
n_cols = 3
n_rows = -(-len(df) // n_cols)  # ceiling division so every sample gets a cell
for i in range(len(df)):
    image = cv2.imread(os.path.join(df['train_dir'][i], df['random_sample'][i]))
    if image is None:
        # Unreadable file: leave its grid cell empty instead of crashing.
        continue
    plt.subplot(n_rows, n_cols, i + 1)
    # OpenCV loads images in BGR channel order while matplotlib expects RGB;
    # without this conversion the colours are displayed swapped.
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.title(f"{os.path.basename(df['train_dir'][i])}: {df['random_sample'][i]}")

plt.show()

# Generate Dataset

In [None]:
warnings.filterwarnings("ignore")

jpg = '*.jpg'
# glob returns paths in filesystem-dependent order; sorting keeps the dataset
# order (and therefore the seeded splits built from it) repeatable.
type1 = sorted(glob.glob(os.path.join(df['train_dir'][0], jpg)))
type2 = sorted(glob.glob(os.path.join(df['train_dir'][1], jpg)))
type3 = sorted(glob.glob(os.path.join(df['train_dir'][2], jpg)))
v2_type1 = sorted(glob.glob(os.path.join(df['train_dir'][3], jpg)))
v2_type2 = sorted(glob.glob(os.path.join(df['train_dir'][4], jpg)))
v2_type3 = sorted(glob.glob(os.path.join(df['train_dir'][5], jpg)))

class_names = ['Type 1', 'Type 2', 'Type 3']
image_size = (227, 227)


def load_resized_images(paths, size):
    """Read every image in ``paths`` and resize it to ``size``.

    Files that cannot be decoded or resized are reported and skipped, so a
    single corrupt file no longer discards the rest of its class.

    Args:
        paths: Iterable of image file paths.
        size: Target size passed to ``cv2.resize``.

    Returns:
        List of resized images, in the order of the readable entries of
        ``paths``.
    """
    images = []
    for path in paths:
        image = cv2.imread(path)
        if image is None:
            # cv2.imread signals failure by returning None rather than raising.
            print(f'Skipping unreadable image: {path}')
            continue
        try:
            resized = cv2.resize(image, size, interpolation=cv2.INTER_AREA)
        except cv2.error as exc:
            print(f'Skipping image that could not be resized: {path} ({exc})')
            continue
        images.append(resized)
    return images


data = []
labels = []
# Each source directory is paired with its integer class label; the
# "additional" directories reuse the labels of the matching base class.
labelled_sources = [
    (0, type1), (1, type2), (2, type3),
    (0, v2_type1), (1, v2_type2), (2, v2_type3),
]
for label, paths in labelled_sources:
    images = load_resized_images(paths, image_size)
    data.extend(images)
    labels.extend([label] * len(images))

X = np.array(data)
y = np.array(labels)

In [None]:
# Report the array shapes and the number of samples carrying each label so
# the image array and label vector can be checked against each other.
print(f'Dataset Shape: {X.shape}, Labels Shape: {y.shape}')
for class_id in range(3):
    n_in_class = np.count_nonzero(y == class_id)
    print(f'Type {class_id + 1} Samples: {n_in_class}')

# Train-Validation-Test Split

In [None]:
# Step 1: hold out 20% of the full dataset as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, random_state=21, test_size=0.2
)

# Step 2: take 25% of the remaining samples as the validation set.
# Change random_state here to obtain a different split.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, random_state=19, test_size=0.25
)

for split_name, split_labels in (('Train', y_train), ('Validation', y_val), ('Test', y_test)):
    print(f'Samples in {split_name} Set: {len(split_labels)}')

# Save split sets for reproduction

In [None]:
# Bundle the six split arrays under their own names and serialise them into
# a single pickle file.
splits = dict(
    x_train=x_train,
    x_val=x_val,
    x_test=x_test,
    y_train=y_train,
    y_val=y_val,
    y_test=y_test,
)

with open('data_splits_for_cyenet.pkl', mode='wb') as pkl_file:
    pickle.dump(splits, pkl_file)

# Code for loading data in your notebook from the saved pickle file

In [None]:
# Read the saved dictionary back and unpack its six arrays by key.
with open('data_splits_for_cyenet.pkl', mode='rb') as pkl_file:
    loaded_splits = pickle.load(pkl_file)

x_train, x_val, x_test = (loaded_splits[key] for key in ('x_train', 'x_val', 'x_test'))
y_train, y_val, y_test = (loaded_splits[key] for key in ('y_train', 'y_val', 'y_test'))