# Rock-Paper-Scissors Dataset Preprocessing

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import cv2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical
import pickle
import warnings
# Silence every warning category for the rest of the session so that library
# notices do not clutter the cell outputs.
# NOTE(review): this also hides warnings that may point at real problems.
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility: NumPy's legacy global RNG and
# TensorFlow's global seed are both fixed to the same value.
np.random.seed(42)
tf.random.set_seed(42)

2025-07-04 22:28:00.365551: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## 1. CONFIGURATION AND SETUP

In [2]:
# Section banner: title line followed by a dashed rule.
print("\n1. CONFIGURATION AND SETUP", "-" * 35, sep="\n")

# Central place for every tunable preprocessing setting.
CONFIG = dict(
    data_path='../data',                      # root folder with one sub-folder per class
    output_path='../processed_data',          # where processed artifacts are written
    classes=['rock', 'paper', 'scissors'],    # list position doubles as the class label
    img_size=(128, 128),                      # resize to 128x128 for efficiency
    batch_size=32,
    test_size=0.2,
    val_size=0.2,                             # 20% of training data for validation
    random_state=42,
    normalize=True,
    augmentation=True,
)

# Ensure the output directory is present; an already-existing one is not an error.
os.makedirs(CONFIG['output_path'], exist_ok=True)

# Echo the active settings, one indented "key: value" line per entry.
print("Configuration:")
for key, value in CONFIG.items():
    print("  {}: {}".format(key, value))


1. CONFIGURATION AND SETUP
-----------------------------------
Configuration:
  data_path: ../data
  output_path: ../processed_data
  classes: ['rock', 'paper', 'scissors']
  img_size: (128, 128)
  batch_size: 32
  test_size: 0.2
  val_size: 0.2
  random_state: 42
  normalize: True
  augmentation: True


## 2. DATA LOADING AND INITIAL PROCESSING

In [3]:
# Section banner: title line followed by a dashed rule.
print("\n2. DATA LOADING AND INITIAL PROCESSING", "-" * 45, sep="\n")

def load_dataset(data_path, classes, img_size):
    """
    Load every image of every class into memory as one float32 array.

    Each class is read from its own sub-directory ``data_path/<class_name>``.
    Files are selected by extension (.png, .jpg, .jpeg, case-insensitive),
    converted to RGB and resized to ``img_size`` with Lanczos resampling.
    Missing class directories and files that fail to load are reported on
    stdout and skipped; they never abort the whole load.

    Args:
        data_path: Root directory containing one sub-directory per class.
        classes: Sequence of class names; a name's position is its label.
        img_size: Target size handed to ``Image.resize``, i.e. (width, height).

    Returns:
        X: float32 numpy array of shape (n_images, height, width, 3) holding
           unscaled pixel values; has n_images == 0 when nothing was loaded
        y: integer numpy array of shape (n_images,) with the class indices
        label_mapping: dictionary mapping class names to indices
    """
    images = []
    labels = []
    label_mapping = {class_name: idx for idx, class_name in enumerate(classes)}

    print(f"Loading images with target size: {img_size}")

    for class_name in classes:
        class_path = os.path.join(data_path, class_name)
        if not os.path.exists(class_path):
            print(f"Warning: Path {class_path} does not exist")
            continue

        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sample order (and any later seeded split) reproducible across runs
        # and machines.
        files = sorted(
            f for f in os.listdir(class_path)
            if f.lower().endswith(('.png', '.jpg', '.jpeg'))
        )
        print(f"Loading {len(files)} images from {class_name}...")

        for i, file_name in enumerate(files):
            file_path = os.path.join(class_path, file_name)
            try:
                # The context manager closes the underlying file handle right
                # away instead of leaving it open until garbage collection.
                with Image.open(file_path) as raw_img:
                    rgb_img = raw_img.convert('RGB')  # Ensure RGB format
                    resized_img = rgb_img.resize(img_size, Image.LANCZOS)  # High-quality resizing
                    img_array = np.array(resized_img, dtype=np.float32)
            except Exception as e:
                # Best effort: report the bad file and carry on with the rest.
                print(f"Error loading {file_path}: {e}")
                continue

            images.append(img_array)
            labels.append(label_mapping[class_name])

            # Progress indicator
            if (i + 1) % 100 == 0:
                print(f"  Processed {i + 1}/{len(files)} images")

    # Convert to numpy arrays
    if images:
        X = np.array(images, dtype=np.float32)
    else:
        # Keep the documented rank even when nothing was found, so callers
        # that inspect X.shape[1:] do not break on an empty dataset.
        width, height = img_size
        X = np.empty((0, height, width, 3), dtype=np.float32)
    # An explicit integer dtype stops an empty label list from becoming float64.
    y = np.array(labels, dtype=int)

    print(f"\nDataset loaded successfully:")
    print(f"  Images shape: {X.shape}")
    print(f"  Labels shape: {y.shape}")
    print(f"  Label mapping: {label_mapping}")

    return X, y, label_mapping

# Read the whole dataset into memory using the paths, classes and size from CONFIG
X, y, label_mapping = load_dataset(
    data_path=CONFIG['data_path'],
    classes=CONFIG['classes'],
    img_size=CONFIG['img_size'],
)


2. DATA LOADING AND INITIAL PROCESSING
---------------------------------------------
Loading images with target size: (128, 128)
Loading 726 images from rock...
  Processed 100/726 images
  Processed 200/726 images
  Processed 300/726 images
  Processed 400/726 images
  Processed 500/726 images
  Processed 600/726 images
  Processed 700/726 images
Loading 712 images from paper...
  Processed 100/712 images
  Processed 200/712 images
  Processed 300/712 images
  Processed 400/712 images
  Processed 500/712 images
  Processed 600/712 images
  Processed 700/712 images
Loading 750 images from scissors...
  Processed 100/750 images
  Processed 200/750 images
  Processed 300/750 images
  Processed 400/750 images
  Processed 500/750 images
  Processed 600/750 images
  Processed 700/750 images

Dataset loaded successfully:
  Images shape: (2188, 128, 128, 3)
  Labels shape: (2188,)
  Label mapping: {'rock': 0, 'paper': 1, 'scissors': 2}


## 3. DATA NORMALIZATION

In [4]:
# Section banner: title line followed by a dashed rule.
print("\n3. DATA NORMALIZATION", "-" * 25, sep="\n")

def normalize_images(X):
    """
    Divide pixel values by 255, mapping the 0-255 range onto [0, 1].

    The value range before and after scaling is printed. The division
    produces a new array, which is returned; ``X`` is not modified.
    """
    raw_min, raw_max = X.min(), X.max()
    print(f"Original pixel value range: [{raw_min:.2f}, {raw_max:.2f}]")

    # Scale into the unit interval
    scaled = X / 255.0

    scaled_min, scaled_max = scaled.min(), scaled.max()
    print(f"Normalized pixel value range: [{scaled_min:.2f}, {scaled_max:.2f}]")

    return scaled

# Scale when enabled in CONFIG; otherwise continue with an independent copy of the raw data.
X_normalized = normalize_images(X) if CONFIG['normalize'] else X.copy()


3. DATA NORMALIZATION
-------------------------
Original pixel value range: [0.00, 255.00]
Normalized pixel value range: [0.00, 1.00]
