<a href="https://colab.research.google.com/github/Thejuskuku/Feature-Engineering/blob/main/Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
from tensorflow.keras.datasets import mnist

# Fetch the MNIST train/test splits through Keras (downloads the archive when
# it is not already cached locally).
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Report the shape of every array that was just loaded, one per line.
print(
    *(
        f"Shape of {label}: {array.shape}"
        for label, array in (
            ("train_images", train_images),
            ("train_labels", train_labels),
            ("test_images", test_images),
            ("test_labels", test_labels),
        )
    ),
    sep="\n",
)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Shape of train_images: (60000, 28, 28)
Shape of train_labels: (60000,)
Shape of test_images: (10000, 28, 28)
Shape of test_labels: (10000,)


In [2]:
import numpy as np

# Convert to float32 and normalize pixel values to the range [0, 1].
# The arrays are reassigned under their original names, so the scaling is
# guarded on the dtype: only integer-typed (raw) pixels are divided by 255.
# Without the guard, re-running this cell would divide the already-normalized
# data by 255 a second time and silently shrink every value.
if np.issubdtype(train_images.dtype, np.integer):
    train_images = train_images.astype('float32') / 255.0
if np.issubdtype(test_images.dtype, np.integer):
    test_images = test_images.astype('float32') / 255.0

# Fail loudly if either array is not float32 inside [0, 1] at this point
# (for example, float data that was never scaled would skip the guard above).
assert train_images.dtype == np.float32 and test_images.dtype == np.float32
assert 0.0 <= train_images.min() <= train_images.max() <= 1.0
assert 0.0 <= test_images.min() <= test_images.max() <= 1.0

# Verify normalization
print(f"Train images data type: {train_images.dtype}")
print(f"Min pixel value in train_images: {np.min(train_images)}")
print(f"Max pixel value in train_images: {np.max(train_images)}")

print(f"\nTest images data type: {test_images.dtype}")
print(f"Min pixel value in test_images: {np.min(test_images)}")
print(f"Max pixel value in test_images: {np.max(test_images)}")

Train images data type: float32
Min pixel value in train_images: 0.0
Max pixel value in train_images: 1.0

Test images data type: float32
Min pixel value in test_images: 0.0
Max pixel value in test_images: 1.0


In [3]:
from tensorflow.keras.utils import to_categorical

NUM_CLASSES = 10  # MNIST labels are the digits 0-9

# Apply one-hot encoding to the labels.
# The label arrays are reassigned under their original names, so encoding is
# guarded on rank: only 1-D arrays of class indices are converted. Without the
# guard, re-running this cell would encode the already one-hot rows again and
# produce a 3-D array with an extra trailing axis.
if train_labels.ndim == 1:
    train_labels = to_categorical(train_labels, num_classes=NUM_CLASSES)
if test_labels.ndim == 1:
    test_labels = to_categorical(test_labels, num_classes=NUM_CLASSES)

# Check the resulting layout: one row of NUM_CLASSES entries per sample.
assert train_labels.ndim == 2 and train_labels.shape[1] == NUM_CLASSES
assert test_labels.ndim == 2 and test_labels.shape[1] == NUM_CLASSES

# Verify the new shapes of the one-hot encoded labels
print(f"Shape of one-hot encoded train_labels: {train_labels.shape}")
print(f"Shape of one-hot encoded test_labels: {test_labels.shape}")

Shape of one-hot encoded train_labels: (60000, 10)
Shape of one-hot encoded test_labels: (10000, 10)


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training data as a validation set. The split is seeded
# for reproducibility and stratified on the label rows so each class keeps its
# proportion in both parts.
x_train, x_val, y_train, y_val = train_test_split(
    train_images,
    train_labels,
    test_size=0.1,
    random_state=42,
    stratify=train_labels,
)

# Report the shape of every split (plus the untouched test set) to confirm
# the partition sizes.
print(
    *(
        f"Shape of {label}: {array.shape}"
        for label, array in (
            ("x_train", x_train),
            ("y_train", y_train),
            ("x_val", x_val),
            ("y_val", y_val),
            ("test_images", test_images),
            ("test_labels", test_labels),
        )
    ),
    sep="\n",
)

Shape of x_train: (54000, 28, 28)
Shape of y_train: (54000, 10)
Shape of x_val: (6000, 28, 28)
Shape of y_val: (6000, 10)
Shape of test_images: (10000, 28, 28)
Shape of test_labels: (10000, 10)
