In [None]:
# Script to ingest the MNIST dataset and process it from raw to interim to processed stages.

import os

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler

# Load the raw MNIST dataset: image arrays plus their labels for both splits.
mnist = tf.keras.datasets.mnist
(raw_train_images, raw_train_labels), (raw_test_images, raw_test_labels) = mnist.load_data()

# Interim processing: divide by 255 so pixel values fall between 0 and 1.
interim_train_images = raw_train_images / 255.0
interim_test_images = raw_test_images / 255.0

# Processed data: flatten every image into one row of 28*28 values, then scale.
scaler = MinMaxScaler()
processed_train_images = interim_train_images.reshape((-1, 28*28))
processed_test_images = interim_test_images.reshape((-1, 28*28))
# The scaler is fitted on the training rows only and re-used unchanged for the
# test rows, so no statistics from the test split leak into the transform.
# NOTE(review): MinMaxScaler rescales each pixel column independently, so the
# result is not the same as the plain /255 normalisation above -- confirm this
# second scaling step is intended.
processed_train_images = scaler.fit_transform(processed_train_images)
processed_test_images = scaler.transform(processed_test_images)

# Convert the datasets to pandas DataFrames (one pixel per column, label last).
train_df = pd.DataFrame(processed_train_images)
train_df['label'] = raw_train_labels
test_df = pd.DataFrame(processed_test_images)
test_df['label'] = raw_test_labels

# Save the processed data to CSV files.
# to_csv does not create missing parent folders, so make sure the target
# directory exists first; exist_ok keeps re-running the cell harmless.
os.makedirs(os.path.join('data', 'processed'), exist_ok=True)
train_df.to_csv('data/processed/mnist_train_processed.csv', index=False)
test_df.to_csv('data/processed/mnist_test_processed.csv', index=False)


In [None]:
import tensorflow as tf
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Raw stage: fetch both MNIST splits (images and labels) through Keras.
mnist_splits = tf.keras.datasets.mnist.load_data()
(mnist_train_images, mnist_train_labels), (mnist_test_images, mnist_test_labels) = mnist_splits

# Interim stage: divide by 255 to bring pixel values into the 0-1 range.
mnist_train_images_norm = mnist_train_images / 255.0
mnist_test_images_norm = mnist_test_images / 255.0

# Processed stage: one row of 28*28 values per image, then min-max scaling
# whose parameters come from the training rows only.
train_rows = mnist_train_images_norm.reshape(-1, 28*28)
test_rows = mnist_test_images_norm.reshape(-1, 28*28)
scaler = MinMaxScaler().fit(train_rows)
mnist_train_images_flat = scaler.transform(train_rows)
mnist_test_images_flat = scaler.transform(test_rows)

# Tabular form: named pixel columns followed by the label column.
pixel_columns = [f'pixel_{i}' for i in range(784)]
train_df = pd.DataFrame(mnist_train_images_flat, columns=pixel_columns).assign(label=mnist_train_labels)
test_df = pd.DataFrame(mnist_test_images_flat, columns=pixel_columns).assign(label=mnist_test_labels)

# Persist both splits as CSV files in the current working directory.
for frame, csv_name in (
    (train_df, 'mnist_train_processed.csv'),
    (test_df, 'mnist_test_processed.csv'),
):
    frame.to_csv(csv_name, index=False)


In [None]:
# Importing necessary libraries
import numpy as np
from keras.datasets import mnist
import os

# Function to load and save the raw MNIST dataset
def load_and_save_raw_mnist():
    """Load MNIST via Keras and store the untouched arrays under ``data/raw``.

    The output directory is created when missing, so the function can be
    called on its own (for example from a fresh notebook cell) without
    relying on ``main()`` having prepared the folder beforehand.

    Returns:
        tuple: ``(x_train, y_train, x_test, y_test)`` exactly as produced by
        ``mnist.load_data()``.
    """
    (x_train, y_train), (x_test, y_test) = mnist.load_data()

    # np.save does not create parent folders; make sure the target exists.
    raw_dir = os.path.join('data', 'raw')
    os.makedirs(raw_dir, exist_ok=True)

    # Saving raw data
    np.save(os.path.join(raw_dir, 'x_train_raw.npy'), x_train)
    np.save(os.path.join(raw_dir, 'y_train_raw.npy'), y_train)
    np.save(os.path.join(raw_dir, 'x_test_raw.npy'), x_test)
    np.save(os.path.join(raw_dir, 'y_test_raw.npy'), y_test)

    return x_train, y_train, x_test, y_test

# Function to preprocess data to an interim stage
def preprocess_to_interim(x_train, x_test):
    """Convert pixel arrays to float32, divide by 255 and save under ``data/interim``.

    The output directory is created when missing, so the function works
    even if ``main()`` has not prepared the folder structure.

    Args:
        x_train: NumPy array of training images. Values in 0-255 end up in
            the range [0, 1].
        x_test: NumPy array of test images, treated the same way.

    Returns:
        tuple: ``(x_train_interim, x_test_interim)``, both float32 arrays with
        the same shapes as the inputs.
    """
    # Normalizing the images to the range of [0, 1]
    x_train_interim = x_train.astype("float32") / 255
    x_test_interim = x_test.astype("float32") / 255

    # np.save does not create parent folders; make sure the target exists.
    interim_dir = os.path.join('data', 'interim')
    os.makedirs(interim_dir, exist_ok=True)

    # Saving interim data
    np.save(os.path.join(interim_dir, 'x_train_interim.npy'), x_train_interim)
    np.save(os.path.join(interim_dir, 'x_test_interim.npy'), x_test_interim)

    return x_train_interim, x_test_interim

# Function to further process data to the final processed stage
def process_to_final(x_train_interim, y_train, x_test_interim, y_test):
    """Reshape images to ``(-1, 28, 28, 1)`` and save the final data set.

    Images and labels are written to ``data/processed``; the directory is
    created when missing, so the function does not depend on ``main()``
    having prepared it.

    Args:
        x_train_interim: NumPy array of training images whose size is a
            multiple of 28*28.
        y_train: Training labels, saved unchanged.
        x_test_interim: NumPy array of test images, reshaped like the
            training images.
        y_test: Test labels, saved unchanged.

    Returns:
        tuple: ``(x_train_processed, x_test_processed)``; the labels are only
        written to disk and not returned.
    """
    # Reshaping the data to fit model input (trailing axis of size 1 added)
    x_train_processed = x_train_interim.reshape((-1, 28, 28, 1))
    x_test_processed = x_test_interim.reshape((-1, 28, 28, 1))

    # np.save does not create parent folders; make sure the target exists.
    processed_dir = os.path.join('data', 'processed')
    os.makedirs(processed_dir, exist_ok=True)

    # Saving processed data
    np.save(os.path.join(processed_dir, 'x_train_processed.npy'), x_train_processed)
    np.save(os.path.join(processed_dir, 'y_train_processed.npy'), y_train)
    np.save(os.path.join(processed_dir, 'x_test_processed.npy'), x_test_processed)
    np.save(os.path.join(processed_dir, 'y_test_processed.npy'), y_test)

    return x_train_processed, x_test_processed

# Main function to run the data processing pipeline
def main():
    """Run the MNIST pipeline end to end: raw -> interim -> processed."""
    # Every stage writes into its own sub-folder of 'data'; create them up front.
    for stage in ('raw', 'interim', 'processed'):
        os.makedirs(os.path.join('data', stage), exist_ok=True)

    # Stage 1: fetch the data set and keep an untouched copy on disk.
    raw_train_x, train_y, raw_test_x, test_y = load_and_save_raw_mnist()

    # Stage 2: float32 pixel values divided by 255.
    interim_train_x, interim_test_x = preprocess_to_interim(raw_train_x, raw_test_x)

    # Stage 3: reshaped arrays; they are saved by the callee and not used further here.
    _final_train_x, _final_test_x = process_to_final(
        interim_train_x, train_y, interim_test_x, test_y
    )

    print("Data processing complete. Data saved in 'data' directory.")


if __name__ == "__main__":
    main()
