# Data Preprocessing Notebook
- Download the Kaggle dataset and use MediaPipe to preprocess the images
- Save the extracted features and landmarks

## Configurations and Imports

In [None]:
import os
import sys
import numpy as np

# If not already accessible
sys.path.append('src')

# Import utility function
from data_utils import extract_keypoints

In [None]:
# --- Configuration (edit the Kaggle dataset ID or the local paths here) ---
KAGGLE_DATASET_ID = "grassknoted/asl-alphabet"
DESTINATION_PATH = "sample_data"
PROCESSED_OUTPUT_DIR = 'processed_data'
# Common folder name after unzipping the archive.
DATA_ROOT_FOLDER_NAME = 'asl_alphabet_train'

# Create both working directories up front; existing ones are left untouched.
for _directory in (DESTINATION_PATH, PROCESSED_OUTPUT_DIR):
    os.makedirs(_directory, exist_ok=True)

## Download Data via Kaggle API

In [None]:
print(f"Downloading dataset: {KAGGLE_DATASET_ID}")
# IPython shell escape: runs the Kaggle CLI, with the {NAME} placeholders
# filled in from the Python variables of the same name. The archive is
# downloaded into DESTINATION_PATH and unzipped there (--unzip).
# NOTE(review): presumably requires Kaggle API credentials to be configured
# in this environment — confirm before running.
!kaggle datasets download -d {KAGGLE_DATASET_ID} -p {DESTINATION_PATH} --unzip

# Define the exact root path to the image subfolders (A, B, C, etc.)
# NOTE(review): assumes the archive unpacks to
# DESTINATION_PATH/DATA_ROOT_FOLDER_NAME/<class folders>; verify the layout
# on disk, since an extra nesting level would make this path point one
# directory too high.
DATA_ROOT = os.path.join(DESTINATION_PATH, DATA_ROOT_FOLDER_NAME)
print(f"Image data root set to: {DATA_ROOT}")

## Feature Extraction and Array Storage

In [None]:
# Google Cloud Storage destination for the processed arrays. The two values
# are joined as "<bucket>/<folder>" when the features are uploaded.
# The bucket name below looks like a placeholder — replace it with a real
# bucket before running the upload.
GCS_BUCKET_NAME = "gs://your-handsign-recognition-bucket" 
GCS_DESTINATION_FOLDER = "processed_features_v1"

def create_and_save_features():
    """Extract features for every class folder, save them, and upload to GCS.

    Walks each class sub-folder of ``DATA_ROOT``, calls ``extract_keypoints``
    on every file in it, and keeps only the samples for which keypoints were
    returned. Three arrays are built from the kept samples:

    * keypoints (float32),
    * images with the channel axis reversed and values divided by 255
      (float32),
    * integer class labels (int32).

    The arrays are written as ``.npy`` files into a local
    ``temp_feature_dump`` directory, which is then copied recursively to
    ``{GCS_BUCKET_NAME}/{GCS_DESTINATION_FOLDER}`` with ``gsutil``.

    A label is the position of the class folder in the sorted list of class
    folders, so labels are contiguous from 0 even when ``DATA_ROOT`` also
    contains hidden entries or stray files.

    Raises:
        OSError: if the ``gsutil`` executable cannot be launched.
        subprocess.CalledProcessError: if ``gsutil`` exits with a non-zero
            status. The local ``.npy`` files are already written by then.
    """
    import subprocess

    # Filter BEFORE enumerating: numbering the raw directory listing would let
    # hidden entries and stray files consume label indices and leave gaps.
    class_names = [
        name
        for name in sorted(os.listdir(DATA_ROOT))
        if not name.startswith('.')
        and os.path.isdir(os.path.join(DATA_ROOT, name))
    ]

    X_keypoints, X_cnn, y_labels = [], [], []

    for label_index, class_name in enumerate(class_names):
        class_path = os.path.join(DATA_ROOT, class_name)
        print(f"Processing Class: {class_name} (Label: {label_index})")

        # Sorted so the sample order does not depend on the file system.
        for image_name in sorted(os.listdir(class_path)):
            image_path = os.path.join(class_path, image_name)

            # Use the imported modular function
            keypoints, resized_img = extract_keypoints(image_path)
            if keypoints is None:
                # No keypoints returned for this file (presumably no hand was
                # detected or the file is unreadable); skip the sample.
                continue

            X_keypoints.append(keypoints)

            # BGR -> RGB by reversing the channel axis with NumPy. The
            # notebook never imports cv2, so calling cv2.cvtColor here raised
            # NameError on the first usable sample.
            # NOTE(review): assumes resized_img is an (H, W, 3) BGR array —
            # confirm against data_utils.extract_keypoints.
            rgb_image = np.asarray(resized_img)[..., ::-1]

            # Cast each image to float32 right away; the final array is
            # float32 anyway, and this avoids holding every image as float64.
            X_cnn.append((rgb_image / 255.0).astype(np.float32))
            y_labels.append(label_index)

    # Convert to final NumPy arrays
    X_keypoints_array = np.array(X_keypoints, dtype=np.float32)
    X_cnn_array = np.array(X_cnn, dtype=np.float32)
    y_labels_array = np.array(y_labels, dtype=np.int32)

    temp_dir = 'temp_feature_dump'
    os.makedirs(temp_dir, exist_ok=True)

    # Save the processed data to the local staging folder that gets uploaded.
    arrays_by_file = {
        'X_keypoints.npy': X_keypoints_array,
        'X_cnn_images.npy': X_cnn_array,
        'y_labels.npy': y_labels_array,
    }
    for file_name, array in arrays_by_file.items():
        np.save(os.path.join(temp_dir, file_name), array)

    print(f"\nSuccessfully processed {len(X_keypoints)} samples.")
    print(f"Keypoints Shape: {X_keypoints_array.shape}, Images Shape: {X_cnn_array.shape}")
    print(f"All processed arrays saved to the '{temp_dir}' directory.")

    # Source is the local temp directory. Destination is the GCS path.
    gcs_path = f"{GCS_BUCKET_NAME}/{GCS_DESTINATION_FOLDER}"
    print(f"\nUploading processed features to {gcs_path}...")

    # -m runs the copy multi-threaded (faster) and -r copies the directory
    # recursively. check=True makes a failed upload raise instead of falling
    # through to the success message below.
    subprocess.run(["gsutil", "-m", "cp", "-r", temp_dir, gcs_path], check=True)

    print("\nUpload to GCS complete. Features are ready for training notebook.")

# --- EXECUTION ---
# Runs the whole pipeline: feature extraction over DATA_ROOT, saving the
# arrays locally, and the gsutil upload to the configured GCS path.
create_and_save_features()