# Data Processing

In [None]:
# Notebook-wide verbosity switch: later cells check this flag before printing
# progress and shape information.
DEBUG = True

# Announce up front that verbose output is active.
if DEBUG:
    print("Debugging Enabled...")

Debugging Enabled...


### Process Data Into Global & Categorized CSVs
- Writes every labeled image path to a main initialCSVLabels.csv
- Splits that main CSV into 2 sub-CSVs (80% train / 20% test):
    - initialLabeledTrain.csv
    - initialLabeledTest.csv



In [None]:
import os
import random
import pandas as pd
from sklearn.model_selection import train_test_split

# Source folders: one per class.
LANDSLIDE_DIR = "/content/drive/MyDrive/GradProjectFiles/RAW_CCTV_Images/landslides"
NORMAL_DIR = "/content/drive/MyDrive/GradProjectFiles/RAW_CCTV_Images/standard"

# Output CSVs: the full labeled listing plus its train/test split.
CSV_PATH = "/content/drive/MyDrive/GradProjectFiles/initialCSVLabels.csv"
TRAIN_CSV_PATH = "/content/drive/MyDrive/GradProjectFiles/initialLabeledTrain.csv"
TEST_CSV_PATH = "/content/drive/MyDrive/GradProjectFiles/initialLabeledTest.csv"

# Ensure the parent directory of every output CSV exists
for csv_path in (CSV_PATH, TRAIN_CSV_PATH, TEST_CSV_PATH):
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)


def _label_directory(directory, label):
    """Build one {"file_path", "label"} record per regular file in `directory`.

    Args:
        directory: Folder whose direct children are the image files.
        label: Class label stored with every file found in `directory`.

    Returns:
        A list of dicts with keys "file_path" and "label". Sub-directories
        are skipped.
    """
    records = []
    # os.listdir returns names in arbitrary order; sorting keeps the row order
    # (and therefore the seeded train/test split below) identical across runs.
    for img_name in sorted(os.listdir(directory)):
        img_path = os.path.join(directory, img_name)
        if os.path.isfile(img_path):  # Ensure it's a file
            records.append({"file_path": img_path, "label": label})
    return records


# Create labels
data = []

if DEBUG:
    print("Assigning Normal Labels")
# Assign label 0 for normal road images
data.extend(_label_directory(NORMAL_DIR, 0))

if DEBUG:
    print("Assigning Landslide Labels")
# Assign label 1 for landslide images
data.extend(_label_directory(LANDSLIDE_DIR, 1))

# Save to CSV. Columns are named explicitly so the header is written even when
# no files were found.
df = pd.DataFrame(data, columns=["file_path", "label"])
df.to_csv(CSV_PATH, index=False)

print(f"CSV saved to: {CSV_PATH}")

# Split dataset (80% train / 20% test, fixed seed).
# NOTE(review): the split is not stratified by label; consider
# stratify=df["label"] if the two classes are imbalanced.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Save splits
train.to_csv(TRAIN_CSV_PATH, index=False)
test.to_csv(TEST_CSV_PATH, index=False)

print(f"Dataset split completed: {TRAIN_CSV_PATH}, {TEST_CSV_PATH}")

Assigning Normal Labels
Assigning Landslide Labels
CSV saved to: /content/drive/MyDrive/GradProjectFiles/initialCSVLabels.csv
Dataset split completed: /content/drive/MyDrive/GradProjectFiles/initialLabeledTrain.csv, /content/drive/MyDrive/GradProjectFiles/initialLabeledTest.csv


### Break data down into NPZ arrays
- Images are processed in batches of 1,000 and saved as per-batch sub-arrays, which are then recombined into single Train, Val, and Test arrays

In [None]:
import os
import gc
from PIL import Image
import numpy as np
import pandas as pd
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Label CSVs produced by the CSV-generation cell.
CSV_PATH = '/content/drive/MyDrive/GradProjectFiles/initialCSVLabels.csv'
TRAIN_CSV_PATH = '/content/drive/MyDrive/GradProjectFiles/initialLabeledTrain.csv'
TEST_CSV_PATH = '/content/drive/MyDrive/GradProjectFiles/initialLabeledTest.csv'

# Folder that receives both the per-batch and the recombined .npz archives.
numpySaves = "/content/drive/MyDrive/GradProjectFiles/NumPyDataSaves"

# Passed to PIL's Image.resize, which takes (width, height); the resulting
# image arrays are therefore shaped (260, 320, 3).
ImgSize = (320,260)
NumClasses = 2  # Binary classification: normal (0) vs landslide (1)
EpochNum = 50
# Number of images to process per batch (adjust based on memory capacity).
# This is the single definition of BatchSize; it is the preprocessing chunk
# size, not a training mini-batch size.
BatchSize = 1000

# Rows of (file_path, label) for each split.
trainDataFrame = pd.read_csv(TRAIN_CSV_PATH)
testDataFrame = pd.read_csv(TEST_CSV_PATH)

os.makedirs(numpySaves, exist_ok=True)

def loadAndPreprocessImage(path):
    """Load one image file as a normalized float32 RGB array.

    Args:
        path: Location of the image file to read.

    Returns:
        A float32 NumPy array of the image converted to RGB, resized to
        ImgSize and divided by 255.0, or None when the file cannot be
        opened or decoded (the failure is printed, not raised).
    """
    try:
        # Image.open keeps the file open until the image is closed; the
        # context manager releases the handle as soon as the pixel data has
        # been copied into the NumPy array.
        with Image.open(path) as raw_img:
            img = raw_img.convert("RGB")  # Load image with PIL
            img = img.resize(ImgSize)
            return np.array(img, dtype=np.float32) / 255.0  # Normalize
    except Exception as e:
        # Deliberate best-effort: unreadable files are reported and skipped
        # so one bad image does not abort a whole batch.
        print(f"Skipping invalid image: {path} - Error: {e}")
        return None


# Batch Processing Function
def process_and_save_batches(df, dataset_type, batch_size):
    """Convert a labeled image listing into per-batch compressed .npz files.

    Args:
        df: DataFrame with a "file_path" and a "label" column.
        dataset_type: Prefix used in the output file names (e.g. "train").
        batch_size: Maximum number of rows handled per batch.

    For each batch, the loadable images and their one-hot labels are written
    to numpySaves as "<dataset_type>Images_batch<k>.npz" and
    "<dataset_type>Labels_batch<k>.npz" (k is zero-based). Images that fail
    to load are left out; a batch with no loadable image writes nothing.
    """
    total_images = len(df)
    whole_batches, leftover = divmod(total_images, batch_size)
    num_batches = whole_batches + 1 if leftover else whole_batches

    if DEBUG:
        print(f"Processing {dataset_type} dataset in {num_batches} batches of {batch_size} images each...")

    # Stepping by batch_size visits exactly num_batches start offsets.
    for batch_idx, start_idx in enumerate(range(0, total_images, batch_size)):
        chunk = df.iloc[start_idx:start_idx + batch_size]

        kept_images = []
        kept_labels = []
        for path, label in zip(chunk["file_path"], chunk["label"]):
            pixels = loadAndPreprocessImage(path)
            if pixels is None:
                continue
            kept_images.append(pixels)
            kept_labels.append(label)

        # Nothing usable in this batch: no files are written for it.
        if not kept_images:
            continue

        image_stack = np.array(kept_images)
        del kept_images  # the stacked copy is all that is needed from here on
        onehot_labels = to_categorical(np.array(kept_labels), num_classes=NumClasses)

        # Print the shape of the data in each batch
        if DEBUG:
            print(f"Shape of {dataset_type} batch {batch_idx+1} images: {image_stack.shape}")
            print(f"Shape of {dataset_type} batch {batch_idx+1} labels: {onehot_labels.shape}")

        # Save the batch in compressed format to reduce disk usage
        images_file = os.path.join(numpySaves, f"{dataset_type}Images_batch{batch_idx}.npz")
        labels_file = os.path.join(numpySaves, f"{dataset_type}Labels_batch{batch_idx}.npz")
        np.savez_compressed(images_file, image_stack)
        np.savez_compressed(labels_file, onehot_labels)

        if DEBUG:
            print(f"Saved batch {batch_idx+1}/{num_batches} for {dataset_type}")

        # Free up memory
        del image_stack, onehot_labels
        gc.collect()

def recombine_numpy_batches(dataset_type):
    """Merge the per-batch .npz files of one dataset into two combined files.

    Reads every "<dataset_type>Images_batch<k>.npz" /
    "<dataset_type>Labels_batch<k>.npz" pair found in numpySaves,
    concatenates them along the first axis in ascending batch-number order,
    and writes "<dataset_type>Images.npz" and "<dataset_type>Labels.npz"
    back to numpySaves.

    Args:
        dataset_type: File-name prefix of the dataset (e.g. "train", "test").

    Returns:
        None. If no image or no label batch files exist, an error is printed
        and nothing is written.

    Raises:
        ValueError: If a batch number has an images file without a labels
            file (or vice versa), or if a pair holds a different number of
            rows. Either case would otherwise attach labels to the wrong
            images.
    """
    archive_ext = ".npz"

    def _index_by_batch(prefix):
        """Map batch number -> file name for files named <prefix><number>.npz."""
        found = {}
        for file_name in os.listdir(numpySaves):
            if file_name.startswith(prefix) and file_name.endswith(archive_ext):
                token = file_name[len(prefix):-len(archive_ext)]
                if token.isdigit():
                    found[int(token)] = file_name
        return found

    image_batches = _index_by_batch(f"{dataset_type}Images_batch")
    label_batches = _index_by_batch(f"{dataset_type}Labels_batch")

    if len(image_batches) == 0 or len(label_batches) == 0:
        print(f"Error: No batch files found for {dataset_type}. Check if they were saved correctly.")
        return

    # Pair files by batch number rather than by list position, so a missing
    # file cannot shift every later label batch onto the wrong images.
    unpaired = sorted(set(image_batches) ^ set(label_batches))
    if unpaired:
        raise ValueError(
            f"{dataset_type}: batch numbers {unpaired} have an images file or a "
            f"labels file, but not both. Re-run batch processing before recombining."
        )

    if DEBUG:
        print(f"Recombining {len(image_batches)} image batches and {len(label_batches)} label batches for {dataset_type} dataset...")

    final_images = []
    final_labels = []

    # Numeric order (0, 1, 2, ..., 10) instead of lexicographic file-name
    # order (0, 1, 10, ..., 2) keeps the rows in the order they were written.
    for i, batch_no in enumerate(sorted(image_batches)):
        img_path = os.path.join(numpySaves, image_batches[batch_no])
        lbl_path = os.path.join(numpySaves, label_batches[batch_no])

        # Load compressed numpy arrays properly
        with np.load(img_path) as img_data:
            img_array = img_data["arr_0"]  # Extract the array correctly

        with np.load(lbl_path) as lbl_data:
            lbl_array = lbl_data["arr_0"]  # Extract the array correctly

        if img_array.shape[0] != lbl_array.shape[0]:
            raise ValueError(
                f"{dataset_type} batch {batch_no}: {img_array.shape[0]} images "
                f"but {lbl_array.shape[0]} labels."
            )

        if DEBUG:
            print(f"Batch {i+1} - Images: {img_array.shape}, Labels: {lbl_array.shape}")

        # The lists keep these arrays alive until concatenation, so there is
        # nothing to free per iteration.
        final_images.append(img_array)
        final_labels.append(lbl_array)

    # Concatenate along the first dimension (batch size)
    final_images = np.concatenate(final_images, axis=0)
    final_labels = np.concatenate(final_labels, axis=0)

    if DEBUG:
        print(f"Final combined shape - Images: {final_images.shape}, Labels: {final_labels.shape}")

    # Save images and labels separately
    np.savez_compressed(os.path.join(numpySaves, f"{dataset_type}Images.npz"), final_images)
    np.savez_compressed(os.path.join(numpySaves, f"{dataset_type}Labels.npz"), final_labels)

    print(f"Successfully recombined and saved {dataset_type} dataset separately:\n"
          f"Images: {final_images.shape}, Labels: {final_labels.shape}")

    # Free memory
    del final_images, final_labels
    gc.collect()

# Process training data in batches
# process_and_save_batches(trainDataFrame, "train", BatchSize)
# Process test data in batches
# process_and_save_batches(testDataFrame, "test", BatchSize)

# Recombine batches into a single file
if DEBUG:
    print("Recombine into single numpy arrays")
recombine_numpy_batches("train")
recombine_numpy_batches("test")


def _load_saved_array(file_name):
    """Return the single array stored in numpySaves/<file_name>.

    The archive is opened in a context manager so its file handle is closed
    once the array has been read, matching recombine_numpy_batches.
    """
    with np.load(os.path.join(numpySaves, file_name)) as archive:
        return archive["arr_0"]


# Load the final `.npz` dataset
trainImages = _load_saved_array("trainImages.npz")
trainLabels = _load_saved_array("trainLabels.npz")
testImages = _load_saved_array("testImages.npz")
testLabels = _load_saved_array("testLabels.npz")

# Split training data further into train and validation sets
if DEBUG:
    print("Generating validation images and labels...")
trainImages, valImages, trainLabels, valLabels = train_test_split(trainImages, trainLabels, test_size=0.2, random_state=42)

# NOTE: the train archives are overwritten below with the post-split training
# subset. Re-running this part without the recombine step above would split
# the already-split data again.
if DEBUG:
    print("Saving numpy array data...")
np.savez_compressed(os.path.join(numpySaves, "trainImages.npz"), trainImages)
np.savez_compressed(os.path.join(numpySaves, "trainLabels.npz"), trainLabels)
np.savez_compressed(os.path.join(numpySaves, "valImages.npz"), valImages)
np.savez_compressed(os.path.join(numpySaves, "valLabels.npz"), valLabels)
# The test arrays are written back unchanged from what was loaded above.
np.savez_compressed(os.path.join(numpySaves, "testImages.npz"), testImages)
np.savez_compressed(os.path.join(numpySaves, "testLabels.npz"), testLabels)

if DEBUG:
    print(f"Train: {trainImages.shape}, {trainLabels.shape}")
    print(f"Validation: {valImages.shape}, {valLabels.shape}")
    print(f"Test: {testImages.shape}, {testLabels.shape}")


Recombine into single numpy arrays
Recombining 33 image batches and 33 label batches for train dataset...
Batch 1 - Images: (999, 260, 320, 3), Labels: (999, 2)
Batch 2 - Images: (999, 260, 320, 3), Labels: (999, 2)
Batch 3 - Images: (999, 260, 320, 3), Labels: (999, 2)
Batch 4 - Images: (1000, 260, 320, 3), Labels: (1000, 2)
Batch 5 - Images: (999, 260, 320, 3), Labels: (999, 2)
Batch 6 - Images: (999, 260, 320, 3), Labels: (999, 2)
Batch 7 - Images: (1000, 260, 320, 3), Labels: (1000, 2)
Batch 8 - Images: (999, 260, 320, 3), Labels: (999, 2)
Batch 9 - Images: (1000, 260, 320, 3), Labels: (1000, 2)
Batch 10 - Images: (999, 260, 320, 3), Labels: (999, 2)
Batch 11 - Images: (1000, 260, 320, 3), Labels: (1000, 2)
Batch 12 - Images: (1000, 260, 320, 3), Labels: (1000, 2)
Batch 13 - Images: (1000, 260, 320, 3), Labels: (1000, 2)
Batch 14 - Images: (1000, 260, 320, 3), Labels: (1000, 2)
Batch 15 - Images: (1000, 260, 320, 3), Labels: (1000, 2)
Batch 16 - Images: (999, 260, 320, 3), Labels: 