# Classification using Deep Learning with Histogram data

---

### Reading the data

First, we’ll load the saved image and label data from the NumPy files.

In [None]:
import numpy as np
import time

# Wall-clock start of the notebook; the final cell subtracts this value to report total runtime.
notebook_start_time = time.time()

# Load the provided training and validation arrays (images and their labels) from the 'data/' folder
train_images = np.load('data/train_images.npy')
train_labels = np.load('data/train_labels.npy')
val_images = np.load('data/val_images.npy')
val_labels = np.load('data/val_labels.npy')

In [2]:
import numpy as np

def _print_class_distribution(heading, labels):
    """Print `heading`, then one "Class <label>: <n> samples" line per distinct label.

    Returns the (unique labels, counts) pair produced by np.unique.
    """
    classes, totals = np.unique(labels, return_counts=True)
    print(heading)
    for label, count in zip(classes, totals):
        print(f"Class {label}: {count} samples")
    return classes, totals


# --- Array-level summary of the two image sets ---
print("=== Image Characteristics ===")
print(f"Shape of training set: {train_images.shape} (Samples, Height, Width, Channels)")
print(f"Shape of validation set: {val_images.shape} (Samples, Height, Width, Channels)")

# Axes 1 and 2 hold height and width
image_resolution = train_images.shape[1:3]
print(f"Image resolution: {image_resolution} pixels")

# Sample counts, pixel dtype and value range
print(f"Number of images in training set: {train_images.shape[0]}")
print(f"Number of images in validation set: {val_images.shape[0]}")
print(f"Data type of images: {train_images.dtype}")
print(f"Minimum and maximum values in training dataset: {train_images.min()} to {train_images.max()}")

# --- Per-class sample counts for each set ---
unique_labels_train, counts_train = _print_class_distribution(
    "\n=== Class Distribution in Training Set ===", train_labels
)
unique_labels_val, counts_val = _print_class_distribution(
    "\n=== Class Distribution in Validation Set ===", val_labels
)

=== Image Characteristics ===
Shape of training set: (9711, 512, 512, 3) (Samples, Height, Width, Channels)
Shape of validation set: (3237, 512, 512, 3) (Samples, Height, Width, Channels)
Image resolution: (512, 512) pixels
Number of images in training set: 9711
Number of images in validation set: 3237
Data type of images: uint8
Minimum and maximum values in training dataset: 0 to 255

=== Class Distribution in Training Set ===
Class 0: 2149 samples
Class 1: 635 samples
Class 2: 1186 samples
Class 3: 2140 samples
Class 4: 3601 samples

=== Class Distribution in Validation Set ===
Class 0: 727 samples
Class 1: 222 samples
Class 2: 421 samples
Class 3: 721 samples
Class 4: 1146 samples


## Combining training and validation sets

In [3]:
import numpy as np
import os

# On-disk cache location for the merged (training + validation) dataset
cached_data_dir = 'cached_data'
os.makedirs(cached_data_dir, exist_ok=True)

all_images_path = os.path.join(cached_data_dir, 'all_images.npy')
all_labels_path = os.path.join(cached_data_dir, 'all_labels.npy')

# Rebuild the merged arrays only when at least one cache file is missing;
# otherwise reuse what a previous run wrote.
if not (os.path.exists(all_images_path) and os.path.exists(all_labels_path)):
    print("Combined files do not exist. Concatenating data...")
    # Stack validation samples after the training samples along the sample axis
    all_images = np.concatenate((train_images, val_images), axis=0)
    all_labels = np.concatenate((train_labels, val_labels), axis=0)

    # Persist for subsequent runs
    np.save(all_images_path, all_images)
    np.save(all_labels_path, all_labels)
    print("Data concatenated and saved.")
else:
    print("Combined files already exist. Loading from disk...")
    all_images = np.load(all_images_path)
    all_labels = np.load(all_labels_path)

# Summary of the merged image array
print("=== Image Characteristics ===")
print(f"Shape of combined dataset: {all_images.shape} (Samples, Height, Width, Channels)")
print(f"Image resolution: {all_images.shape[1:3]} pixels")
print(f"Total number of images: {all_images.shape[0]}")
print(f"Data type of images: {all_images.dtype}")
print(f"Minimum and maximum values in combined dataset: {all_images.min()} to {all_images.max()}")

# Per-class sample counts of the merged labels
unique_labels, counts = np.unique(all_labels, return_counts=True)
print("\n=== Class Distribution in Combined Dataset ===")
for label, count in zip(unique_labels, counts):
    print(f"Class {label}: {count} samples")

print(f"\nData saved in: {cached_data_dir}")

Combined files already exist. Loading from disk...
=== Image Characteristics ===
Shape of combined dataset: (12948, 512, 512, 3) (Samples, Height, Width, Channels)
Image resolution: (512, 512) pixels
Total number of images: 12948
Data type of images: uint8
Minimum and maximum values in combined dataset: 0 to 255

=== Class Distribution in Combined Dataset ===
Class 0: 2876 samples
Class 1: 857 samples
Class 2: 1607 samples
Class 3: 2861 samples
Class 4: 4747 samples

Data saved in: cached_data


In [4]:
import gc
# The combined arrays built/loaded in the previous cell are what later cells use,
# so drop the four original arrays to release their memory.
del train_images, train_labels, val_images, val_labels
# Force a collection pass; as the cell's last expression its return value is shown as the cell output.
gc.collect()

22

## Applying SMOTE for class balancing

In [5]:
import numpy as np
import os
from imblearn.over_sampling import SMOTE
from sklearn.utils import shuffle

# Directory (inside the cache folder) that holds the SMOTE-balanced dataset
smote_dir = os.path.join(cached_data_dir, "SMOTE")
os.makedirs(smote_dir, exist_ok=True)

# File paths for saved SMOTE data
smote_images_path = os.path.join(smote_dir, "all_images_smote.npy")
smote_labels_path = os.path.join(smote_dir, "all_labels_smote.npy")

# Reuse the balanced arrays from a previous run when both files are present;
# otherwise run SMOTE on the combined dataset and cache the result.
# NOTE(review): the cache is keyed only on file existence, so it is not refreshed if the inputs change.
if os.path.exists(smote_images_path) and os.path.exists(smote_labels_path):
    print("SMOTE files already exist. Loading from disk...")
    X_resampled = np.load(smote_images_path)
    y_resampled = np.load(smote_labels_path)
else:
    print("SMOTE files do not exist. Applying SMOTE...")
    # SMOTE works on a 2D (samples, features) matrix, so each image is flattened into one row
    num_samples, height, width, channels = all_images.shape
    all_images_reshaped = all_images.reshape(num_samples, -1)  # Flatten images to (num_samples, features)

    # Oversample so the classes are balanced (fixed seed for repeatability)
    # NOTE(review): resampling happens on the combined train+validation data, before any
    # hold-out split is made — synthetic samples may therefore be derived from images that
    # are later used for evaluation. Confirm this is intended.
    smote = SMOTE(sampling_strategy='auto', random_state=42)
    X_resampled, y_resampled = smote.fit_resample(all_images_reshaped, all_labels)

    # Restore the (samples, height, width, channels) image layout
    X_resampled = X_resampled.reshape(-1, height, width, channels)

    # Shuffle images and labels together (same seed) to avoid sequential patterns
    X_resampled, y_resampled = shuffle(X_resampled, y_resampled, random_state=42)

    # Save the new balanced files
    np.save(smote_images_path, X_resampled)
    np.save(smote_labels_path, y_resampled)
    print("SMOTE applied and files saved.")

# Display the per-class sample counts after balancing
unique_labels, counts = np.unique(y_resampled, return_counts=True)
print("\n=== Class Distribution after SMOTE ===")
for label, count in zip(unique_labels, counts):
    print(f"Class {label}: {count} samples")

# Printed on both branches (also when the data was only loaded from the cache)
print(f"\nBalanced data saved in: {smote_dir}")

SMOTE files already exist. Loading from disk...

=== Class Distribution after SMOTE ===
Class 0: 4747 samples
Class 1: 4747 samples
Class 2: 4747 samples
Class 3: 4747 samples
Class 4: 4747 samples

Balanced data saved in: cached_data/SMOTE


## Splitting the combined set: 80% for training and 20% for validation

In [6]:
import numpy as np
import os
from sklearn.model_selection import train_test_split

def _show_class_counts(heading, labels):
    """Print `heading`, then one "Class <label>: <n> samples" line per distinct label."""
    classes, totals = np.unique(labels, return_counts=True)
    print(heading)
    for label, count in zip(classes, totals):
        print(f"Class {label}: {count} samples")
    return classes, totals


# Output folders for the two splits, nested under the SMOTE cache directory
train_dir = os.path.join(smote_dir, "training")
valid_dir = os.path.join(smote_dir, "validation")
for split_dir in (train_dir, valid_dir):
    os.makedirs(split_dir, exist_ok=True)

# Cache files for the split arrays
train_images_path = os.path.join(train_dir, "train_images.npy")
train_labels_path = os.path.join(train_dir, "train_labels.npy")
val_images_path = os.path.join(valid_dir, "val_images.npy")
val_labels_path = os.path.join(valid_dir, "val_labels.npy")
split_paths = (train_images_path, train_labels_path, val_images_path, val_labels_path)

if all(os.path.exists(path) for path in split_paths):
    # All four files are cached: reuse them
    print("Split files already exist. Loading from disk...")
    train_images_final = np.load(train_images_path)
    train_labels_final = np.load(train_labels_path)
    valid_images_final = np.load(val_images_path)
    valid_labels_final = np.load(val_labels_path)
else:
    print("Split files do not exist. Splitting data...")
    # One accumulator per output of train_test_split, in the order it returns them:
    # training images, validation images, training labels, validation labels
    buckets = ([], [], [], [])

    unique_labels = np.unique(y_resampled)
    for label in unique_labels:
        # Positions of the samples of the current class in the resampled data
        members = np.flatnonzero(y_resampled == label)
        images_class = X_resampled[members]
        labels_class = y_resampled[members]

        # 80% / 20% split of this class alone
        pieces = train_test_split(
            images_class, labels_class, test_size=0.2, random_state=42, stratify=labels_class
        )
        for bucket, piece in zip(buckets, pieces):
            bucket.append(piece)

    train_images_list, valid_images_list, train_labels_list, valid_labels_list = buckets

    # Join the per-class pieces back into single arrays
    train_images_final = np.concatenate(train_images_list, axis=0)
    train_labels_final = np.concatenate(train_labels_list, axis=0)
    valid_images_final = np.concatenate(valid_images_list, axis=0)
    valid_labels_final = np.concatenate(valid_labels_list, axis=0)

    # Cache the split for subsequent runs
    np.save(train_images_path, train_images_final)
    np.save(train_labels_path, train_labels_final)
    np.save(val_images_path, valid_images_final)
    np.save(val_labels_path, valid_labels_final)
    print("Data split and saved.")

# Report the per-class sample counts of both splits
unique_train, count_train = _show_class_counts(
    "\n=== Class Distribution in Training Set ===", train_labels_final
)
unique_valid, count_valid = _show_class_counts(
    "\n=== Class Distribution in Validation Set ===", valid_labels_final
)

print(f"\nData saved in:\nTraining -> {train_dir}\nValidation -> {valid_dir}")

Split files already exist. Loading from disk...

=== Class Distribution in Training Set ===
Class 0: 3797 samples
Class 1: 3797 samples
Class 2: 3797 samples
Class 3: 3797 samples
Class 4: 3797 samples

=== Class Distribution in Validation Set ===
Class 0: 950 samples
Class 1: 950 samples
Class 2: 950 samples
Class 3: 950 samples
Class 4: 950 samples

Data saved in:
Training -> cached_data/SMOTE/training
Validation -> cached_data/SMOTE/validation


In [7]:
import numpy as np

# Load the training split written by the previous cell
train_images = np.load(os.path.join(train_dir, 'train_images.npy'))
train_labels = np.load(os.path.join(train_dir, 'train_labels.npy'))
# NOTE(review): these two validation arrays are not read again before a later cell
# rebinds `val_images` / `val_labels` to the raw files under 'data/' — consider dropping them.
val_images = np.load(os.path.join(valid_dir, 'val_images.npy'))
val_labels = np.load(os.path.join(valid_dir, 'val_labels.npy'))

# File path for saved bin edges
bin_edges_path = os.path.join(cached_data_dir, 'bin_edges.npy')

# Reuse cached bin edges when available; otherwise derive them from the training images
if os.path.exists(bin_edges_path):
    print("Bin edges file already exists. Loading from disk...")
    # The file holds a plain numeric array, so pickle support is not needed; this matches
    # how the later cells load the same file.
    bin_edges = np.load(bin_edges_path)
else:
    print("Bin edges file does not exist. Computing quantiles...")

    # Define quantile levels (deciles -> 10 bins per channel)
    quantile_levels = np.linspace(0, 1, num=11)  # 0.0, 0.1, ..., 1.0
    bin_edges = []

    # Compute quantiles of the pixel values for each channel (R, G, B)
    for channel in range(3):
        channel_pixels = train_images[:, :, :, channel].flatten()
        edges = np.quantile(channel_pixels, quantile_levels)
        bin_edges.append(edges)

    # Keep the in-memory value as an array so it has the same type as on the cached path
    bin_edges = np.array(bin_edges)

    # Save the computed bin edges
    np.save(bin_edges_path, bin_edges)
    print("Quantiles computed and bin edges saved.")

# Display the bin edges (optional)
print("\n=== Bin Edges ===")
for channel, edges in enumerate(bin_edges):
    print(f"Channel {channel}: {edges}")

Bin edges file already exists. Loading from disk...

=== Bin Edges ===
Channel 0: [  0.   0.   2.   3.   4.   5.   7.  26. 144. 253. 255.]
Channel 1: [  0.   0.   1.   2.   2.   3.   5.  11.  56. 125. 255.]
Channel 2: [  0.   0.   1.   2.   3.   3.   5.  12.  60. 123. 255.]


In [8]:
def image_to_histogram(image, bin_edges):
    """Turn one image into a flat vector of per-channel pixel-value proportions.

    Args:
        image: array indexed as image[row, col, channel]; the first three channels are used.
        bin_edges: sequence indexable by channel, where bin_edges[c] are the histogram
            bin edges applied to channel c.

    Returns:
        1D array holding, for channels 0, 1 and 2 in that order, the fraction of that
        channel's pixels falling into each bin.
    """
    per_channel = []
    for channel in range(3):
        values = image[:, :, channel].flatten()
        counts, _ = np.histogram(values, bins=bin_edges[channel])
        # Divide by the pixel count so the result does not depend on image size
        per_channel.append(counts / len(values))
    return np.concatenate(per_channel)

# Convert every training image into its histogram feature vector (one row per image)
train_histograms = np.array([image_to_histogram(img, bin_edges) for img in train_images])
print("Created histograms from images")

# The model is trained on the histograms only, so release the image array loaded in the previous cell
del train_images
gc.collect()
print("train_images removed from memory.")

Created histograms from images
train_images removed from memory.


---

### Train CubeSatNet DNN model

We will define and train a Dense Neural Network (DNN) model.

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

# Fully connected classifier over the histogram features: two hidden ReLU layers and a
# 5-way softmax output. The input size is declared with an explicit Input layer rather
# than `input_shape=` on the first Dense layer, which Keras warns about.
model = Sequential([
    Input(shape=(train_histograms.shape[1],)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(5, activation='softmax')
])

# categorical_crossentropy expects one-hot targets, hence to_categorical below
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# NOTE(review): fit() receives no validation data, so early stopping monitors the
# *training* accuracy; it ends training after 3 epochs without improvement of that metric.
history = model.fit(
    train_histograms, to_categorical(train_labels, 5),
    epochs=30,
    batch_size=32,
    callbacks=[EarlyStopping(monitor='accuracy', patience=3)]
)

2025-03-22 04:08:24.240378: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-22 04:08:24.519307: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-22 04:08:24.621816: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-22 04:08:24.821834: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-22 04:08:24.865772: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-22 04:08:25.143257: I tensorflow/core/platform/cpu_feature_gu

Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m594/594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 952us/step - accuracy: 0.8421 - loss: 0.6291
Epoch 2/30
[1m594/594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9535 - loss: 0.1246
Epoch 3/30
[1m594/594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 846us/step - accuracy: 0.9633 - loss: 0.1010
Epoch 4/30
[1m594/594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 869us/step - accuracy: 0.9694 - loss: 0.0830
Epoch 5/30
[1m594/594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 871us/step - accuracy: 0.9716 - loss: 0.0785
Epoch 6/30
[1m594/594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 874us/step - accuracy: 0.9728 - loss: 0.0720
Epoch 7/30
[1m594/594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 915us/step - accuracy: 0.9767 - loss: 0.0625
Epoch 8/30
[1m594/594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 875us/step - accuracy: 0.9808 - loss: 0.0550
Epoch 9/30
[1m594/594[0m [32m━━━━━

##### **Saving the DNN model**

In [10]:
import os
import pickle

# open(..., 'wb') does not create missing parent folders, so make sure 'models/' exists first
os.makedirs('models', exist_ok=True)

# Later cells read this exact path back with pickle.load, so the format and filename are kept.
# NOTE(review): consider Keras' native model.save()/load_model() as a follow-up — pickle files
# should only ever be loaded from trusted sources.
with open('models/dnn_histogram_model.pkl', 'wb') as file:
    pickle.dump(model, file)

In [11]:
import gc

# Training is finished, so the training labels loaded earlier are no longer needed
del train_labels
gc.collect()
print("train_labels removed from memory.")

train_labels removed from memory.


---

### Deep learning: Validation set results

In [12]:
#TODO: evaluate loading the raw or SMOTEd data
# NOTE(review): 'data/val_*.npy' was concatenated into the combined dataset that the training
# split was drawn from, so the validation metrics below are presumably not an independent
# estimate — confirm and compare against the test-set results.
# Only the labels are loaded here: the next cell loads 'data/val_images.npy' itself right
# before converting the images to histograms, so loading the images in this cell as well
# would read the same large file twice.
val_labels = np.load('data/val_labels.npy')
# One-hot encode the 5 classes; the reporting cell converts back to class indices
val_labels = to_categorical(val_labels, num_classes=5)

In [13]:
# Quantile bin edges cached by the bin-edge cell
bin_edges = np.load('cached_data/bin_edges.npy')

# Restore the trained network saved earlier in the notebook
# (pickle.load executes code from the file: only load model files you trust)
with open('models/dnn_histogram_model.pkl', 'rb') as file:
    cnn_loaded_model = pickle.load(file)

# Raw validation images -> histogram feature matrix (one row per image)
val_images = np.load('data/val_images.npy')
val_histograms = np.array([image_to_histogram(image, bin_edges) for image in val_images])

# The images are not needed once the features exist
del val_images
gc.collect()

# Class probabilities for every validation sample
val_predictions = cnn_loaded_model.predict(val_histograms)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 875us/step


In [14]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Both arrays must be 1D vectors of class indices before they can be compared.
# A 2D array is taken to be one-hot labels / per-class probabilities and is collapsed
# to the index of the largest entry in each row.
if val_labels.ndim > 1:
    val_labels = val_labels.argmax(axis=1)

if val_predictions.ndim > 1:
    val_predictions = val_predictions.argmax(axis=1)

# Sanity check: the two shapes should match (one entry per validation sample)
print(val_predictions.shape)  # Should be (3237,)
print(val_labels.shape)       # Should be (3237,)

# Per-class precision / recall / F1 plus the overall averages
print("\nClassification Report:")
print(classification_report(val_labels, val_predictions))

(3237,)
(3237,)

Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       727
           1       1.00      1.00      1.00       222
           2       1.00      1.00      1.00       421
           3       1.00      0.95      0.98       721
           4       1.00      1.00      1.00      1146

    accuracy                           0.99      3237
   macro avg       0.99      0.99      0.99      3237
weighted avg       0.99      0.99      0.99      3237



In [15]:
import plotly.figure_factory as ff
import numpy as np
from sklearn.metrics import confusion_matrix

# Display names for class indices 0..4, in index order
class_names = ["Blurry", "Corrupt", "Missing_Data", "Noisy", "Priority"]

# Compute the confusion matrix (rows = true class, columns = predicted class).
# Passing the label list explicitly keeps the matrix at len(class_names) x len(class_names)
# even if some class is absent from both arrays, so it always lines up with the axis names.
cm = confusion_matrix(val_labels, val_predictions, labels=list(range(len(class_names))))

# Create the annotated heatmap (each cell shows its count)
fig = ff.create_annotated_heatmap(
    z=cm,
    x=class_names,
    y=class_names,
    colorscale="Blues",
    showscale=True
)

# Update layout for better readability. Heatmaps draw the first row at the bottom by
# default; reversing the y-axis puts class 0 at the top, as in a conventional confusion matrix.
fig.update_layout(
    title="Confusion Matrix with Class Names",
    xaxis=dict(title="Predicted Label"),
    yaxis=dict(title="True Label", autorange="reversed")
)

# Show the figure
fig.show()

---

## Testing

In [16]:
import numpy as np
from keras.utils import to_categorical
from sklearn.metrics import classification_report
import pickle
import gc
from source.evaluate import evaluate_pipeline

# Preprocessing hook meant for evaluate_pipeline (its call in test_models is still commented out).
def preprocessing_fn_DNN(X_test_raw):
    """
    Convert raw test images into the histogram features the DNN was trained on.

    Applies the same preprocessing that test_models performs inline: load the cached
    quantile bin edges and turn every image into its per-channel histogram vector.

    Args:
        X_test_raw: iterable of images, each indexable as image[row, col, channel].

    Returns:
        2D array with one histogram feature row per input image.

    TODO: confirm this matches the contract evaluate_pipeline expects of its
    preprocessing function before enabling that call.
    """
    bin_edges = np.load('cached_data/bin_edges.npy')
    return np.array([image_to_histogram(img, bin_edges) for img in X_test_raw])

def test_models():
    """Evaluate the saved histogram-based DNN on the held-out test files.

    Loads the test images/labels, the pickled model and the cached bin edges, converts
    the images to histogram features, predicts a class for each one and prints a
    classification report. Returns nothing.
    """
    # Test set; the labels are used as-is as the ground-truth classes
    test_images = np.load('data/test_images.npy')
    test_labels = np.load('data/test_labels.npy')
    true_classes = test_labels

    print("\nTesting Histogram-Based Model:")

    # Trained model (pickle.load executes code from the file: trusted files only)
    with open('models/dnn_histogram_model.pkl', 'rb') as model_file:
        hist_model = pickle.load(model_file)

    # Same bin edges that were used to build the training features
    bin_edges = np.load('cached_data/bin_edges.npy')

    # Images -> histogram features -> most probable class per sample
    test_histograms = np.array([image_to_histogram(image, bin_edges) for image in test_images])
    hist_pred_classes = hist_model.predict(test_histograms).argmax(axis=1)

    print("Histogram Model Report:")
    print(classification_report(true_classes, hist_pred_classes))

    #metrics = evaluate_pipeline(hist_model, test_images, test_labels, preprocessing_fn_DNN)
    

test_models()


Testing Histogram-Based Model:
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 918us/step
Histogram Model Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98       668
           1       1.00      1.00      1.00       213
           2       1.00      1.00      1.00       414
           3       1.00      0.96      0.98       721
           4       1.00      1.00      1.00      1221

    accuracy                           0.99      3237
   macro avg       0.99      0.99      0.99      3237
weighted avg       0.99      0.99      0.99      3237



---

In [None]:
# Wall-clock end of the notebook run
notebook_end_time = time.time()

# Seconds elapsed since notebook_start_time was recorded in the first code cell
elapsed_time = notebook_end_time - notebook_start_time

print(f"Total elapsed time: {elapsed_time:.6f} seconds")

Total elapsed time: 841.425561 seconds
