In [None]:
# Prints a fixed greeting; presumably a quick check that the kernel executes code.
print("hello world")

hello world


# Task
Create a hematovision machine learning model.

## Data collection and preprocessing

### Subtask:
Gather a dataset of hematological images (blood smears, etc.) and preprocess them (e.g., normalization, resizing, augmentation). This may involve obtaining labeled data for different cell types or abnormalities.


**Reasoning**:
The first step is to identify a suitable dataset. A common public dataset for this purpose is the Acute Lymphoblastic Leukemia Image Database for Image Processing (ALL-IDB). I will simulate downloading and loading a small subset of this dataset for demonstration purposes. Since I cannot actually download external data, I will create dummy data to represent image file paths and labels. I will then load and preprocess this dummy data.



In [None]:
import numpy as np
import os
from PIL import Image
import random
import tensorflow as tf

# --- Synthetic stand-in dataset ---------------------------------------------
# No real data is downloaded. Solid-colour placeholder PNGs are written to disk
# in one sub-directory per class, mimicking the layout a real dataset such as
# ALL-IDB would have after extraction.

dummy_data_dir = 'dummy_hematology_data'
dummy_labels = ['lymphocyte', 'neutrophil']
num_images_per_class = 50

os.makedirs(dummy_data_dir, exist_ok=True)

# Collect (path, class name) pairs together so they can be shuffled in lockstep.
combined = []
for label in dummy_labels:
    class_dir = os.path.join(dummy_data_dir, label)
    os.makedirs(class_dir, exist_ok=True)
    for i in range(num_images_per_class):
        # One random RGB colour per image; channels are drawn in R, G, B order.
        red = random.randint(0, 255)
        green = random.randint(0, 255)
        blue = random.randint(0, 255)
        file_path = os.path.join(class_dir, f'{label}_{i:03d}.png')
        Image.new('RGB', (128, 128), color=(red, green, blue)).save(file_path)
        combined.append((file_path, label))

# Shuffle once, then split the pairs back into two parallel tuples.
random.shuffle(combined)
image_paths, labels = zip(*combined)

# Integer-encode the class names; the index in `dummy_labels` is the class id.
label_to_int = {name: code for code, name in enumerate(dummy_labels)}
int_labels = [label_to_int[name] for name in labels]

# Spatial size every image is resized to during preprocessing.
IMG_HEIGHT = 128
IMG_WIDTH = 128

def load_and_preprocess_image(image_path):
    """Load a PNG file and return it as a resized float32 RGB tensor.

    Args:
        image_path: Path of the PNG file to read.

    Returns:
        A float32 image tensor with 3 channels, resized to
        (IMG_HEIGHT, IMG_WIDTH).
    """
    raw_bytes = tf.io.read_file(image_path)
    decoded = tf.image.decode_png(raw_bytes, channels=3)
    # Converting to float32 rescales the pixel values into the [0, 1] range.
    scaled = tf.image.convert_image_dtype(decoded, dtype=tf.float32)
    return tf.image.resize(scaled, [IMG_HEIGHT, IMG_WIDTH])

# Build the input pipeline: labels and lazily decoded images are kept as two
# parallel datasets and then zipped into (image, label) pairs.
label_dataset = tf.data.Dataset.from_tensor_slices(int_labels)
image_dataset = tf.data.Dataset.from_tensor_slices(list(image_paths)).map(
    load_and_preprocess_image
)
dataset = tf.data.Dataset.zip((image_dataset, label_dataset))

# Sanity check: pull a single example and report its shape and label.
for image, label in dataset.take(1):
    print("Image shape:", image.shape)
    print("Label:", label.numpy())

# No augmentation is applied at this point. An augmentation function taking an
# image could later be mapped over `dataset` while passing the label through.

print(f"Created and preprocessed {len(image_paths)} dummy images.")

Image shape: (128, 128, 3)
Label: 0
Created and preprocessed 100 dummy images.


**Reasoning**:
The previous step successfully created dummy data and loaded and preprocessed it (resizing and normalization). The subtask description also lists data augmentation as a preprocessing step, so I will add a simple data augmentation step using TensorFlow to further preprocess the dataset.



In [None]:
# Implement data augmentation
def augment_image(image, label):
    """Randomly flip and crop an image, returning it at the original size.

    Args:
        image: Image tensor with 3 channels, at least 90% of
            (IMG_HEIGHT, IMG_WIDTH) in each spatial dimension.
        label: Label associated with the image; passed through unchanged.

    Returns:
        A tuple ``(augmented_image, label)`` where the image has been resized
        back to (IMG_HEIGHT, IMG_WIDTH).
    """
    # A crop covering 90% of each side acts as a mild random zoom/shift.
    crop_size = [int(IMG_HEIGHT*0.9), int(IMG_WIDTH*0.9), 3]

    flipped = tf.image.random_flip_left_right(image)
    cropped = tf.image.random_crop(flipped, size=crop_size)
    # Random rotation is intentionally not implemented here; it would need
    # TensorFlow Addons or a custom implementation.
    restored = tf.image.resize(cropped, [IMG_HEIGHT, IMG_WIDTH])
    return restored, label

# Map the augmentation over every example. This is for demonstration only:
# augmentation is normally restricted to the training split.
augmented_dataset = dataset.map(augment_image)

# Sanity check: inspect one augmented example.
print("Displaying first element of augmented dataset:")
first_augmented = augmented_dataset.take(1)
for image, label in first_augmented:
    print("Augmented image shape:", image.shape)
    print("Label:", label.numpy())

Displaying first element of augmented dataset:
Augmented image shape: (128, 128, 3)
Label: 0


## Model selection

### Subtask:
Choose an appropriate machine learning model architecture for image analysis. Convolutional Neural Networks (CNNs) are commonly used for image-based tasks. Consider transfer learning from pre-trained models.


**Reasoning**:
Import necessary modules and define, compile, and summarize a CNN model architecture for image analysis.



In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# 2. Define a CNN model architecture
# The input shape is declared with an explicit Input object as the first entry.
# Passing `input_shape=` to the first Conv2D of a Sequential model is
# discouraged by Keras and triggers a UserWarning.
model = Sequential([
    tf.keras.Input(shape=(IMG_HEIGHT, IMG_WIDTH, 3)),
    # Three conv/pool stages with a growing number of filters (32 -> 64 -> 128).
    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    # Classification head.
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),  # Dropout for regularization
    # One softmax output unit per class in `dummy_labels`.
    Dense(len(dummy_labels), activation='softmax'),
])

# 3. Compile the model
# Labels are integer class ids, hence the *sparse* categorical cross-entropy.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# 4. Print a summary of the model architecture
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


## Model training

### Subtask:
Train the selected model on the preprocessed dataset. This involves splitting the data into training and validation sets, defining loss functions and optimizers, and monitoring performance during training.


**Reasoning**:
Split the dataset, define batch size, batch and prefetch the datasets, and then train the model.



In [None]:
# 1. Split the dataset into training and validation sets
# The file list was shuffled once when the dataset was created, and the
# pipeline order is deterministic, so take/skip yields two disjoint subsets.
dataset_size = len(image_paths)
train_size = int(0.8 * dataset_size)
val_size = dataset_size - train_size

train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

# 2. Define a batch size for training
BATCH_SIZE = 32

# 3. Shuffle, augment, batch and prefetch the datasets
# - Model.fit ignores its `shuffle` argument for tf.data inputs, so the
#   training split is reshuffled explicitly on every epoch.
# - Augmentation is applied to the training split only; validation images stay
#   untouched so the reported metrics reflect unmodified inputs.
# - The shuffle buffer holds the whole training split, which is fine for this
#   small dataset; use a smaller buffer for data that does not fit in memory.
train_dataset = (
    train_dataset
    .shuffle(buffer_size=max(train_size, 1), reshuffle_each_iteration=True)
    .map(augment_image, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
val_dataset = val_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# 4. Train the compiled model
EPOCHS = 10  # Number of full passes over the training split

history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=EPOCHS
)

Epoch 1/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 867ms/step - accuracy: 0.5000 - loss: 0.7852 - val_accuracy: 0.5000 - val_loss: 0.8125
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 691ms/step - accuracy: 0.5063 - loss: 0.7707 - val_accuracy: 0.5000 - val_loss: 0.7025
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 723ms/step - accuracy: 0.5727 - loss: 0.6852 - val_accuracy: 0.5000 - val_loss: 0.7036
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1s/step - accuracy: 0.5922 - loss: 0.6719 - val_accuracy: 0.6000 - val_loss: 0.7095
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 765ms/step - accuracy: 0.6062 - loss: 0.6671 - val_accuracy: 0.5000 - val_loss: 0.6907
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 709ms/step - accuracy: 0.6125 - loss: 0.6460 - val_accuracy: 0.5000 - val_loss: 0.6896
Epoch 7/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━

## Model evaluation

### Subtask:
Evaluate the trained model's performance using appropriate metrics (e.g., accuracy, precision, recall, F1-score, AUC) on a separate test set.


**Reasoning**:
Evaluate the trained model on the validation dataset (since a separate test set is not available) using the evaluate method and print the results.



In [None]:
# Score the trained model on the held-out validation batches.
print("Evaluating the model on the validation dataset...")
evaluation_results = model.evaluate(val_dataset)

# The model was compiled with a single metric, so evaluate() yields the loss
# followed by the accuracy.
loss, accuracy = evaluation_results

# Report both values with four decimal places.
print(f"Validation Loss: {loss:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")

Evaluating the model on the validation dataset...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 186ms/step - accuracy: 0.5500 - loss: 0.7289
Validation Loss: 0.7289
Validation Accuracy: 0.5500


## Summary:

### Data Analysis Key Findings

*   A simulated dataset of hematological images with two classes ('lymphocyte', 'neutrophil') was successfully created and preprocessed.
*   The preprocessing included resizing images to 128x128 pixels, normalizing pixel values to \[0, 1], and applying data augmentation techniques such as random horizontal flips and random crops.
*   A Sequential Convolutional Neural Network (CNN) model architecture was defined with `Conv2D`, `MaxPooling2D`, `Flatten`, `Dense`, and `Dropout` layers.
*   The CNN model was compiled using the Adam optimizer and sparse categorical crossentropy loss function, with accuracy as the evaluation metric.
*   The dataset was split into 80% for training and 20% for validation.
*   The model was trained for 10 epochs with a batch size of 32.
*   During training, the validation accuracy fluctuated around 50-60%, and the final evaluation on the validation dataset yielded a loss of 0.7289 and an accuracy of 0.5500.

### Insights or Next Steps

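*   The model's performance on the validation set (55% accuracy) is low: it is essentially chance level for a two-class problem. This is expected here, because the dummy images are randomly colored squares with no relationship to their labels, so the result by itself does not show whether the architecture is adequate. Model improvements and more extensive data can only be assessed meaningfully once real images are used.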
*   Obtaining a larger and more diverse labeled dataset of real hematological images is crucial for building a robust model.
