In [1]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split, KFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import os
from tensorflow.keras.callbacks import LearningRateScheduler, ModelCheckpoint
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.resnet50 import preprocess_input
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [2]:
import os
import shutil
import random
from sklearn.model_selection import train_test_split


random_seed = 42
random.seed(random_seed)


source_folder = '/kaggle/input/microscopic-peripheral-blood-cell-images/PBC_dataset_normal_DIB'
destination_folder = 'data'
train_ratio = 0.01 
test_size = 4000   

os.makedirs(os.path.join(destination_folder, 'train'), exist_ok=True)
os.makedirs(os.path.join(destination_folder, 'val'), exist_ok=True)
os.makedirs(os.path.join(destination_folder, 'test'), exist_ok=True)

total_images = []
class_images = {}

for class_name in os.listdir(source_folder):
    class_folder = os.path.join(source_folder, class_name)
    
    if os.path.isdir(class_folder):
        images = os.listdir(class_folder)
        total_images.extend([(class_name, img) for img in images])
        class_images[class_name] = images

random.shuffle(total_images)

train_count = max(1, int(len(total_images) * train_ratio))
train_images = total_images[:train_count]
remaining_images = total_images[train_count:]

test_images = remaining_images[:test_size]
val_images = remaining_images[test_size:]

train_class_images = {class_name: [] for class_name in class_images.keys()}
test_class_images = {class_name: [] for class_name in class_images.keys()}
val_class_images = {class_name: [] for class_name in class_images.keys()}

for class_name, img in train_images:
    train_class_images[class_name].append(img)

for class_name, img in test_images:
    test_class_images[class_name].append(img)

for class_name, img in val_images:
    val_class_images[class_name].append(img)

for class_name, images in train_class_images.items():
    class_folder = os.path.join(source_folder, class_name)
    train_dest = os.path.join(destination_folder, 'train', class_name)
    os.makedirs(train_dest, exist_ok=True)
    for img in images:
        shutil.copy(os.path.join(class_folder, img), os.path.join(train_dest, img))

for class_name, images in val_class_images.items():
    class_folder = os.path.join(source_folder, class_name)
    val_dest = os.path.join(destination_folder, 'val', class_name)
    os.makedirs(val_dest, exist_ok=True)
    for img in images:
        shutil.copy(os.path.join(class_folder, img), os.path.join(val_dest, img))

for class_name, images in test_class_images.items():
    class_folder = os.path.join(source_folder, class_name)
    test_dest = os.path.join(destination_folder, 'test', class_name)
    os.makedirs(test_dest, exist_ok=True)
    for img in images:
        shutil.copy(os.path.join(class_folder, img), os.path.join(test_dest, img))

print("Data successfully split into train (1%), val, and test (4000 images) folders with seed", random_seed)

Data successfully split into train (1%), val, and test (4000 images) folders with seed 42


In [3]:
import os

def filter_invalid_files(directory):
    for subdir, dirs, files in os.walk(directory):
        for file in files:
            if file.startswith('.'):
                print(f"Removing invalid file: {file}")
                os.remove(os.path.join(subdir, file))

filter_invalid_files('/kaggle/working/data/train')
filter_invalid_files('/kaggle/working/data/test')
filter_invalid_files('/kaggle/working/data/val')

Removing invalid file: .DS_169665.jpg


In [4]:
train_dir = '/kaggle/working/data/train'
val_dir = '/kaggle/working/data/val'
test_dir = '/kaggle/working/data/test'


img_height, img_width = 224, 224
batch_size = 32


train_datagen = ImageDataGenerator(
    rescale=1./255,
    zoom_range=0.2,
    horizontal_flip=True
)

val_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='categorical'
)

val_generator = val_datagen.flow_from_directory(
    val_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='categorical'
)

test_generator = test_datagen.flow_from_directory(
    test_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='categorical'
)


Found 170 images belonging to 8 classes.
Found 12922 images belonging to 8 classes.
Found 4000 images belonging to 8 classes.


In [5]:
from tensorflow.keras.applications import MobileNetV2

base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

def extract_features(generator, base_model):
    features = base_model.predict(generator)
    labels = generator.classes
    return features, labels

train_features, train_labels = extract_features(train_generator, base_model)
val_features, val_labels = extract_features(val_generator, base_model)
test_features, test_labels = extract_features(test_generator, base_model)

# Flatten the features for traditional classifiers
train_features = train_features.reshape(train_features.shape[0], -1)
val_features = val_features.reshape(val_features.shape[0], -1)
test_features = test_features.reshape(test_features.shape[0], -1)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


  self._warn_if_super_not_called()
I0000 00:00:1726234171.318574      90 service.cc:145] XLA service 0x7a2f80003590 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1726234171.318627      90 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1726234171.318633      90 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5


[1m1/6[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m34s[0m 7s/step

I0000 00:00:1726234176.607150      90 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 1s/step
[1m404/404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 69ms/step
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 57ms/step


In [6]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

base_learners = [
    ('svm', SVC(probability=True)),
    ('knn', KNeighborsClassifier()),
    ('rf', RandomForestClassifier()),
#     ('xgboost', XGBClassifier(eval_metric='mlogloss', use_label_encoder=False))
]


stack_model = StackingClassifier(
    estimators=base_learners, 
    final_estimator=LogisticRegression()
)

stack_model.fit(train_features, train_labels)

test_predictions = stack_model.predict(test_features)
test_accuracy = accuracy_score(test_labels, test_predictions)

print(f"Test Accuracy: {test_accuracy:.4f}")
print(classification_report(test_labels, test_predictions))


Test Accuracy: 0.1810
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       271
           1       0.17      0.21      0.19       732
           2       0.00      0.00      0.00       363
           3       0.17      0.17      0.17       691
           4       0.00      0.00      0.00       266
           5       0.00      0.00      0.00       350
           6       0.19      0.59      0.29       757
           7       0.00      0.00      0.00       570

    accuracy                           0.18      4000
   macro avg       0.07      0.12      0.08      4000
weighted avg       0.10      0.18      0.12      4000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
