In [5]:
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Conv2D, BatchNormalization, MaxPooling2D, 
    Dense, Dropout, Reshape, Lambda, Embedding, 
    MultiHeadAttention, LayerNormalization, 
    GlobalAveragePooling1D
)
from tensorflow.keras.models import Model

import os
import time
import logging
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Logging configuration: ask the root logger to write INFO-and-above records,
# timestamped, to a log file in the Kaggle working directory.
log_dir = '/kaggle/working/'
log_file = os.path.join(log_dir, 'hybrid_model_training.log')

# The directory must exist before basicConfig() opens the log file.
os.makedirs(log_dir, exist_ok=True)

logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename=log_file,
)

class Patches(layers.Layer):
    """Split a batch of images into flattened, non-overlapping square patches.

    Args:
        patch_size: Side length of each square patch. It is used as both the
            window size and the stride, so neighbouring patches do not overlap.
        **kwargs: Standard Keras layer arguments (``name``, ``dtype``,
            ``trainable``, ...) forwarded to ``layers.Layer``. Accepting them
            is what allows the layer to be rebuilt from ``get_config()``.
    """

    def __init__(self, patch_size, **kwargs):
        super().__init__(**kwargs)
        self.patch_size = patch_size

    def call(self, images):
        """Extract patches and flatten each one into a vector.

        Args:
            images: 4-D tensor as required by ``tf.image.extract_patches``
                (batch first, channels last).

        Returns:
            Tensor of shape ``(batch, num_patches, patch_dims)`` where
            ``patch_dims`` is the static last dimension produced by
            ``tf.image.extract_patches``.
        """
        # The batch size is read dynamically so the layer works when it is
        # unknown at graph-construction time.
        batch_size = tf.shape(images)[0]
        patches = tf.image.extract_patches(
            images=images,
            sizes=[1, self.patch_size, self.patch_size, 1],
            strides=[1, self.patch_size, self.patch_size, 1],
            rates=[1, 1, 1, 1],
            padding="VALID",
        )
        patch_dims = patches.shape[-1]
        # Collapse the 2-D grid of patches into a single sequence axis.
        patches = tf.reshape(patches, [batch_size, -1, patch_dims])
        return patches

    def get_config(self):
        """Return the constructor arguments needed to re-create this layer."""
        config = super().get_config()
        config.update({"patch_size": self.patch_size})
        return config

class PatchEncoder(layers.Layer):
    """Project flattened patches linearly and add a learned position embedding.

    Args:
        num_patches: Number of patches per image; this is the size of the
            position-embedding table.
        projection_dim: Width of both the linear projection and the position
            embedding.
        **kwargs: Standard Keras layer arguments (``name``, ``dtype``,
            ``trainable``, ...) forwarded to ``layers.Layer``. Accepting them
            is what allows the layer to be rebuilt from ``get_config()``.
    """

    def __init__(self, num_patches, projection_dim, **kwargs):
        super().__init__(**kwargs)
        self.num_patches = num_patches
        # Stored only so that get_config() can report it.
        self.projection_dim = projection_dim
        self.projection = layers.Dense(units=projection_dim)
        self.position_embedding = layers.Embedding(
            input_dim=num_patches, output_dim=projection_dim
        )

    def call(self, patch):
        """Return ``projection(patch)`` plus the embedding of each position.

        The position embedding has shape ``(num_patches, projection_dim)`` and
        is broadcast over the leading (batch) axis of the projected patches.
        """
        positions = tf.range(start=0, limit=self.num_patches, delta=1)
        encoded = self.projection(patch) + self.position_embedding(positions)
        return encoded

    def get_config(self):
        """Return the constructor arguments needed to re-create this layer."""
        config = super().get_config()
        config.update(
            {
                "num_patches": self.num_patches,
                "projection_dim": self.projection_dim,
            }
        )
        return config

def create_hybrid_vit_model(input_shape=(128, 128, 3), patch_size=16, num_patches=64,
                            projection_dim=256, transformer_layers=4, num_heads=8,
                            dropout_rate=0.1):
    """Build a CNN + Transformer ("hybrid ViT") binary classifier.

    A small convolutional stem (two 2x2 max-pools, so the feature map is a
    quarter of the input height/width) feeds non-overlapping patches of the
    feature map into a stack of pre-norm Transformer encoder blocks. The
    sequence is then average-pooled and mapped to one sigmoid unit.

    Args:
        input_shape: Shape of one input image, ``(height, width, channels)``.
            Height and width must be fully defined.
        patch_size: Side length of the square patches cut from the CNN
            feature map (not from the raw image).
        num_patches: Kept for backward compatibility only. The value passed in
            is ignored; the real patch count is derived from the feature-map
            size and ``patch_size``.
        projection_dim: Width of the patch embedding and of every Transformer
            block.
        transformer_layers: Number of Transformer encoder blocks.
        num_heads: Attention heads per block; each head uses
            ``projection_dim // num_heads`` key dimensions.
        dropout_rate: Dropout rate used inside each MLP and before the
            classification layer.

    Returns:
        An uncompiled ``Model`` mapping images to a single sigmoid output.

    Raises:
        ValueError: If the feature map has an undefined spatial size or is
            smaller than one patch.
    """
    inputs = Input(shape=input_shape)

    # CNN feature extraction
    x = Conv2D(32, (3, 3), activation='relu', padding='same')(inputs)
    x = BatchNormalization()(x)
    x = MaxPooling2D((2, 2))(x)

    x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = MaxPooling2D((2, 2))(x)

    x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
    x = BatchNormalization()(x)

    feature_map = Conv2D(256, (3, 3), activation='relu', padding='same')(x)

    # Derive the real patch count from the feature map; the `num_patches`
    # argument is deliberately overridden (see docstring).
    feature_height, feature_width = feature_map.shape[1], feature_map.shape[2]
    if feature_height is None or feature_width is None:
        raise ValueError(
            "input_shape must define height and width so the number of "
            "patches can be computed"
        )
    num_patches = (feature_height // patch_size) * (feature_width // patch_size)
    if num_patches < 1:
        raise ValueError(
            f"patch_size={patch_size} is larger than the "
            f"{feature_height}x{feature_width} CNN feature map; "
            "no patches can be extracted"
        )

    # Patch extraction on the CNN feature map
    def extract_patches(feature_maps):
        return tf.image.extract_patches(
            images=feature_maps,
            sizes=[1, patch_size, patch_size, 1],
            strides=[1, patch_size, patch_size, 1],
            rates=[1, 1, 1, 1],
            padding="VALID"
        )

    patches = Lambda(extract_patches)(feature_map)

    # Flatten the patch grid into a sequence: (batch, num_patches, patch_dims)
    patch_dims = patches.shape[-1]
    patches = Reshape((-1, patch_dims))(patches)

    # Patch embedding + positional embedding.
    # Both live inside a layer so the position-embedding weights are tracked
    # by the model and trained. Calling Embedding on a concrete tf.range
    # tensor here would instead bake its random initial values into the graph
    # as a constant.
    x = PatchEncoder(num_patches, projection_dim)(patches)

    # Transformer blocks (pre-norm, residual connections)
    for _ in range(transformer_layers):
        # Layer normalization and multi-head self-attention
        x_norm = LayerNormalization(epsilon=1e-6)(x)
        attention_output = MultiHeadAttention(
            num_heads=num_heads,
            key_dim=projection_dim // num_heads
        )(x_norm, x_norm)
        x = x + attention_output

        # MLP
        mlp_norm = LayerNormalization(epsilon=1e-6)(x)
        mlp = Dense(projection_dim * 2, activation='gelu')(mlp_norm)
        mlp = Dropout(dropout_rate)(mlp)
        mlp = Dense(projection_dim)(mlp)
        x = x + mlp

    # Global average pooling over the patch sequence, then classification
    x = LayerNormalization(epsilon=1e-6)(x)
    x = GlobalAveragePooling1D()(x)
    x = Dropout(dropout_rate)(x)

    # Single sigmoid unit: binary classification
    outputs = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=inputs, outputs=outputs)
    return model

def load_and_preprocess_data(data_dir, target_size=(128, 128),
                             class_names=('NORMAL', 'PNEUMONIA')):
    """Load every image under ``data_dir/<class_name>/`` into memory.

    Args:
        data_dir: Directory containing one sub-directory per class.
        target_size: ``(height, width)`` every image is resized to.
        class_names: Sub-directory names; the position of a name in this
            sequence is the integer label given to its images.

    Returns:
        ``(X, y)``: ``X`` holds the images scaled to ``[0, 1]`` with shape
        ``(n, height, width, 3)``; ``y`` holds the ``n`` integer labels.
        When no image is found, both arrays are empty but correctly shaped.
    """
    images = []
    labels = []

    for label, class_name in enumerate(class_names):
        class_dir = os.path.join(data_dir, class_name)
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # sample order (and therefore any seeded split of it) reproducible.
        for img_name in sorted(os.listdir(class_dir)):
            img_path = os.path.join(class_dir, img_name)
            if not os.path.isfile(img_path):
                # Nested directories are not images; skip them rather than
                # letting the image loader fail on them.
                continue

            # Read, resize and scale pixel values to [0, 1]
            img = tf.keras.preprocessing.image.load_img(img_path, target_size=target_size)
            img_array = tf.keras.preprocessing.image.img_to_array(img)
            img_array = img_array / 255.0

            images.append(img_array)
            labels.append(label)

    if not images:
        # Keep the documented rank even when nothing was loaded.
        empty_images = np.empty((0, target_size[0], target_size[1], 3), dtype=np.float32)
        return empty_images, np.empty((0,), dtype=int)

    return np.array(images), np.array(labels)

def create_dynamic_validation_generator(X_train, y_train, batch_size=32):
    """Carve a validation split out of the training data and wrap both parts.

    The data is split 80/20 with a fixed seed, stratified on the labels.
    Only the training part is augmented; the validation part is served
    unchanged and in a fixed order.

    Returns:
        ``(train_generator, val_generator)``
    """
    fit_images, holdout_images, fit_labels, holdout_labels = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
    )

    # Random geometric augmentation, applied to training batches only.
    augmentation_settings = dict(
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest',
    )
    augmenting_datagen = ImageDataGenerator(**augmentation_settings)

    # Validation images pass through untouched.
    passthrough_datagen = ImageDataGenerator()

    return (
        augmenting_datagen.flow(
            fit_images, fit_labels, batch_size=batch_size, shuffle=True
        ),
        passthrough_datagen.flow(
            holdout_images, holdout_labels, batch_size=batch_size, shuffle=False
        ),
    )

def train_model(model, train_generator, validation_generator, epochs=30):
    """Compile ``model`` and fit it, returning the Keras training history.

    Uses AdamW with label-smoothed binary cross-entropy. Every callback
    watches the validation loss: early stopping (restoring the best weights),
    learning-rate reduction on plateau, and best-only checkpointing. A CSV
    logger writes per-epoch metrics into ``log_dir``.
    """
    model.compile(
        optimizer=tf.keras.optimizers.AdamW(learning_rate=3e-4, weight_decay=1e-4),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=0.1),
        metrics=["accuracy"],
    )

    watched_metric = "val_loss"

    stop_early = keras.callbacks.EarlyStopping(
        monitor=watched_metric, patience=5, restore_best_weights=True
    )
    shrink_lr = keras.callbacks.ReduceLROnPlateau(
        monitor=watched_metric, factor=0.2, patience=3, min_lr=1e-6
    )
    keep_best = keras.callbacks.ModelCheckpoint(
        'best_hybrid_model.keras', monitor=watched_metric, save_best_only=True
    )
    epoch_csv = keras.callbacks.CSVLogger(os.path.join(log_dir, 'vit_training.csv'))

    return model.fit(
        train_generator,
        validation_data=validation_generator,
        epochs=epochs,
        callbacks=[stop_early, shrink_lr, keep_best, epoch_csv],
        verbose=1,
    )

def main():
    """Train the hybrid model end to end and report validation metrics.

    Loads the training images, builds the train/validation generators,
    trains the model while timing the run, then scores the validation split
    (predictions thresholded at 0.5) and writes the results to both the log
    and stdout.
    """
    images, labels = load_and_preprocess_data(
        '/kaggle/input/chest-x-ray-images/FinalData/train'
    )
    train_generator, validation_generator = create_dynamic_validation_generator(
        images, labels
    )

    vit_model = create_hybrid_vit_model()

    # Time the training call only.
    started_at = time.time()
    train_model(vit_model, train_generator, validation_generator)
    execution_time = time.time() - started_at

    # Ground truth comes straight from the validation generator; predicted
    # probabilities are turned into hard 0/1 labels.
    y_true = validation_generator.y
    probabilities = vit_model.predict(validation_generator)
    y_pred = (probabilities > 0.5).astype("int32")

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    summary_lines = [
        f"Hybrid Model - Accuracy: {accuracy:.4f}",
        f"Hybrid Model - Precision: {precision:.4f}",
        f"Hybrid Model - Recall: {recall:.4f}",
        f"Hybrid Model - F1-score: {f1:.4f}",
        f"Hybrid Model - Execution Time: {execution_time:.2f} seconds",
    ]

    # Same lines go to the log first, then to stdout.
    for line in summary_lines:
        logging.info(line)
    for line in summary_lines:
        print(line)


if __name__ == "__main__":
    main()

Epoch 1/30


  self._warn_if_super_not_called()
I0000 00:00:1732724429.467881     145 service.cc:145] XLA service 0x793d7800f960 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1732724429.467940     145 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1732724429.467945     145 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5






[1m  2/433[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m24s[0m 56ms/step - accuracy: 0.6797 - loss: 1.3093   

I0000 00:00:1732724470.677346     145 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m281/433[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m17s[0m 113ms/step - accuracy: 0.7227 - loss: 0.6486




[1m432/433[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 166ms/step - accuracy: 0.7360 - loss: 0.6131




[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 193ms/step - accuracy: 0.7361 - loss: 0.6128 - val_accuracy: 0.7117 - val_loss: 0.8197 - learning_rate: 3.0000e-04
Epoch 2/30
[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 120ms/step - accuracy: 0.8155 - loss: 0.4714 - val_accuracy: 0.8091 - val_loss: 0.5151 - learning_rate: 3.0000e-04
Epoch 3/30
[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 115ms/step - accuracy: 0.8321 - loss: 0.4523 - val_accuracy: 0.6958 - val_loss: 0.6087 - learning_rate: 3.0000e-04
Epoch 4/30
[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 117ms/step - accuracy: 0.8396 - loss: 0.4416 - val_accuracy: 0.6831 - val_loss: 0.6571 - learning_rate: 3.0000e-04
Epoch 5/30
[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 117ms/step - accuracy: 0.8377 - loss: 0.4423 - val_accuracy: 0.7955 - val_loss: 0.5076 - learning_rate: 3.0000e-04
Epoch 6/30
[1m433/433[0m [32m━━━━━━━━━━━━━