In [None]:
import os
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model
from tqdm.notebook import tqdm  # Progress bar for notebooks

# Configuration
# Spatial size handed to tf.image.resize for every decoded image.
TARGET_SIZE = (224, 224)
BATCH_SIZE = 16  # Default batch size of build_dataset; kept small for memory safety
# Passed to Dataset.map / Dataset.prefetch so tf.data tunes parallelism itself.
AUTOTUNE = tf.data.AUTOTUNE

# Custom preprocessing function
def preprocess_image(image_path):
    """Load one image file and return a CLAHE-enhanced, normalised Lab tensor.

    The file is decoded to RGB, resized to ``TARGET_SIZE``, converted to
    OpenCV's 8-bit Lab representation, contrast-equalised on the L channel
    with CLAHE and finally scaled to ``[0, 1]``.

    Args:
        image_path: Scalar string (or string tensor) with the image location.

    Returns:
        A ``tf.float32`` tensor of static shape ``TARGET_SIZE + (3,)`` holding
        the L, a and b channels, each in ``[0, 1]``.
    """
    # Decode and resize in TensorFlow. expand_animations=False keeps the
    # result rank-3, which tf.image.resize requires.
    raw_bytes = tf.io.read_file(image_path)
    image = tf.image.decode_image(raw_bytes, channels=3, expand_animations=False)
    image = tf.image.resize(image, TARGET_SIZE)

    # All OpenCV work happens inside a single NumPy call.
    lab_image_clahe = tf.numpy_function(_clahe_lab, [image], tf.float32)

    # tf.numpy_function drops static shape information; restore it so that
    # batching and Keras layers downstream see a fully defined shape.
    lab_image_clahe.set_shape((TARGET_SIZE[0], TARGET_SIZE[1], 3))
    return lab_image_clahe


def _clahe_lab(rgb):
    """Convert an RGB array to CLAHE-equalised Lab scaled to [0, 1] (NumPy/OpenCV only).

    Args:
        rgb: ``numpy.ndarray`` of shape (H, W, 3) with values on a 0..255 scale.
            tf.numpy_function passes plain NumPy arrays, so no ``.numpy()``
            call is needed (or available) here.

    Returns:
        ``numpy.ndarray`` of dtype float32 and shape (H, W, 3).
    """
    # tf.image.resize yields float32; OpenCV's 8-bit Lab path needs uint8.
    rgb_u8 = np.clip(np.rint(rgb), 0, 255).astype(np.uint8)
    lab = cv2.cvtColor(rgb_u8, cv2.COLOR_RGB2LAB)

    # A fresh CLAHE object per call avoids sharing one instance between the
    # parallel map invocations issued by tf.data.
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    lab[..., 0] = clahe.apply(np.ascontiguousarray(lab[..., 0]))

    # For 8-bit images OpenCV already stores L as L*255/100 and a/b as
    # value+128, so every channel spans 0..255 and one division normalises all.
    return lab.astype(np.float32) / np.float32(255.0)

def build_dataset(image_dir, batch_size=BATCH_SIZE, shuffle=True):
    """Build a batched tf.data pipeline of preprocessed images from a directory.

    Only files matching ``*.jpg`` directly inside ``image_dir`` are used.
    Each element of the returned dataset is a batch of images produced by
    ``preprocess_image``; no labels are attached.

    Args:
        image_dir: Directory containing the ``.jpg`` files.
        batch_size: Number of images per batch.
        shuffle: Whether the file listing is shuffled. Defaults to ``True``
            (the previous, hard-coded behaviour); pass ``False`` for
            validation/test data where a stable file order matters.

    Returns:
        A ``tf.data.Dataset`` yielding image batches.
    """
    pattern = os.path.join(image_dir, "*.jpg")
    image_files = tf.data.Dataset.list_files(pattern, shuffle=shuffle)

    # Decode + preprocess in parallel, then batch and overlap with the consumer.
    dataset = image_files.map(preprocess_image, num_parallel_calls=AUTOTUNE)
    return dataset.batch(batch_size).prefetch(AUTOTUNE)


In [None]:
def build_mobile_model(input_shape=(224, 224, 3), num_classes=5, weights=None):
    """Build and compile a small MobileNetV3-based image classifier.

    Args:
        input_shape: Shape of one input image, ``(height, width, channels)``.
        num_classes: Number of output classes of the softmax head.
        weights: Forwarded to ``MobileNetV3Small``. ``None`` (default) means
            random initialisation; e.g. ``"imagenet"`` loads pretrained
            weights.

    Returns:
        A compiled ``tf.keras.Model`` using Adam, sparse categorical
        cross-entropy and an accuracy metric.
    """
    base_model = tf.keras.applications.MobileNetV3Small(
        input_shape=input_shape,
        include_top=False,
        weights=weights,
        # The input pipeline already delivers values scaled to [0, 1]; the
        # built-in rescaling layer assumes 0..255 pixels and would squash
        # them into a tiny range, so it is switched off.
        include_preprocessing=False,
    )

    # Freezing only helps when there are pretrained features worth keeping.
    # A frozen, randomly initialised backbone could never learn anything.
    freeze_backbone = weights is not None
    base_model.trainable = not freeze_backbone

    inputs = layers.Input(shape=input_shape)
    if freeze_backbone:
        # Keep BatchNorm statistics of the pretrained backbone untouched.
        x = base_model(inputs, training=False)
    else:
        x = base_model(inputs)
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dense(32, activation='relu')(x)  # Small head keeps the model light
    outputs = layers.Dense(num_classes, activation='softmax')(x)

    model = Model(inputs, outputs)
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

In [None]:
def train_model(train_dir="/home/sala/data/general/train/",
                val_dir="/home/sala/data/general/test/",
                epochs=20):
    """Train the MobileNetV3 classifier with early stopping.

    Args:
        train_dir: Directory handed to ``build_dataset`` for training data.
        val_dir: Directory handed to ``build_dataset`` for validation data.
        epochs: Upper bound on the number of training epochs.

    Returns:
        The trained ``tf.keras.Model`` (best weights restored by early
        stopping).

    Raises:
        ValueError: If a dataset does not yield ``(image, label)`` batches.
            The sparse categorical cross-entropy loss cannot be computed
            without labels.
    """
    train_ds = build_dataset(train_dir)
    val_ds = build_dataset(val_dir)

    # Fail early with a clear message instead of an opaque unpacking or loss
    # error further down.
    for split_name, split_ds in (("training", train_ds), ("validation", val_ds)):
        spec = split_ds.element_spec
        if not (isinstance(spec, tuple) and len(spec) >= 2):
            raise ValueError(
                f"The {split_name} dataset yields images only; train_model "
                "needs (image, label) batches. Attach labels in build_dataset "
                "before training."
            )

    # Sanity check: show the shape of one batch and its labels.
    for batch in train_ds.take(1):
        img, label = batch[0], batch[1]
        print(f"Image shape: {img.shape}, Label: {label}")

    model = build_mobile_model()

    # Stop once validation loss stops improving and roll back to the best epoch.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True
    )

    model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=epochs,
        callbacks=[early_stopping]
    )

    return model

In [None]:
from multiprocessing import Pool
import psutil  # For memory monitoring

def generate_pseudo_labels(unlabeled_dir, model, output_dir="pseudo_labels",
                           chunk_size=100, batch_size=8):
    """Predict class probabilities for unlabeled images and save them to disk.

    For every image file directly inside ``unlabeled_dir`` the model's output
    vector is written to ``<output_dir>/<image file name>.npy``.

    Args:
        unlabeled_dir: Directory containing the unlabeled images.
        model: Object with a Keras-style ``predict`` method.
        output_dir: Directory receiving the ``.npy`` files; created if missing.
        chunk_size: Number of images predicted and written per progress step,
            which bounds how many predictions are held in memory at once.
        batch_size: Batch size used for prediction.

    Returns:
        None. Results are written to disk.
    """
    # Formats tf.image.decode_image can read; everything else (sub-directories,
    # stray files) is skipped instead of crashing the decoder.
    image_exts = (".jpg", ".jpeg", ".png", ".bmp", ".gif")
    image_paths = sorted(
        os.path.join(unlabeled_dir, name)
        for name in os.listdir(unlabeled_dir)
        if name.lower().endswith(image_exts)
        and os.path.isfile(os.path.join(unlabeled_dir, name))
    )

    # np.save does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)

    chunks = [
        image_paths[start:start + chunk_size]
        for start in range(0, len(image_paths), chunk_size)
    ]

    for chunk in tqdm(chunks, desc="Generating Pseudo-Labels"):
        # tf.data parallelises preprocessing inside this process. Running
        # TensorFlow ops through multiprocessing.Pool would require forking an
        # initialised TF runtime and pickling tensors back to the parent.
        chunk_ds = (
            tf.data.Dataset.from_tensor_slices(chunk)
            .map(preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)
            .batch(batch_size)
            .prefetch(tf.data.AUTOTUNE)
        )
        pseudo_labels = model.predict(chunk_ds, verbose=0)

        # Persist immediately so predictions never accumulate in memory.
        for path, label in zip(chunk, pseudo_labels):
            np.save(os.path.join(output_dir, f"{os.path.basename(path)}.npy"), label)

In [None]:
def evaluate_model(model, test_dir, ground_truth_labels=None):
    """Compute and print top-1 accuracy on the ``.jpg`` images of a directory.

    Args:
        model: Object with a Keras-style ``predict`` method.
        test_dir: Directory containing the test ``.jpg`` files.
        ground_truth_labels: Integer class ids, one per image, ordered like
            the alphabetically sorted file names in ``test_dir``. When
            omitted, a module-level ``ground_truth_labels`` is used if one
            exists (the behaviour the function previously relied on).

    Returns:
        The accuracy as a float in ``[0, 1]``.

    Raises:
        ValueError: If no labels are available, no images are found, or the
            number of labels differs from the number of images.
    """
    if ground_truth_labels is None:
        ground_truth_labels = globals().get("ground_truth_labels")
    if ground_truth_labels is None:
        raise ValueError(
            "evaluate_model needs ground_truth_labels (pass them as an argument)."
        )
    labels = np.asarray(ground_truth_labels).reshape(-1)

    # build_dataset lists files in shuffled order, which would make it
    # impossible to line predictions up with labels. Use a sorted listing.
    image_paths = sorted(
        os.path.join(test_dir, name)
        for name in os.listdir(test_dir)
        if name.endswith(".jpg")
    )
    if not image_paths:
        raise ValueError(f"No .jpg images found in {test_dir!r}.")
    if len(image_paths) != labels.size:
        raise ValueError(
            f"Got {labels.size} labels for {len(image_paths)} images in {test_dir!r}."
        )

    test_ds = (
        tf.data.Dataset.from_tensor_slices(image_paths)
        .map(preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)
        .batch(8)
        .prefetch(tf.data.AUTOTUNE)
    )

    # Predict batch by batch to keep peak memory low.
    batch_preds = []
    for batch in tqdm(test_ds, desc="Evaluating Model"):
        batch_preds.append(model.predict(batch, verbose=0))
    predictions = np.concatenate(batch_preds, axis=0)

    accuracy = float(np.mean(np.argmax(predictions, axis=1) == labels))
    print(f"Test Accuracy: {accuracy*100:.2f}%")
    return accuracy

In [None]:
if __name__ == "__main__":
    # Step 1: fit the classifier; `model` stays bound for the steps below.
    model = train_model()

    # Step 2: run the trained model over the unlabeled images and store the
    # predictions on disk.
    generate_pseudo_labels(
        unlabeled_dir="/home/sala/data/soil_data/test/Black Soil",
        model=model,
    )

    # Step 3 (currently disabled): evaluation on the held-out set.
    # evaluate_model(model, "soil/test")