**Cloud-Optimized Image Feature Extraction with PCA:** 

I used Google Cloud to process and analyze large-scale image data while managing memory efficiently. Here’s how I approached the task:

* Feature Extraction:

Extracted features like Pixel Intensity, Color Histogram, HOG, and CNN Intermediate Layer from resized images (50x50).

* Incremental PCA:

Applied Incremental PCA to reduce dimensions in batches, ensuring memory efficiency. Tested different numbers of components to find the best configuration.

* Google Cloud Usage:

Leveraged cloud computing resources for processing and cloud storage for managing large datasets and intermediate results.

In [None]:
# --- Cloud-Optimized Image Feature Extraction with PCA ---
import os
import re
import gc
import pickle
import numpy as np
import cv2
from os.path import isfile, join
from tqdm import tqdm
from sklearn.decomposition import IncrementalPCA
import matplotlib
matplotlib.use('Agg')  # Use non-interactive backend for cloud
import matplotlib.pyplot as plt

# --- Tunable settings for memory-constrained (cloud) runs ---

# Fraction of system memory above which garbage collection is triggered (80%).
MAX_MEMORY_USAGE = 0.8

# Rows per processing batch (reduced from 1000 for cloud environments).
BATCH_SIZE = 500

# Size every image is resized to before feature extraction (reduced image size).
RESIZE_DIM = (50, 50)

# Bins per channel for colour histograms (reduced from 256).
HIST_BINS = 32

def get_memory_usage():
    """Return the share of system memory in use, as `percent / 100`."""
    # psutil is imported on demand, at the moment a reading is requested.
    import psutil

    percent_used = psutil.virtual_memory().percent
    return percent_used / 100

def safe_image_read(file_path):
    """Load an image from disk as RGB, resized to RESIZE_DIM.

    Any failure (unreadable file, conversion or resize error) is reported on
    stdout and signalled to the caller by returning None instead of raising.
    """
    try:
        bgr = cv2.imread(file_path)
        # cv2.imread signals failure by returning None rather than raising.
        if bgr is None:
            raise ValueError(f"Could not read image: {file_path}")
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        resized = cv2.resize(rgb, RESIZE_DIM)
    except Exception as exc:
        print(f"Error processing {file_path}: {exc}")
        return None
    return resized

def extract_color_histogram(image, bins=HIST_BINS):
    """Build a colour-histogram feature vector from a 3-channel image.

    A `bins`-bin histogram over the value range [0, 256) is computed for each
    of the three channels, normalised with cv2.normalize, and the three
    results are concatenated into one 1-D array of length 3 * bins.
    """
    channel_hists = []
    # One channel at a time, so only a single histogram is in flight.
    for channel in (0, 1, 2):
        counts = cv2.calcHist([image], [channel], None, [bins], [0, 256])
        channel_hists.append(cv2.normalize(counts, counts).flatten())
        # Ask the garbage collector to run when memory use is above target.
        if get_memory_usage() > MAX_MEMORY_USAGE:
            gc.collect()
    return np.concatenate(channel_hists)

def extract_hog_features(image):
    """Return the HOG descriptor of an RGB image as a flat feature vector.

    The image is converted to grayscale and described with 8x8-pixel cells
    grouped into 2x2-cell blocks.
    """
    # `hog` was never imported at module level, so the original call raised
    # NameError. Import it here to keep the function self-contained.
    from skimage.feature import hog

    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    return hog(
        gray,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        feature_vector=True,
    )

class CNNFE:
    """Lazily loaded VGG16 feature extractor.

    TensorFlow and the ImageNet weights are only loaded on the first call to
    `extract`, so runs that never request CNN features do not pay for them.
    """

    # Keras model, created on first use and shared by all later calls.
    _model = None
    # VGG16 `preprocess_input`, stored next to the model. The original kept it
    # as a local name bound only while the model was being built, so every
    # call after the first failed with UnboundLocalError.
    _preprocess = None

    @classmethod
    def _load(cls):
        """Build the VGG16 convolutional base and remember its preprocessor."""
        from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
        from tensorflow.keras.models import Model

        base_model = VGG16(weights='imagenet', include_top=False,
                           input_shape=(*RESIZE_DIM, 3))
        cls._model = Model(inputs=base_model.input, outputs=base_model.output)
        cls._preprocess = staticmethod(preprocess_input)

    @classmethod
    def extract(cls, image):
        """Return the flattened model output for a single image.

        Args:
            image: one image array; a batch axis is added before prediction.

        Returns:
            1-D array holding the flattened output of the network.
        """
        if cls._model is None:
            cls._load()

        batch = np.expand_dims(image, axis=0)
        # verbose=0 suppresses the per-call Keras progress bar, which would
        # otherwise be printed once per image.
        return cls._model.predict(cls._preprocess(batch), verbose=0).flatten()

def _extract_features(img, feature_type):
    """Compute one feature vector for an RGB image with the named extractor."""
    if feature_type == 'pixel_intensity':
        return img.flatten() / 255.0
    if feature_type == 'color_histogram':
        return extract_color_histogram(img)
    if feature_type == 'hog':
        return extract_hog_features(img)
    if feature_type == 'cnn':
        return CNNFE.extract(img)
    raise ValueError(f"Invalid feature type: {feature_type}")


def process_batch(batch_files, folder_path, feature_type):
    """Extract features and labels for a batch of image files in one folder.

    Args:
        batch_files: file names (relative to `folder_path`) to process.
        folder_path: directory holding the files; its base name supplies the
            class label for every image in the batch.
        feature_type: one of 'pixel_intensity', 'color_histogram', 'hog',
            'cnn'.

    Returns:
        Tuple `(features, labels)`: a float32 array with one row per image
        that could be read, and an array of labels of the same length.

    Raises:
        ValueError: if `feature_type` is not a known extractor name.
    """
    # Fail fast on a bad feature type, even when the batch has no readable
    # images.
    if feature_type not in ('pixel_intensity', 'color_histogram', 'hog', 'cnn'):
        raise ValueError(f"Invalid feature type: {feature_type}")

    # The label depends only on the folder, so derive it once. Folder names
    # such as "Apple Braeburn 1" yield "Apple Braeburn". When the name has no
    # trailing number the whole name is used; the original appended nothing in
    # that case, leaving features and labels with different lengths.
    folder_name = os.path.basename(os.path.normpath(folder_path))
    match = re.search(r'([A-Za-z\s]+)(?=\s\d+$)', folder_name)
    label = match.group(1) if match else folder_name

    batch_features = []
    batch_labels = []

    for file in batch_files:
        file_path = join(folder_path, file)
        if not isfile(file_path):
            continue

        img = safe_image_read(file_path)
        if img is None:
            # Unreadable image: already reported by safe_image_read.
            continue

        batch_features.append(_extract_features(img, feature_type))
        batch_labels.append(label)

        # Memory management
        if get_memory_usage() > MAX_MEMORY_USAGE:
            gc.collect()

    return np.array(batch_features, dtype=np.float32), np.array(batch_labels)

def import_images_in_batches(path, feature_type):
    """Extract features for every image under `path`, with on-disk caching.

    Each sub-directory of `path` is treated as one class folder. Results are
    cached in `cache/features_<feature_type>.pkl` and reused on later calls.

    Args:
        path: dataset root containing one sub-directory per class.
        feature_type: extractor name forwarded to `process_batch`.

    Returns:
        Tuple `(features, labels)` with one row/entry per readable image.

    Raises:
        ValueError: if no readable image was found under `path`.
    """
    cache_file = join("cache", f"features_{feature_type}.pkl")

    # Try loading from cache.
    # NOTE: pickle can execute arbitrary code on load; the cache directory
    # must only contain files written by this pipeline.
    if isfile(cache_file):
        try:
            with open(cache_file, "rb") as f:
                return pickle.load(f)
        except Exception as exc:
            # A bare `except:` would also catch KeyboardInterrupt and then
            # delete a perfectly good cache; only real load errors discard it.
            print(f"Discarding unreadable cache {cache_file}: {exc}")
            os.remove(cache_file)

    # Smaller sub-batches keep peak memory low; never let the step reach 0.
    step = max(1, BATCH_SIZE // 5)

    # Process folders (and files) in sorted order for consistency.
    folders = sorted(f for f in os.listdir(path) if os.path.isdir(join(path, f)))
    all_features = []
    all_labels = []

    for folder in tqdm(folders, desc="Processing folders"):
        folder_path = join(path, folder)
        files = sorted(
            f for f in os.listdir(folder_path) if isfile(join(folder_path, f))
        )

        for start in range(0, len(files), step):
            features, labels = process_batch(
                files[start:start + step], folder_path, feature_type
            )

            if len(features) > 0:
                all_features.append(features)
                all_labels.append(labels)

            if get_memory_usage() > MAX_MEMORY_USAGE:
                gc.collect()

    if not all_features:
        # np.vstack([]) would fail with an unhelpful message.
        raise ValueError(f"No usable images found under: {path}")

    # Combine and save to cache
    all_features = np.vstack(all_features)
    all_labels = np.concatenate(all_labels)

    os.makedirs("cache", exist_ok=True)
    with open(cache_file, "wb") as f:
        pickle.dump((all_features, all_labels), f, protocol=4)  # Protocol 4 for cloud compatibility

    return all_features, all_labels

def apply_incremental_pca(features, labels, n_components=2):
    """Reduce `features` to `n_components` dimensions with IncrementalPCA.

    The model is fitted and applied batch by batch. The result is cached on
    disk under a name derived from `n_components` and a fingerprint of the
    feature matrix, so different feature sets never share a cache entry.

    Args:
        features: 2-D array-like, one row per sample.
        labels: labels aligned with the rows of `features`; returned and
            cached alongside the reduced data.
        n_components: number of principal components to keep.

    Returns:
        Tuple `(reduced, labels)`.

    Raises:
        ValueError: if `features` is empty or not two-dimensional.
    """
    import hashlib
    from sklearn.utils import gen_batches

    features = np.ascontiguousarray(features)
    if features.ndim != 2 or features.size == 0:
        raise ValueError("features must be a non-empty 2-D array")

    # The original cache name used n_components only, so the projection of
    # the first feature type processed was silently returned for every other
    # feature type. Key the cache on the data itself as well.
    fingerprint = hashlib.sha1(
        repr((features.shape, features.dtype.str)).encode("utf-8")
    )
    fingerprint.update(features.reshape(-1).view(np.uint8))  # no copy made
    cache_file = join(
        "cache", f"pca_{n_components}_{fingerprint.hexdigest()[:16]}.pkl"
    )

    # NOTE: pickle can execute arbitrary code on load; the cache directory
    # must only contain files written by this pipeline.
    if isfile(cache_file):
        with open(cache_file, "rb") as f:
            return pickle.load(f)

    # partial_fit rejects batches with fewer rows than n_components, so the
    # batch size must be at least n_components, and a short tail is merged
    # into the preceding batch (min_batch_size).
    batch_size = max(BATCH_SIZE, n_components)
    batches = list(
        gen_batches(len(features), batch_size, min_batch_size=n_components)
    )

    ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size)

    # Fit in batches
    for batch in tqdm(batches, desc="Fitting PCA"):
        ipca.partial_fit(features[batch])
        if get_memory_usage() > MAX_MEMORY_USAGE:
            gc.collect()

    # Transform in batches
    reduced = []
    for batch in tqdm(batches, desc="Transforming"):
        reduced.append(ipca.transform(features[batch]))
        if get_memory_usage() > MAX_MEMORY_USAGE:
            gc.collect()

    reduced = np.vstack(reduced)

    # The cache directory may not exist yet when this is called on its own.
    os.makedirs("cache", exist_ok=True)
    with open(cache_file, "wb") as f:
        pickle.dump((reduced, labels), f, protocol=4)

    return reduced, labels

def visualize_pca(reduced, labels, feature_type):
    """Save a scatter plot of the first two PCA components, coloured by label.

    The figure is written to `plots/pca_<feature_type>.png` rather than shown,
    because the script runs with a non-interactive backend.

    Args:
        reduced: 2-D array with at least two columns; columns 0 and 1 are
            plotted.
        labels: one label per row of `reduced`.
        feature_type: name used in the plot title and the output file name.
    """
    labels = np.asarray(labels)  # boolean masking below needs an array
    unique_labels = np.unique(labels)

    plt.figure(figsize=(12, 10))
    # plt.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap is the
    # supported way to get a colormap resampled to N discrete colours.
    cmap = plt.get_cmap('rainbow', len(unique_labels))

    for i, label in enumerate(unique_labels):
        mask = labels == label
        # An integer index selects the i-th discrete colour directly. A float
        # such as i / N can round down to the previous colour.
        plt.scatter(reduced[mask, 0], reduced[mask, 1],
                    label=label, s=20, alpha=0.6,
                    color=cmap(i))

    plt.title(f'PCA - {feature_type}')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()

    os.makedirs("plots", exist_ok=True)
    plt.savefig(f"plots/pca_{feature_type}.png", dpi=150, bbox_inches='tight')
    plt.close()
    print(f"Saved plot to plots/pca_{feature_type}.png")

def main(path='fruits-360/Training', component_counts=range(2, 111)):
    """Run feature extraction, PCA and plotting for every feature type.

    Args:
        path: dataset root with one sub-directory per class.
        component_counts: iterable of PCA component counts to evaluate.

    Raises:
        FileNotFoundError: if `path` does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"Dataset path not found: {path}")

    feature_types = ['pixel_intensity', 'color_histogram', 'hog', 'cnn']
    for feature_type in feature_types:
        print(f"\nProcessing {feature_type} features...")

        # The features do not depend on the number of PCA components, so load
        # (or compute) them once per feature type, not once per iteration.
        features, labels = import_images_in_batches(path, feature_type)

        for n_components in component_counts:
            # Apply PCA
            reduced, reduced_labels = apply_incremental_pca(
                features, labels, n_components=n_components
            )

            # Tag the plot with the component count. Otherwise every
            # iteration overwrites the previous plot for this feature type.
            visualize_pca(reduced, reduced_labels,
                          f"{feature_type}_n{n_components}")

            del reduced, reduced_labels
            gc.collect()

        # Explicit cleanup before the next (possibly large) feature matrix.
        del features, labels
        gc.collect()

if __name__ == "__main__":
    import warnings

    # Cloud-friendly runtime settings: silence Python warnings and reduce
    # TensorFlow's native logging.
    warnings.filterwarnings('ignore')
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    try:
        main()
    except Exception as exc:
        # Print a short summary, then re-raise to keep the full traceback.
        print(f"Error: {exc}")
        raise
    # NOTE: this cell is not expected to run in this notebook. It was executed
    # on the cloud, where the results were obtained. KMeans was then used to
    # create 440 models, saved in a model folder, and the best number of PCA
    # components was found from their silhouette scores, which are stored in
    # /home/lalit-mohan/model testing/clustering_results_20250411_223025.pkl

**Results for KMeans Clustering Evaluation:**

* Feature Extraction:

I extracted four feature types: Color Histogram, Pixel Intensity, HOG, and CNN Intermediate Layer.

* PCA for Dimensionality Reduction:

I applied PCA to reduce the feature dimensions and tested various numbers of components.

* KMeans Clustering:

I used KMeans to cluster the data for different values of k and calculated the Silhouette Score.

* Optimal k Selection:

I identified the best k for each feature by selecting the highest silhouette score.

* Results:

The optimal number of PCA components was 2 for all feature types.

* Conclusion:

The best-performing model was based on the Color Histogram features, with k = 2 and the highest silhouette score (0.3592).



In [3]:
import pickle

import matplotlib.pyplot as plt
import numpy as np  # needed for np.mean below; this cell did not import it

# Load pickle file.
# NOTE: pickle can execute arbitrary code on load; only open result files
# produced by your own runs.
with open('/home/lalit-mohan/model testing/clustering_results_20250411_223025.pkl', 'rb') as f:
    results = pickle.load(f)

# `results` maps model name -> {k: {'silhouette': score, ...}}.
# Compute each model's curve and statistics once, so the plot and the summary
# table cannot disagree.
curves = {}
summary = {}
for model, model_results in results.items():
    if not model_results:
        # max() on an empty list would raise; report and skip the model.
        print(f"Skipping {model}: no clustering results")
        continue

    k_values = sorted(model_results)
    silhouette_scores = [model_results[k]['silhouette'] for k in k_values]

    max_score = max(silhouette_scores)
    best_k = k_values[silhouette_scores.index(max_score)]

    curves[model] = (k_values, silhouette_scores)
    summary[model] = {
        'best_k': best_k,
        'max_score': max_score,
        'avg_score': np.mean(silhouette_scores)
    }

# Set up the plot
plt.figure(figsize=(12, 7))

# Colors and markers are cycled when there are more models than entries.
colors = ['blue', 'green', 'orange', 'purple', 'brown', 'teal', 'gray']
markers = ['o', 's', 'D', '^', 'v', 'p', '*']

for i, (model, (k_values, silhouette_scores)) in enumerate(curves.items()):
    color = colors[i % len(colors)]
    best_k = summary[model]['best_k']
    best_score = summary[model]['max_score']

    # Plot line
    plt.plot(k_values, silhouette_scores, marker=markers[i % len(markers)],
             color=color, label=f'{model.upper()}')

    # Highlight best point
    plt.scatter([best_k], [best_score], color=color,
                edgecolor='black', s=100, zorder=5)

    # Add dashed line for best k
    plt.axvline(best_k, color=color, linestyle='--', alpha=0.3)

# Plot settings
plt.title('Silhouette Score vs Number of Clusters for All Models')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.legend(title='Model')
plt.grid(True)
plt.tight_layout()
plt.show()

# Print model-wise summary
print(f"{'Model':<10} | {'Best k':<6} | {'Max Score':<10} | {'Avg Score':<10}")
print("-" * 50)
for model, stats in summary.items():
    print(f"{model:<10} | {stats['best_k']:<6} | {stats['max_score']:<10.4f} | {stats['avg_score']:<10.4f}")

# Find overall best model (highest peak silhouette score)
if summary:
    best_model = max(summary.items(), key=lambda x: x[1]['max_score'])
    print("\n Overall Best Model:")
    print(f"Model: {best_model[0]}")
    print(f"Best k: {best_model[1]['best_k']}")
    print(f"Silhouette Score: {best_model[1]['max_score']:.4f}")

Model      | Best k | Max Score  | Avg Score 
--------------------------------------------------
cnn        | 2      | 0.3327     | 0.1855    
pixel_intensity | 2      | 0.3572     | 0.1768    
hog        | 2      | 0.3310     | 0.1175    
color_histogram | 2      | 0.3592     | 0.3029    

 Overall Best Model:
Model: color_histogram
Best k: 2
Silhouette Score: 0.3592


I used the code below to build a KMeans model for each of the best-performing feature types.

In [None]:
import os
import pickle
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Mapping of feature files to corresponding output files
feature_files = {
    "data/images_color_histogram_50x50.pkl": "robo_mod/color_histogram_kmeans.pkl",
    "data/images_hog_50x50.pkl": "robo_mod/hog_kmeans.pkl",
    "data/images_pixel_intensity_50x50.pkl": "robo_mod/pixel_intensity_kmeans.pkl"
}

# Number of clusters (modify as needed)
NUM_CLUSTERS = 140

# Ensure output directory exists
os.makedirs("robo_mod", exist_ok=True)

for feature_file, kmeans_file in feature_files.items():
    print(f"\nProcessing {feature_file} → {kmeans_file}")

    # Load the original feature data.
    # NOTE: pickle can execute arbitrary code on load; only open feature
    # files produced by your own runs.
    with open(feature_file, "rb") as f:
        features = pickle.load(f)

    # Check if data is a tuple and extract the first element (which should be the NumPy array)
    if isinstance(features, tuple):
        X = features[0]
        print(f"Data extracted from tuple, first element shape: {X.shape}")
    elif isinstance(features, np.ndarray):
        X = features
    else:
        print(f"{feature_file} does not contain a NumPy array or tuple.")
        continue

    # KMeans cannot create more clusters than there are samples.
    if len(X) < NUM_CLUSTERS:
        print(f"{feature_file} has only {len(X)} samples; need at least {NUM_CLUSTERS}.")
        continue

    # Apply PCA to reduce dimensionality to 2 components. The seed keeps the
    # projection reproducible if a randomized solver is selected.
    pca = PCA(n_components=2, random_state=42)
    X_pca = pca.fit_transform(X)

    # Fit KMeans on the PCA-reduced features. n_init is set explicitly
    # because its default differs between scikit-learn versions.
    kmeans = KMeans(n_clusters=NUM_CLUSTERS, n_init=10, random_state=42)
    kmeans.fit(X_pca)

    # Save the KMeans model
    with open(kmeans_file, "wb") as f:
        pickle.dump(kmeans, f)

    # Also save the fitted PCA. The cluster centres live in this projection,
    # so new data must be transformed with the same PCA before calling
    # kmeans.predict.
    pca_file = kmeans_file.replace("_kmeans.pkl", "_pca.pkl")
    with open(pca_file, "wb") as f:
        pickle.dump(pca, f)

    print(f"Saved KMeans model at {kmeans_file} (n_clusters = {NUM_CLUSTERS})")
    print(f"Saved PCA model at {pca_file}")


I used the following code to generate some predictions.

In [None]:
import os
import joblib
import pickle
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from skimage.io import imread_collection
from skimage.color import rgb2hsv
from tqdm import tqdm  # Progress bar

# Locations of the dataset, the trained model and the feature cache
dataset_path = "/home/lalit-mohan/fruits-360/Training"
model_path = "robo_mod/color_histogram_kmeans.pkl"
feature_cache_path = "robo_mod/color_histogram_features.pkl"

# Histogram bins per colour channel
bins = 8

# Per-channel density histogram of an image whose values lie in [0, 1]
def compute_histogram(image, bins=8):
    """Concatenate one density histogram per channel of `image`.

    Each channel (third axis) is binned into `bins` equal-width bins over the
    range (0, 1) with density normalisation. The result is a 1-D array of
    length `bins * image.shape[2]`.
    """
    n_channels = image.shape[2]
    per_channel = (
        np.histogram(image[:, :, c], bins=bins, range=(0, 1), density=True)[0]
        for c in range(n_channels)
    )
    return np.array([value for counts in per_channel for value in counts])

# HSV colour histogram of an RGB image
def extract_color_histogram(image, bins=8):
    """Return the per-channel histogram of `image` in HSV space.

    The image is divided by 255.0 and converted from RGB to HSV before its
    histogram is computed with `compute_histogram`.
    """
    # presumably the input holds 8-bit values (0-255) — TODO confirm
    hsv_image = rgb2hsv(image / 255.0)
    return compute_histogram(hsv_image, bins)

# Load KMeans model.
# Failures end the script with `raise SystemExit(message)`, which prints the
# message and sets a non-zero exit status. A bare exit() reports success.
if not os.path.exists(model_path):
    raise SystemExit(f"Model file does not exist: {model_path}")

try:
    model = joblib.load(model_path)
except Exception as e:
    raise SystemExit(f"Error loading the model: {e}")

if not isinstance(model, KMeans):
    raise SystemExit("Loaded model is not a KMeans instance.")
print("KMeans model loaded successfully.")

# Load images from dataset
images = imread_collection(f"{dataset_path}/*/*.jpg")
print(f"{len(images)} images loaded.")

# Load or compute features
if os.path.exists(feature_cache_path):
    print("Loading features from cache...")
    # NOTE: pickle can execute arbitrary code on load; only open cache files
    # written by this script.
    with open(feature_cache_path, 'rb') as f:
        features = pickle.load(f)
else:
    print("Extracting features...")
    features = [extract_color_histogram(img, bins) for img in tqdm(images, desc="Extracting features")]
    features = np.array(features).astype(np.float64)  # Ensure dtype is float64

    print("Saving features to cache...")
    with open(feature_cache_path, 'wb') as f:
        pickle.dump(features, f)

# Apply PCA.
# NOTE(review): this fits a new PCA on the features computed here. It is not
# the projection the KMeans model was trained in, so the predicted clusters
# may not be meaningful. Ideally the PCA fitted at training time would be
# saved and reused here — TODO confirm and fix.
print("Reducing dimensions with PCA...")
pca = PCA(n_components=2)
# Cast to float32; presumably this matches the dtype the saved KMeans model
# was fitted with — TODO confirm.
reduced_features = pca.fit_transform(features).astype(np.float32)

# Predict with KMeans
print("Predicting cluster labels...")
print("Data type of reduced_features before prediction:", reduced_features.dtype)
predictions = model.predict(reduced_features)

# Output sample results (at most the first SAMPLE_SIZE items)
SAMPLE_SIZE = 1000
print("Predictions complete. Sample output:", predictions[:SAMPLE_SIZE])

# File paths of the sampled images. zip() below stops at the shorter of the
# two sequences, so a dataset with fewer than SAMPLE_SIZE images no longer
# raises IndexError.
image_paths = images.files[:SAMPLE_SIZE]

# Map (image name, parent directory) to predicted cluster
image_cluster_mapping = {
    (os.path.basename(img_path), os.path.dirname(img_path)): cluster
    for img_path, cluster in zip(image_paths, predictions)
}

# Display the mapping of image names and parent directories to clusters
print("Image Name, Parent Directory to Predicted Cluster Mapping:")
for (image_name, parent_dir), cluster in image_cluster_mapping.items():
    print(f"Image: {image_name} (Parent Directory: {parent_dir}) => Predicted Cluster: {cluster}")
