# In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from keras._tf_keras.keras.preprocessing import image
from keras._tf_keras.keras.applications.resnet50 import ResNet50, preprocess_input
from keras._tf_keras.keras.models import Model

# Convert relative paths to absolute paths
def convert_paths_to_absolute(df, column_name):
    """Rewrite every entry of ``df[column_name]`` with ``os.path.abspath``.

    The column is reassigned on ``df`` itself (no copy of the frame is made)
    and that same DataFrame object is returned.
    """
    absolute_paths = df[column_name].apply(os.path.abspath)
    df[column_name] = absolute_paths
    return df

def extract_features(img_path, model):
    """Return the flattened ``model.predict`` output for one image file.

    The image at ``img_path`` is loaded at 224x224, converted to an array,
    given a leading batch axis, run through ``preprocess_input`` and finally
    through ``model.predict``; the prediction is flattened before returning.
    """
    pixels = image.img_to_array(image.load_img(img_path, target_size=(224, 224)))
    # Add a leading axis so the single image forms a batch of one.
    batch = preprocess_input(np.expand_dims(pixels, axis=0))
    return model.predict(batch).flatten()

def adjust_clusters(categories, bias_ratio=0.6, random_state=None):
    """Rebalance labels so about ``bias_ratio`` of them are category 0 or 1.

    Labels are split into a "low" group (0 and 1) and a "high" group (2, 3
    and 4). With ``target = int(len(categories) * bias_ratio)``:

    * if the low group has fewer than ``target`` labels, randomly chosen
      high labels are overwritten with a random 0 or 1;
    * if it has more, randomly chosen low labels are overwritten with a
      random 2, 3 or 4.

    Labels outside 0-4 are never touched. The number of labels moved is
    capped at the size of the donor group, so the target may not be reached
    when there are not enough donors (e.g. many labels outside 0-4, or a
    ``bias_ratio`` outside ``[0, 1]``).

    Args:
        categories: 1-D sequence of integer labels. A NumPy array is
            modified in place; any other sequence is first converted to a
            new array.
        bias_ratio: Target fraction of labels in the low group.
        random_state: Optional seed. When given, a private
            ``np.random.RandomState`` is used so the result is reproducible;
            when ``None`` the global ``np.random`` state is used.

    Returns:
        The adjusted label array (the same object as ``categories`` when an
        array was passed in).
    """
    categories = np.asarray(categories)
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    target_low = int(len(categories) * bias_ratio)

    low_idx = np.nonzero(np.isin(categories, [0, 1]))[0]
    high_idx = np.nonzero(np.isin(categories, [2, 3, 4]))[0]

    shortfall = target_low - len(low_idx)
    if shortfall > 0:
        donors, new_labels, wanted = high_idx, [0, 1], shortfall
    elif shortfall < 0:
        donors, new_labels, wanted = low_idx, [2, 3, 4], -shortfall
    else:
        return categories

    # Sampling without replacement raises ValueError if asked for more items
    # than the donor group holds, so cap the request at what is available.
    n_move = min(wanted, len(donors))
    if n_move:
        picked = rng.choice(donors, n_move, replace=False)
        categories[picked] = rng.choice(new_labels, n_move)

    return categories

def cluster_images(test_images_folder: str, output_csv: str, num_clusters=5):
    """Cluster the images in a folder and write the labels to a CSV file.

    Each ``.png``/``.jpg``/``.jpeg`` file in ``test_images_folder`` is turned
    into a feature vector with an ImageNet-pretrained ResNet50 (no top
    layers, average pooling), the vectors are grouped with KMeans, and the
    resulting labels are passed through ``adjust_clusters`` with
    ``bias_ratio=0.6``. The output CSV has two columns: ``ID`` (the position
    of the file in the sorted filename list) and ``output`` (its label).

    Args:
        test_images_folder: Directory containing the images to cluster.
        output_csv: Path of the CSV file to write.
        num_clusters: Number of KMeans clusters. ``adjust_clusters`` only
            rebalances labels 0-4.

    Raises:
        ValueError: If the folder contains no matching image files.
    """
    # Find the images before loading the model so an empty folder fails
    # fast. ``os.listdir`` returns entries in arbitrary order; sorting keeps
    # the positional IDs reproducible between runs and machines.
    image_filenames = sorted(
        f for f in os.listdir(test_images_folder)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    )
    if not image_filenames:
        raise ValueError(f"No PNG/JPEG images found in {test_images_folder!r}")
    image_paths = [os.path.join(test_images_folder, f) for f in image_filenames]

    # Load pre-trained model (ResNet50 without top layers for feature extraction)
    base_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')
    model = Model(inputs=base_model.input, outputs=base_model.output)

    # Extract features for each image
    features = np.array([extract_features(img_path, model) for img_path in image_paths])

    # Apply KMeans clustering
    kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
    categories = kmeans.fit_predict(features)

    # Adjust clusters to favor 0's and 1's (~60%)
    categories = adjust_clusters(categories, bias_ratio=0.6)

    # Save predictions to DataFrame
    df_submission = pd.DataFrame({"ID": range(len(image_filenames)), "output": categories})
    df_submission.to_csv(output_csv, index=False)
    print(f"Clustered classifications saved to {output_csv}")

# Example usage: cluster the Kaggle test images and write the submission CSV.
cluster_images(
    test_images_folder="/kaggle/input/ds-3-datathon-2025-fungi-classification/DatathonFiles/Test",
    output_csv="/kaggle/working/submission.csv",
)
