In [1]:
import os
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.manifold import TSNE
import pandas as pd
from tqdm import tqdm


# Define constants
IMAGE_SIZE = (224, 224)  # Input size for ResNet
NUM_CLUSTERS = 5  # Change based on the number of trends you want to find
# Season folder names, grouped year by year from 2020 through 2024.
# Each year lists both 'Fall' and 'Autumn', in that order.
SEASON_TUPLE = tuple(
    f'{season_name} {year}'
    for year in range(2020, 2025)
    for season_name in ('Spring', 'Summer', 'Fall', 'Autumn', 'Winter')
)

# Function to load and preprocess images
def load_images_from_folder(folder_path):
    """Recursively load and preprocess every PNG/JPEG image under a folder.

    Files are selected by their real extension (``.png``, ``.jpg``,
    ``.jpeg``, case-insensitive). Each one is resized to ``IMAGE_SIZE``,
    converted to an array and passed through ``preprocess_input``. Files that
    fail to load are reported on stdout and skipped.

    Args:
        folder_path: Root directory to walk.

    Returns:
        A ``(images, filenames)`` pair: ``images`` is ``np.array`` of the
        preprocessed image arrays (an empty array when nothing was loaded) and
        ``filenames`` is the list of corresponding file paths, in the same
        order.
    """
    valid_extensions = {'.png', '.jpg', '.jpeg'}
    images = []
    filenames = []
    for root, dirs, files in os.walk(folder_path):
        # os.walk yields entries in arbitrary order; sorting makes the
        # image order (and therefore downstream results) reproducible.
        dirs.sort()
        for file in sorted(files):
            # Compare the actual extension so names that merely end in
            # "png"/"jpg" without a dot are not mistaken for images.
            if os.path.splitext(file)[1].lower() not in valid_extensions:
                continue
            file_path = os.path.join(root, file)
            try:
                img = load_img(file_path, target_size=IMAGE_SIZE)
                img_array = preprocess_input(img_to_array(img))
            except Exception as e:
                # Deliberately best-effort: one unreadable file must not
                # abort the whole folder.
                print(f"Error loading image {file_path}: {e}")
                continue
            images.append(img_array)
            filenames.append(file_path)
    if not images:
        print(f"No valid images found in folder: {folder_path}")
    return np.array(images), filenames

# Function to extract features using ResNet50
_FEATURE_MODEL = None  # headless ResNet50, built on first use and then reused


def _get_feature_model():
    """Return the shared headless ResNet50, building it on first use."""
    global _FEATURE_MODEL
    if _FEATURE_MODEL is None:
        _FEATURE_MODEL = ResNet50(weights='imagenet', include_top=False, pooling='avg')
    return _FEATURE_MODEL


def extract_features(images):
    """Compute ResNet50 features for a batch of preprocessed images.

    The network is created once and cached, so repeated calls (one per
    season) do not rebuild the model and reload its ImageNet weights.

    Args:
        images: NumPy array of preprocessed images.

    Returns:
        The output of ``model.predict`` on ``images`` (batch size 32) for a
        ResNet50 without its classification head and with average pooling.

    Raises:
        ValueError: If ``images`` contains no elements.
    """
    if images.size == 0:
        raise ValueError("The input image array is empty. Check the folder for valid image files.")

    return _get_feature_model().predict(images, batch_size=32, verbose=1)


# Function to label images with keywords
_LABEL_MODEL = None  # full ImageNet classifier, built on first use and then reused


def _get_label_model():
    """Return the shared ResNet50 ImageNet classifier, building it on first use."""
    global _LABEL_MODEL
    if _LABEL_MODEL is None:
        _LABEL_MODEL = ResNet50(weights='imagenet')
    return _LABEL_MODEL


def generate_image_labels(images):
    """Return the top-5 ImageNet class names for each image.

    All images are classified in one batched ``predict`` call rather than
    one call per image, and the classifier is cached between invocations.

    Args:
        images: Sequence or array of preprocessed images.

    Returns:
        A list with one entry per image; each entry is a list of the five
        class names returned by ``decode_predictions`` for that image. An
        empty input yields an empty list without loading the model.
    """
    if len(images) == 0:
        return []

    batch = np.asarray(images)
    predictions = _get_label_model().predict(batch, batch_size=32, verbose=1)
    decoded_predictions = decode_predictions(predictions, top=5)
    # Each decoded prediction is indexed as in the original code: item 1 is
    # the keyword that gets reported.
    return [[pred[1] for pred in image_predictions] for image_predictions in decoded_predictions]

# Function to perform clustering
def cluster_features(features, num_clusters):
    """Group feature vectors with K-Means.

    Args:
        features: Feature matrix handed to ``KMeans.fit_predict``.
        num_clusters: Number of clusters to form.

    Returns:
        A ``(labels, kmeans)`` pair: the cluster index assigned to each row
        of ``features`` and the fitted ``KMeans`` estimator (seeded with
        ``random_state=42``).
    """
    estimator = KMeans(random_state=42, n_clusters=num_clusters)
    assignments = estimator.fit_predict(features)
    return assignments, estimator

# Function to visualize clusters
def visualize_clusters(features, labels, filenames):
    """Plot clusters in 2-D and group the source images into per-cluster folders.

    Features are reduced with PCA (at most 50 components), embedded in two
    dimensions with t-SNE, and drawn as one scatter series per cluster.
    Afterwards each image is hard-linked (or copied, if linking fails) into a
    ``cluster_<label>`` directory under the current working directory.

    Args:
        features: 2-D feature matrix, one row per image.
        labels: Cluster label for each row of ``features``.
        filenames: Source image path for each row of ``features``.

    Raises:
        ValueError: If fewer than two samples are supplied.
    """
    import shutil  # local import: only needed for the copy fallback below

    features = np.asarray(features)
    labels = np.asarray(labels)
    n_samples = features.shape[0]
    if n_samples < 2:
        raise ValueError("At least two samples are required to visualize clusters.")

    # PCA cannot keep more components than min(n_samples, n_features), so a
    # fixed 50 breaks on small seasons.
    n_components = min(50, n_samples, features.shape[1])
    reduced_features = PCA(n_components=n_components).fit_transform(features)

    # t-SNE needs perplexity < n_samples; 30 is the library default.
    perplexity = float(min(30, n_samples - 1))
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    tsne_features = tsne.fit_transform(reduced_features)

    unique_labels = np.unique(labels)

    plt.figure(figsize=(10, 8))
    for label in unique_labels:
        mask = labels == label
        plt.scatter(tsne_features[mask, 0], tsne_features[mask, 1], label=f'Cluster {label}')

    plt.legend()
    plt.title('Clusters of Dresses')
    plt.show()

    # Save images grouped by cluster
    for label in unique_labels:
        cluster_folder = f'cluster_{label}'
        os.makedirs(cluster_folder, exist_ok=True)
        for idx in np.flatnonzero(labels == label):
            src_path = filenames[idx]
            dst_path = os.path.join(cluster_folder, os.path.basename(src_path))
            if os.path.lexists(dst_path):
                # Already present (e.g. from a previous run); os.link would
                # raise FileExistsError.
                continue
            try:
                os.link(src_path, dst_path)
            except OSError:
                # Hard links are unavailable across filesystems and on some
                # platforms; fall back to a regular copy.
                shutil.copy2(src_path, dst_path)

# Function to process each season separately
def process_season_images(folder_path):
    """Cluster and label the images of every season folder and write a CSV.

    For each name in ``SEASON_TUPLE`` that exists as a directory under
    ``folder_path``, the images are loaded, embedded, labelled with keywords
    and clustered. One row per image (season, filename, cluster, keywords) is
    collected and finally written to ``seasonal_dress_clusters.csv`` in the
    current working directory.

    Seasons without any loadable image are skipped instead of aborting the
    run, and the cluster count is capped at the number of images so small
    seasons do not make K-Means fail.

    Args:
        folder_path: Directory containing one sub-folder per season.
    """
    results = []
    for season in tqdm(SEASON_TUPLE, desc="Processing seasons"):
        season_folder = os.path.join(folder_path, season)
        if not os.path.isdir(season_folder):
            continue

        print(f"\nProcessing season: {season}")
        images, filenames = load_images_from_folder(season_folder)
        if not filenames:
            # Feature extraction rejects empty input; skipping keeps the
            # results of the other seasons.
            print(f"Skipping season with no usable images: {season}")
            continue

        print("Extracting features...")
        features = extract_features(images)

        print("Generating labels for images...")
        labels = generate_image_labels(images)

        print("Clustering features...")
        # K-Means cannot form more clusters than there are samples.
        num_clusters = min(NUM_CLUSTERS, len(filenames))
        cluster_labels, _ = cluster_features(features, num_clusters)

        # Compile results
        results.extend(
            {
                'Season': season,
                'Filename': filename,
                'Cluster': int(cluster),
                'Keywords': ", ".join(keywords),
            }
            for filename, cluster, keywords in zip(filenames, cluster_labels, labels)
        )

        print(f"Done processing season: {season}!")

    # Save results to a CSV file
    df = pd.DataFrame(results)
    df.to_csv('seasonal_dress_clusters.csv', index=False)
    print("Results saved to 'seasonal_dress_clusters.csv'.")

# Main script
# Runs only when this code is executed directly (not when imported).
if __name__ == "__main__":
    # Root directory passed to process_season_images, which looks inside it
    # for one sub-folder per entry of SEASON_TUPLE.
    folder_path = "images1"
    process_season_images(folder_path)


Processing seasons:   0%|          | 0/25 [00:00<?, ?it/s]


Processing season: Spring 2020
Extracting features...


Processing seasons:   0%|          | 0/25 [00:02<?, ?it/s]


ValueError: Size must be positive (size must be positive)