In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

In [2]:
import logging
import os
from datetime import datetime
import json
import numpy as np
import cv2
from skimage.feature import hog
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, multilabel_confusion_matrix
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, hamming_loss

# Logger setup
# Configure the root logger to write INFO-and-above records to
# 'model_training_log.txt' in a "timestamp - level - message" layout.
# NOTE(review): logging.basicConfig is a no-op when the root logger already
# has handlers (some hosted notebook kernels pre-install one) — confirm this
# file is actually produced in the target environment.
logging.basicConfig(filename='model_training_log.txt', level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

In [3]:
# Tee-style stream: everything written to it goes to the console and to a log file
class Logger(object):
    """File-like object that mirrors writes to the current stdout and a log file.

    Intended to be assigned to ``sys.stdout`` so that ``print`` output is also
    appended to ``log_file``.

    Args:
        log_file: Path of the file to append to (opened in ``'a'`` mode).
    """

    def __init__(self, log_file):
        # Local import: none of the notebook's import cells bring in ``sys``,
        # so referencing it at module level raised NameError.
        import sys

        # Capture the stream that is active *now*, so output still reaches it
        # after this object replaces sys.stdout.
        self.terminal = sys.stdout
        self.log = open(log_file, 'a')

    def write(self, message):
        """Write ``message`` once to the console stream and once to the log file."""
        # No print() here: it would emit the message a second time and, once
        # this object is installed as sys.stdout, call write() recursively.
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        """Flush both underlying streams."""
        self.terminal.flush()
        self.log.flush()

    def close(self):
        """Close the log file (the console stream is left open)."""
        if not self.log.closed:
            self.log.close()

In [4]:
# Directory that get_path_from_file_name checks first when looking for a
# pre-computed feature archive; when it is None only the plain file name
# (relative to the working directory) is checked.
CACHE_PATH = '/kaggle/input/cloth-image-parsed-datasets'

In [5]:
import logging
import os
from datetime import datetime

def create_logger(size, feature_extract):
    """Build a logger that writes to a timestamped file under ``logs/`` and to the console.

    Args:
        size: Resize tuple; ``size[0]`` is used in the log file name and the
            whole tuple in the logger name.
        feature_extract: Feature-extraction function; its ``__name__`` is used
            in both the log file name and the logger name.

    Returns:
        The configured ``logging.Logger`` with exactly one file handler and one
        console handler, both at INFO level.
    """
    extractor_name = feature_extract.__name__
    log_filename = f"logs/{size[0]}_{extractor_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
    os.makedirs(os.path.dirname(log_filename), exist_ok=True)

    logger = logging.getLogger(f"{extractor_name}_{size}")
    logger.setLevel(logging.INFO)

    # logging.getLogger returns the same object for a given name, so calling
    # this function again (e.g. re-running the cell) would otherwise stack
    # another pair of handlers and emit every record multiple times.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    for handler in (logging.FileHandler(log_filename), logging.StreamHandler()):
        handler.setLevel(logging.INFO)
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    return logger

In [6]:
!pip install scikit-multilearn-ng

Collecting scikit-multilearn-ng
  Downloading scikit_multilearn_ng-0.0.8-py3-none-any.whl.metadata (6.8 kB)
Collecting liac-arff>=2.2.1 (from scikit-multilearn-ng)
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading scikit_multilearn_ng-0.0.8-py3-none-any.whl (109 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.6/109.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: liac-arff
  Building wheel for liac-arff (setup.py) ... [?25l[?25hdone
  Created wheel for liac-arff: filename=liac_arff-2.5.0-py3-none-any.whl size=11717 sha256=7ec6d0d58a2842e35efd98cbbb1c16bd2ff9eeefbb8cf5b0098063f92340c020
  Stored in directory: /root/.cache/pip/wheels/5d/2a/9c/3895d9617f8f49a0883ba686326d598e78a1c2f54fe3cae86d
Successfully built liac-arff
Installing collected packages: liac-arff, scikit-multilearn-ng
Successfully installed liac-arff-2.5.0 scikit-multilearn-ng-0.0.8

In [7]:
# --- Dataset locations -------------------------------------------------------
DIR = '/kaggle/input/8-labels-cloth-classification'
IMG_DIR = os.path.join(DIR, 'imgs')
# Each split keeps its annotations in <DIR>/<split>/data.json
TRAIN_PATH, VAL_PATH, TEST_PATH = (
    os.path.join(DIR, split, 'data.json') for split in ('train', 'val', 'test')
)
CLASS_PATH = os.path.join(DIR, 'classes.txt')

# --- Search grid -------------------------------------------------------------
# Values handed to PCA(n_components) in create_pipeline
N_COMPONENTS = [0.2, 0.3, 0.5, 0.7]
# Values handed to MLkNN(k=...) in create_pipeline (17 is currently disabled)
K = [5, 11]
# Square image side lengths to try (64 and 224 are currently disabled)
SIZES = [128]
RESIZES = list(zip(SIZES, SIZES))
# Share of each annotation file that load_data_from_json samples; it is also
# embedded in the cached feature file names as a percentage.
FRACTION = 1

# Class names, used as target_names in the classification report
labels = [
    "shirt, blouse",
    "top, t-shirt, sweatshirt",
    "jacket",
    "pants",
    "skirt",
    "dress",
    "shoe",
    "bag, wallet",
]

In [8]:
import random

# Cache of sampled row indices, keyed by the number of records in an
# annotation file. load_data_from_json fills it on first use so that later
# loads of a file with the same record count reuse the same sample and order.
# NOTE(review): no random.seed call is visible in this notebook, so the sample
# presumably differs between kernel sessions — confirm if reproducibility matters.
RANDOM_INDEXS = {}

In [9]:
def extract_hog_features(image):
    """Compute the HOG descriptor of a colour image.

    The image is converted from BGR to grayscale, then described with
    9 orientation bins, 8x8-pixel cells, 2x2-cell blocks and L2-Hys block
    normalisation.

    Args:
        image: Image array accepted by ``cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)``.

    Returns:
        The HOG feature vector returned by ``skimage.feature.hog``.
    """
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # visualize=False: the rendered HOG image was previously computed for every
    # sample and thrown away; the feature vector itself is unaffected.
    return hog(
        gray_image,
        orientations=9,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        block_norm="L2-Hys",
        visualize=False,
    )

In [10]:
def extract_edge_features(image):
    """Return the flattened Sobel edge-magnitude map of a colour image.

    The image is converted from BGR to grayscale, 3x3 Sobel gradients are taken
    along x and y, and their Euclidean magnitude is stored as uint8.

    Magnitudes above 255 are saturated to 255. The earlier version cast the
    float magnitudes straight to uint8; that cast does not saturate (values
    out of range typically wrap modulo 256), so the strongest edges came out
    as small numbers. Values up to 255 are unchanged by this fix.
    NOTE: feature archives cached with the earlier version differ for strong
    edges and should be regenerated so train and test features stay consistent.

    Args:
        image: Image array accepted by ``cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)``.

    Returns:
        1-D ``uint8`` array with one value per pixel.
    """
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    sobel_x = cv2.Sobel(gray_image, cv2.CV_64F, 1, 0, ksize=3)  # Gradient in x direction
    sobel_y = cv2.Sobel(gray_image, cv2.CV_64F, 0, 1, ksize=3)  # Gradient in y direction
    # hypot is already non-negative, so no absolute value is needed
    magnitude = np.hypot(sobel_x, sobel_y)
    # Saturate before the narrowing cast instead of letting it wrap around
    np.clip(magnitude, 0, 255, out=magnitude)
    return magnitude.astype(np.uint8).flatten()

In [11]:
def extract_both_features(image):
    """Describe an image by its HOG vector followed by its Sobel edge vector.

    Args:
        image: Image passed unchanged to both extractors.

    Returns:
        Single 1-D array: HOG features first, edge features second.
    """
    parts = (extract_hog_features(image), extract_edge_features(image))
    return np.concatenate(parts)

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from skmultilearn.adapt import MLkNN
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import NearestNeighbors

def create_pipeline(n_components, k) -> Pipeline:
    """Assemble the scaler -> PCA -> MLkNN pipeline for one grid point.

    Args:
        n_components: Passed positionally to ``PCA``.
        k: Passed as ``k`` to ``MLkNN``.

    Returns:
        An unfitted ``Pipeline`` with steps named 'scaler', 'pca' and 'classifier'.
    """
    return Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components)),
        ('classifier', MLkNN(k=k)),
    ])

In [13]:
from sklearn.metrics import f1_score, classification_report, hamming_loss, accuracy_score, multilabel_confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

def evaluate_predictions(y_true, y_pred):
    """Score multi-label predictions against the ground truth.

    Args:
        y_true: True label indicator matrix.
        y_pred: Predicted label indicator matrix.

    Returns:
        Dict with three entries:
          - "classification_report": text report using the module-level
            ``labels`` as class names (``zero_division=0``),
          - "hamming_loss": the Hamming loss,
          - "confusion_matrices": one confusion matrix per label.
    """
    # Per-label heatmaps are not drawn here; the matrices are returned so a
    # caller can plot them if wanted.
    return {
        "classification_report": classification_report(
            y_true, y_pred, target_names=labels, zero_division=0
        ),
        "hamming_loss": hamming_loss(y_true, y_pred),
        "confusion_matrices": multilabel_confusion_matrix(y_true, y_pred),
    }

In [14]:
import numpy as np
from scipy.sparse import issparse

def convert_to_dense(matrix):
    """Return ``matrix`` as a dense NumPy array.

    Args:
        matrix: A scipy sparse matrix or a ``numpy.ndarray``.

    Returns:
        ``matrix.toarray()`` for sparse input; the very same object for an
        ndarray input.

    Raises:
        ValueError: If ``matrix`` is neither sparse nor an ndarray.
    """
    if issparse(matrix):
        return matrix.toarray()
    if not isinstance(matrix, np.ndarray):
        raise ValueError("Input must be a scipy.sparse matrix or numpy.ndarray.")
    return matrix


In [15]:
def load_data_from_json(json_file, img_dir, feature_extract, target_size=(64, 64)):
    """Load an annotation file and turn its images into a feature matrix.

    A sample of ``int(len(data) * FRACTION)`` records is drawn once per record
    count and remembered in ``RANDOM_INDEXS``, so later calls on a file with
    the same number of records use the same records in the same order.

    Args:
        json_file: Path to a JSON list of records with "imgPath" and "labels".
        img_dir: Directory that "imgPath" values are relative to.
        feature_extract: Callable mapping a resized image to a feature vector.
        target_size: Size passed to ``cv2.resize``.

    Returns:
        ``(X, y)`` as NumPy arrays: one feature row and one label row per image
        that could be found and decoded.
    """
    with open(json_file, "r") as f:
        data = json.load(f)

    length = len(data)
    if RANDOM_INDEXS.get(length) is None:
        RANDOM_INDEXS[length] = random.sample(list(range(length)), int(length * FRACTION))

    # Sample the data using the indices
    sample_data = [data[i] for i in RANDOM_INDEXS[length]]

    X = []
    y = []
    skipped = 0

    for item in tqdm(sample_data, desc="Processing data"):
        image_path = os.path.join(img_dir, item["imgPath"])

        if not os.path.exists(image_path):
            skipped += 1
            continue

        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None (it does not raise) for files it cannot
            # decode; passing that to cv2.resize would abort the whole load.
            skipped += 1
            continue

        image = cv2.resize(image, target_size)
        X.append(feature_extract(image))
        y.append(item["labels"])  # Multi-hot encoded labels

    if skipped:
        # Make dropped samples visible instead of silently shrinking the data
        logging.getLogger(__name__).warning(
            "Skipped %d of %d records from %s (image missing or unreadable)",
            skipped, len(sample_data), json_file,
        )

    return np.array(X), np.array(y)

In [16]:
import joblib
import os
import numpy as np
from sklearn.model_selection import GridSearchCV

In [17]:
# Module-level placeholders for a best model, its Hamming loss and the
# hyper-parameters that produced it.
# NOTE(review): nothing in the visible cells reads or assigns these again —
# process_models tracks its best candidate in a local dict — so they look like
# leftovers; confirm before relying on or removing them.
best_model_ = None
best_hamming_loss_ = None
best_k_ = None
best_n_components = None

In [18]:
# CACHE: when True, load_or_process_data loads an existing feature archive
# instead of re-extracting features.
CACHE = True
# SAVED_DATA: when True, freshly extracted features are written to a
# compressed .npz archive by load_or_process_data.
SAVED_DATA = True


def get_path_from_file_name(file_name: str):
    """Find an existing copy of ``file_name``.

    The copy under ``CACHE_PATH`` is preferred (when ``CACHE_PATH`` is not
    None); otherwise ``file_name`` itself is tried as given.

    Returns:
        The first existing path, or None when neither location has the file.
    """
    candidates = [file_name]
    if CACHE_PATH is not None:
        # Shared cache directory takes priority over the working directory
        candidates.insert(0, os.path.join(CACHE_PATH, file_name))

    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    return None


def load_or_process_data(file_name, logger, size, feature_extract, data_type, load_function):
    """Return ``(X, y)`` for one split, from a cached archive when possible.

    Args:
        file_name: Archive file name; looked up via ``get_path_from_file_name``
            and also used as the save target.
        logger: Logger that receives progress messages.
        size: Resize tuple (only logged here).
        feature_extract: Feature function (only its ``__name__`` is logged here).
        data_type: Split name; arrays are stored under ``X_<data_type>`` and
            ``y_<data_type>``.
        load_function: Zero-argument callable returning ``(X, y)``; called only
            when no cached archive is used.

    Returns:
        ``(X, y)``. Freshly extracted ``X`` is cast to float32; cached arrays
        are returned as stored.
    """
    path = get_path_from_file_name(file_name)
    if path is not None and CACHE:
        logger.info(f"Loading preprocessed {data_type} data from {path}...")
        # np.load on an .npz returns a lazy NpzFile that holds the archive open;
        # the with-block closes it once both arrays have been read into memory.
        with np.load(path) as data:
            X, y = data[f'X_{data_type}'], data[f'y_{data_type}']
        logger.info(f"Loaded X_{data_type} shape: {X.shape}, y_{data_type} shape: {y.shape}")
        return X, y

    logger.info(f"Feature extract: {feature_extract.__name__}")
    logger.info(f"Resize: {size}")
    logger.info(f"Loading {data_type} data...")
    X, y = load_function()
    X = X.astype('float32')
    logger.info(f"X_{data_type} shape: {X.shape}, y_{data_type} shape: {y.shape}")

    if SAVED_DATA:
        logger.info(f"Saving preprocessed {data_type} data to {file_name}...")
        np.savez_compressed(file_name, **{f'X_{data_type}': X, f'y_{data_type}': y})

    return X, y


def load_combined_features(hog_file_name, edge_file_name, logger, size, data_type):
    """Build combined features by joining cached HOG and edge archives column-wise.

    Args:
        hog_file_name: File name of the cached HOG archive.
        edge_file_name: File name of the cached edge archive.
        logger: Logger that receives progress messages.
        size: Unused here; kept for a signature parallel to the other loaders.
        data_type: Split name; arrays are read from ``X_<data_type>`` and
            ``y_<data_type>``.

    Returns:
        ``(X, y)`` with HOG columns followed by edge columns and the labels from
        the HOG archive, or ``(None, None)`` when caching is disabled, either
        archive is missing, or the two archives do not describe the same
        samples in the same order.
    """
    # Honour the CACHE switch the same way load_or_process_data does
    if not CACHE:
        return None, None

    hog_file_path = get_path_from_file_name(hog_file_name)
    edge_file_path = get_path_from_file_name(edge_file_name)
    if hog_file_path is None or edge_file_path is None:
        return None, None

    logger.info(f"Loading combined {data_type} features from {hog_file_path} and {edge_file_path}...")
    x_key, y_key = f'X_{data_type}', f'y_{data_type}'
    # Close each NpzFile as soon as its arrays are in memory
    with np.load(hog_file_path) as data_hog:
        X_hog, y_hog = data_hog[x_key], data_hog[y_key]
    with np.load(edge_file_path) as data_edge:
        X_edge = data_edge[x_key]
        y_edge = data_edge[y_key] if y_key in data_edge.files else None

    # Rows are sampled in a random order at extraction time, so archives written
    # in different sessions can list the samples differently. Differing labels
    # (or row counts) prove the rows do not line up; joining them would pair one
    # image's HOG features with another image's edge features.
    rows_match = X_hog.shape[0] == X_edge.shape[0]
    labels_match = y_edge is None or np.array_equal(y_hog, y_edge)
    if not (rows_match and labels_match):
        logger.warning(
            f"Cached {data_type} HOG and edge archives are not row-aligned; "
            f"ignoring them so the combined features are recomputed."
        )
        return None, None

    X = np.concatenate((X_hog, X_edge), axis=1)
    logger.info(f"Combined X_{data_type} shape: {X.shape}, y_{data_type} shape: {y_hog.shape}")
    return X, y_hog


def generate_file_name(size: tuple[int], feature_extract, data_type):
    """Compose the archive file name for one split / size / extractor combination.

    The name embeds ``data_type``, the first element of ``size``, the
    extractor's ``__name__`` and ``FRACTION * 100`` followed by '%'.
    """
    side = size[0]
    extractor_name = feature_extract.__name__
    percent = FRACTION * 100
    return f"data_{data_type}_{side}_{extractor_name}_{percent}%.npz"

In [19]:
def process_data(size, feature_extract, data_type, logger):
    """Produce ``(X, y)`` for one split, reusing cached archives where possible.

    For ``extract_both_features`` the separately cached HOG and edge archives
    are tried first. If that yields nothing, or for any other extractor, the
    extractor's own archive is loaded or built via ``load_or_process_data``.

    Args:
        size: Resize tuple.
        feature_extract: Feature-extraction function.
        data_type: Split name, e.g. "train" or "test".
        logger: Logger that receives progress messages.

    Returns:
        ``(X, y)`` arrays for the requested split.
    """
    if feature_extract == extract_both_features:
        X, y = load_combined_features(
            generate_file_name(size, extract_hog_features, data_type),
            generate_file_name(size, extract_edge_features, data_type),
            logger,
            size,
            data_type,
        )
        if X is not None and y is not None:
            return X, y

    # Shared path: the extractor's own archive, extracted from images if absent
    return load_or_process_data(
        generate_file_name(size, feature_extract, data_type),
        logger,
        size,
        feature_extract,
        data_type,
        lambda: load_combined_data(data_type, feature_extract, size),
    )


def load_combined_data(data_type, feature_extract, size):
    """Extract features for a split straight from the images.

    Args:
        data_type: "train" loads the train and val annotation files and stacks
            them (train rows first); any other value loads the test file.
        feature_extract: Feature-extraction function.
        size: Resize tuple passed on as ``target_size``.

    Returns:
        ``(X, y)`` arrays for the requested split.
    """
    if data_type != "train":
        return load_data_from_json(TEST_PATH, IMG_DIR, feature_extract, size)

    # Load each file once and keep both outputs. Previously every file was
    # read and feature-extracted twice, once for X and once for y.
    X_train_part, y_train_part = load_data_from_json(TRAIN_PATH, IMG_DIR, feature_extract, size)
    X_val_part, y_val_part = load_data_from_json(VAL_PATH, IMG_DIR, feature_extract, size)

    return (
        np.concatenate([X_train_part, X_val_part], axis=0),
        np.concatenate([y_train_part, y_val_part], axis=0),
    )

In [20]:
def save_model(pipeline, filename):
    """Serialise a trained pipeline to ``filename`` with joblib and log the path.

    Uses the module-level ``logger``, which must be bound before this is called.
    """
    saved_message = f"Saved trained pipeline to {filename}"
    joblib.dump(pipeline, filename)
    logger.info(saved_message)

In [21]:
def load_model(filename):
    """Log the path, then read back a pipeline previously stored with joblib.

    Uses the module-level ``logger``, which must be bound before this is called.
    """
    logger.info(f"Loading trained pipeline from {filename}...")
    # joblib.load unpickles the file, so only use it on files you trust
    restored_pipeline = joblib.load(filename)
    return restored_pipeline

In [22]:
def evaluate_and_log_metrics(y_true, y_pred, phase, n_components, k):
    """Compute metrics with ``evaluate_predictions`` and write them to the log.

    Args:
        y_true: True label matrix.
        y_pred: Predicted label matrix.
        phase: Text placed in the log header (the callers pass "train" or "test").
        n_components: PCA setting, logged only.
        k: MLkNN setting, logged only.

    Returns:
        The metrics dict produced by ``evaluate_predictions``.
    """
    logger.info(f"Evaluation {phase} data (n_components={n_components}, k={k}):")
    metrics = evaluate_predictions(y_true, y_pred)

    log_entries = (
        f"Classification Report:\n{metrics['classification_report']}",
        f"Hamming loss: {metrics['hamming_loss']}",
        f"Confusion matrices:\n{metrics['confusion_matrices']}",
    )
    for entry in log_entries:
        logger.info(entry)

    return metrics

In [23]:
def train_and_save_pipeline(n_components, k, X_train, y_train, X_test, y_test):
    """Create, fit and evaluate one pipeline.

    Despite the name, nothing is written to disk here; the fitted pipeline is
    handed back to the caller.

    Args:
        n_components: PCA setting for ``create_pipeline``.
        k: MLkNN setting for ``create_pipeline``.
        X_train, y_train: Data the pipeline is fitted on (and scored on).
        X_test, y_test: Data used for the returned score.

    Returns:
        ``(pipeline, hamming_loss)`` where the loss is measured on the test data.
    """
    logger.info(f"Creating and training pipeline with n_components={n_components}, k={k}...")
    pipeline = create_pipeline(n_components, k)
    pipeline.fit(X_train, y_train)

    # Training-set scores are logged only
    train_predictions = convert_to_dense(pipeline.predict(X_train))
    evaluate_and_log_metrics(y_train, train_predictions, "train", n_components, k)

    # Test-set scores are logged and the Hamming loss is returned
    test_predictions = convert_to_dense(pipeline.predict(X_test))
    test_metrics = evaluate_and_log_metrics(y_test, test_predictions, "test", n_components, k)

    return pipeline, test_metrics['hamming_loss']
    

In [24]:
import gc

In [25]:
def process_models(K, N_COMPONENTS, X_train, y_train, X_test, y_test, feature_extract, size, FRACTION, CACHE_PATH, CACHE):
    """Train every (n_components, k) combination and save the best pipeline.

    "Best" means the LOWEST test Hamming loss (it is an error rate, so smaller
    is better). The previous version started from ``-inf`` and kept the
    largest loss, i.e. it saved the worst configuration.

    Args:
        K: Iterable of MLkNN ``k`` values.
        N_COMPONENTS: Iterable of PCA ``n_components`` values.
        X_train, y_train: Training data.
        X_test, y_test: Data each candidate is scored on.
        feature_extract: Feature function; its ``__name__`` goes into the file name.
        size: Resize tuple; ``size[0]`` goes into the file name.
        FRACTION: Sampling fraction; goes into the file name as a percentage.
        CACHE_PATH: Unused; kept for call compatibility.
        CACHE: Unused; kept for call compatibility.

    Returns:
        None. The winning pipeline is written under ``models/``.

    NOTE(review): candidates are ranked on the test split, so the reported test
    score of the chosen model is optimistic — consider ranking on a held-out
    validation split instead.
    """
    best_model_info = {
        'model': None,
        'hamming_loss': float('inf'),
        'n_components': None,
        'k': None
    }

    for n_components in N_COMPONENTS:
        for k in K:
            # Free memory before each iteration
            gc.collect()

            try:
                # Named test_hamming_loss so the imported hamming_loss function
                # is not shadowed inside this function.
                pipeline, test_hamming_loss = train_and_save_pipeline(
                    n_components, k, X_train, y_train, X_test, y_test
                )
            except Exception as e:
                # Best effort: one failing configuration must not stop the grid.
                # exc_info keeps the traceback in the log for diagnosis.
                logger.error(
                    f"Error processing model with n_components={n_components}, k={k}: {str(e)}",
                    exc_info=True,
                )
                continue

            if test_hamming_loss < best_model_info['hamming_loss']:
                # Replacing the entry drops the reference to the previous best
                best_model_info.update({
                    'model': pipeline,
                    'hamming_loss': test_hamming_loss,
                    'n_components': n_components,
                    'k': k
                })

            # Drop the local reference so a losing candidate can be collected
            # by the gc.collect() at the top of the next iteration.
            del pipeline

    if best_model_info['model'] is None:
        logger.warning("No valid models were processed")
        return

    model_file_name = (f"models/model_{size[0]}_{feature_extract.__name__}_"
                      f"{best_model_info['n_components']}_{best_model_info['k']}_"
                      f"{FRACTION * 100}%.joblib")

    try:
        save_model(best_model_info['model'], model_file_name)
        logger.info(f"Successfully saved model: {model_file_name}")
    except Exception as e:
        logger.error(f"Failed to save model: {str(e)}")
    finally:
        del best_model_info
        gc.collect()

In [26]:
# Ensure the 'models' folder exists, creating it if necessary: process_models
# saves the selected pipeline to a path under 'models/'.
os.makedirs('models', exist_ok=True)

In [27]:
# Main processing loop: for every image size and feature extractor, load (or
# build) the train and test feature matrices, then run the model grid.
for size in RESIZES:
    for feature_extract in [
        # extract_hog_features, extract_edge_features,
        extract_both_features
    ]:
        # `logger` is deliberately a module-level name: save_model, load_model,
        # evaluate_and_log_metrics, train_and_save_pipeline and process_models
        # all read it as a global, so it must be bound before they are called.
        logger = create_logger(size, feature_extract)
        X_train, y_train = process_data(size, feature_extract, "train", logger)
        X_test, y_test = process_data(size, feature_extract, "test", logger)

        process_models(K, N_COMPONENTS, X_train, y_train, X_test, y_test, feature_extract, size, FRACTION, CACHE_PATH, CACHE)


2025-01-02 15:20:49,836 - INFO - Loading combined train features from /kaggle/input/cloth-image-parsed-datasets/data_train_128_extract_hog_features_100%.npz and /kaggle/input/cloth-image-parsed-datasets/data_train_128_extract_edge_features_100%.npz...
2025-01-02 15:21:25,338 - INFO - Combined X_train shape: (39701, 24484), y_train shape: (39701, 8)
2025-01-02 15:21:25,353 - INFO - Loading combined test features from /kaggle/input/cloth-image-parsed-datasets/data_test_128_extract_hog_features_100%.npz and /kaggle/input/cloth-image-parsed-datasets/data_test_128_extract_edge_features_100%.npz...
2025-01-02 15:21:29,247 - INFO - Combined X_test shape: (4412, 24484), y_test shape: (4412, 8)
2025-01-02 15:21:29,354 - INFO - Creating and training pipeline with n_components=0.2, k=5...
2025-01-02 16:45:28,659 - INFO - Evaluation train data (n_components=0.2, k=5):
2025-01-02 16:45:28,829 - INFO - Classification Report:
                          precision    recall  f1-score   support

        