# In [None]:
import numpy as np
import pandas as pd
import cv2
import os
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import AdaBoostClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression  # For meta-classifier
from sklearn.cluster import KMeans

# Image folders for the train and test splits
train_image_dir = '/Users/prakhar/Desktop/Prediction_meesho/visual-taxonomy/train_images'
test_image_dir = '/Users/prakhar/Desktop/Prediction_meesho/visual-taxonomy/test_images'

# Tabular train / test sets
df_train = pd.read_csv('/Users/prakhar/Desktop/Prediction_meesho/visual-taxonomy/train.csv')
df_test = pd.read_csv('/Users/prakhar/Desktop/Prediction_meesho/visual-taxonomy/test.csv')

# Attributes to model, keyed by product category
categories_attributes = {'Women Tops & Tunics': ['attr_1']}

# Fixed label vocabulary for 'attr_1'; a label's position in this list is
# the integer code it is encoded to.
predefined_labels = [
    'black',
    'navy blue',
    'red',
    'default',
    'maroon',
    'white',
    'green',
    'blue',
    'pink',
    'yellow',
    'peach',
    'multicolor',
]

# Fitted encoders, filled in during training as
# label_encoders[category][attribute]
label_encoders = dict()

# Function to perform color quantization using K-Means
def extract_dominant_colors(image_path, k=3):
    """Describe an image by its ``k`` dominant RGB colours and their shares.

    The image is read with OpenCV, converted from BGR to RGB, resized to
    64x64 and its pixels are clustered with K-Means.

    Args:
        image_path: Path of the image file to read.
        k: Number of K-Means clusters (dominant colours) to extract.

    Returns:
        A 1-D array of length ``4 * k``: the ``k`` cluster centres flattened
        (three channel values each), followed by the fraction of pixels
        assigned to each cluster, in cluster-index order.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path`` (the file is
            missing or is not a decodable image).
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with the offending path rather than inside cvtColor.
        raise FileNotFoundError(f"Could not read image: {image_path}")

    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = cv2.resize(image, (64, 64))
    pixels = image.reshape(-1, 3)

    # Apply K-Means clustering with explicit n_init value
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(pixels)

    # Dominant colours are the cluster centroids
    colors = kmeans.cluster_centers_

    # Count pixels per cluster index. minlength=k keeps one entry per cluster
    # even when a cluster received no pixels (e.g. an image with fewer than k
    # distinct colours), so every image yields a vector of the same length.
    counts = np.bincount(kmeans.labels_, minlength=k)
    freqs = counts / counts.sum()

    # Flatten the color and frequency data into one feature vector
    features = np.concatenate([colors.flatten(), freqs])
    return features

# Training process for each category and attribute.
#
# For every (category, attribute) pair a stacked XGBoost + AdaBoost model is
# fitted on each of 3 K-Fold splits and the fitted model with the highest
# validation accuracy is kept in best_models[(category, attribute)], so the
# prediction step below uses the model trained for that exact pair instead
# of whichever model happened to be trained last.
best_models = {}

for category, attributes in categories_attributes.items():
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of df_train (pandas SettingWithCopyWarning).
    df_category = df_train[df_train['Category'] == category].copy()
    df_category['id'] = df_category['id'].astype(str)
    df_category['filename'] = df_category['id'].apply(lambda x: x.zfill(6) + '.jpg')

    label_encoders[category] = {}

    # Iterate over each attribute
    for attr in attributes:
        print(f"Training for category: {category}, attribute: {attr}")

        # Keep only the columns needed for this attribute and drop rows
        # that have no label for it
        df_attr = df_category[['filename', attr]].dropna().copy()

        # Fix the label vocabulary by hand so a label's integer code is its
        # position in predefined_labels. An object array keeps the strings
        # untouched (no fixed-width truncation) and makes LabelEncoder look
        # labels up by value, which does not require sorted classes.
        le = LabelEncoder()
        le.classes_ = np.array(predefined_labels, dtype=object)
        df_attr['label'] = le.transform(df_attr[attr])
        label_encoders[category][attr] = le

        # Extract features for each image
        df_attr['features'] = df_attr['filename'].apply(lambda x: extract_dominant_colors(os.path.join(train_image_dir, x)))

        # Build the feature matrix and label vector once; each fold only
        # indexes into them.
        X_all = np.stack(df_attr['features'].values)
        y_all = df_attr['label'].values

        # Set up K-Fold Cross-Validation
        kf = KFold(n_splits=3, shuffle=True, random_state=42)

        # Start below any reachable accuracy so the first fold always
        # initialises the "best" trackers (they would otherwise be unbound
        # if no fold scored above 0).
        best_accuracy_across_folds = -1.0
        best_f1_score_across_folds = None
        best_stack_model = None

        # Loop for each fold
        for fold, (train_idx, val_idx) in enumerate(kf.split(df_attr)):
            print(f"Training fold {fold+1} for {category} - {attr}")

            # Prepare training and validation data
            X_train, y_train = X_all[train_idx], y_all[train_idx]
            X_val, y_val = X_all[val_idx], y_all[val_idx]

            # Define individual models
            xgboost_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
            adaboost_model = AdaBoostClassifier(n_estimators=50, random_state=42)

            # Stacking with Logistic Regression as meta-classifier
            stack_model = StackingClassifier(
                estimators=[('xgboost', xgboost_model), ('adaboost', adaboost_model)],
                final_estimator=LogisticRegression(),
                cv=3
            )

            # Train the stacked model
            stack_model.fit(X_train, y_train)

            # Validate the model
            val_preds = stack_model.predict(X_val)
            val_accuracy = accuracy_score(y_val, val_preds)
            val_f1 = f1_score(y_val, val_preds, average='weighted')
            print(f"Validation accuracy for fold {fold+1}: {val_accuracy}")
            print(f"Validation F1 score for fold {fold+1}: {val_f1}")

            # Check if this fold's accuracy is the best so far (strict '>'
            # keeps the earliest fold on ties)
            if val_accuracy > best_accuracy_across_folds:
                best_accuracy_across_folds = val_accuracy
                best_f1_score_across_folds = val_f1
                best_stack_model = stack_model

        # Remember the winning model for this exact (category, attribute)
        best_models[(category, attr)] = best_stack_model

        print(f"Best model across all folds for {category} - {attr} with accuracy: {best_accuracy_across_folds} and F1 score: {best_f1_score_across_folds}")

# ---- Prediction on Test Set ----
all_predictions = []

for category, attributes in categories_attributes.items():
    if not attributes:
        # Nothing to predict for this category; skip feature extraction
        continue

    df_test_category = df_test[df_test['Category'] == category].copy()
    df_test_category['id'] = df_test_category['id'].astype(str)
    df_test_category['filename'] = df_test_category['id'].apply(lambda x: x.zfill(6) + '.jpg')

    # Features depend only on the image, not on the attribute, so extract
    # them once per category.
    df_test_category['features'] = df_test_category['filename'].apply(lambda x: extract_dominant_colors(os.path.join(test_image_dir, x)))
    X_test = np.stack(df_test_category['features'].values)

    predicted_columns = []
    for attr in attributes:
        print(f"Predicting on test set for category: {category}, attribute: {attr}")

        # Make predictions with the model trained for this pair
        test_preds = best_models[(category, attr)].predict(X_test)

        # Convert labels back to original categories
        test_preds_decoded = label_encoders[category][attr].inverse_transform(test_preds)

        # Append predictions to main dataframe
        column = f'predicted_{attr}'
        df_test_category[column] = test_preds_decoded
        predicted_columns.append(column)

    # One frame per category: one row per id, one column per attribute
    all_predictions.append(df_test_category[['id'] + predicted_columns])

# Concatenate all predictions and save to a CSV
df_predictions = pd.concat(all_predictions)
df_predictions.to_csv('tops_quantisation.csv', index=False)

print("Test predictions saved to 'tops_quantisation.csv'")
