In [1]:
import os
import pandas as pd
import numpy as np
import cv2
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from warnings import filterwarnings
filterwarnings('ignore')

## CNN

In [2]:
# Global image directory path.
# Set the IMG_DIR environment variable to point at your own image folder without
# editing the notebook; when it is unset, the original local path is used.
IMG_DIR = os.environ.get(
    "IMG_DIR",
    "/Users/susanketsarkar/Desktop/Code/Meesho/data/train_images",  # Update with your image directory
)

def load_data(csv_path, attr_to_predict, img_dir=None):
    """Load the labelled rows for one attribute and attach their image paths.

    Args:
        csv_path: Path of the CSV file to read. It must contain the columns
            'id', 'Category' and ``attr_to_predict``.
        attr_to_predict: Name of the target column. Rows where this column is
            missing are dropped.
        img_dir: Directory that holds the images. Defaults to the module-level
            ``IMG_DIR`` when omitted.

    Returns:
        A new DataFrame with the columns 'id', 'Category', ``attr_to_predict``
        and 'image_path', where 'image_path' is
        ``<img_dir>/<id zero-padded to 6 characters>.jpg``. The row index of
        the CSV is preserved (it is not reset after dropping rows).

    Raises:
        KeyError: If a required column is absent from the CSV.
    """
    if img_dir is None:
        img_dir = IMG_DIR

    # dict.fromkeys removes duplicates while keeping order, so asking for
    # 'id' or 'Category' as the target does not yield a duplicated column.
    columns = list(dict.fromkeys(['id', 'Category', attr_to_predict]))

    raw = pd.read_csv(csv_path)
    missing = [col for col in columns if col not in raw.columns]
    if missing:
        raise KeyError(f"{csv_path} is missing required column(s): {missing}")

    # Non-mutating calls only: each step returns a fresh frame, so nothing is
    # written into a slice of the raw data.
    labelled = raw[columns].dropna(subset=[attr_to_predict])
    image_paths = labelled['id'].map(
        lambda x: os.path.join(img_dir, f"{str(x).zfill(6)}.jpg")
    )
    return labelled.assign(image_path=image_paths)

def preprocess_images(image_paths, target_size=(64, 64)):
    """Read the given image files and resize them into one stacked array.

    Paths that do not exist on disk, and files that OpenCV cannot decode, are
    skipped, so the result may contain fewer images than ``image_paths``.
    Callers that pair the result with labels must account for that.

    Args:
        image_paths: Iterable of image file paths.
        target_size: ``(width, height)`` passed to ``cv2.resize``.

    Returns:
        A NumPy array of the resized images in input order. When no image
        could be loaded, an empty ``uint8`` array of shape
        ``(0, height, width, 3)`` is returned so the result is still a batch.
    """
    images = []
    skipped = 0
    for img_path in image_paths:
        if not os.path.exists(img_path):
            skipped += 1
            continue
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread reports an unreadable or corrupt file by returning
            # None rather than raising; resizing None would fail obscurely.
            skipped += 1
            continue
        images.append(cv2.resize(img, target_size))

    if skipped:
        # Printed rather than warned: this notebook silences all warnings.
        print(f"Skipped {skipped} image(s) that were missing or unreadable.")

    if not images:
        width, height = target_size
        return np.empty((0, height, width, 3), dtype=np.uint8)
    return np.array(images)

def build_cnn_model(input_shape, num_classes):
    """Create and compile a small convolutional classifier.

    The network is two Conv2D + MaxPooling2D stages (32 then 64 filters, 3x3
    kernels, 2x2 pooling), followed by a 128-unit dense layer and a softmax
    output with ``num_classes`` units. It is compiled with the Adam optimizer,
    categorical cross-entropy loss and the accuracy metric.

    Args:
        input_shape: Shape of one input sample, given to the first Conv2D layer.
        num_classes: Number of units in the softmax output layer.

    Returns:
        The compiled Sequential model.
    """
    layer_stack = [
        Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        MaxPooling2D(pool_size=(2, 2)),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ]
    network = Sequential(layer_stack)

    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

def train_cnn_on_attribute(csv_path, attr_to_predict, epochs=10):
    """Train and evaluate a CNN that predicts one attribute from images.

    Loads the labelled rows with ``load_data``, reads the images with
    ``preprocess_images``, trains the model from ``build_cnn_model`` on 80% of
    the data and prints a classification report and accuracy for the other 20%.

    Args:
        csv_path: Path of the CSV file passed to ``load_data``.
        attr_to_predict: Name of the target column.
        epochs: Number of training epochs.

    Returns:
        None. Progress and metrics are printed.

    Raises:
        ValueError: If no usable image is found, or if the number of loaded
            images does not match the number of labelled rows.
    """
    # Load and prepare data
    df = load_data(csv_path, attr_to_predict)

    # preprocess_images skips files that are missing on disk. Drop those rows
    # first, otherwise the images and the labels taken from df go out of step.
    has_image = df['image_path'].map(os.path.exists).astype(bool)
    n_missing = int((~has_image).sum())
    if n_missing:
        print(f"Skipping {n_missing} rows whose image file is missing...")
    df = df[has_image]
    if df.empty:
        raise ValueError("No rows with an existing image file were found.")

    X = preprocess_images(df['image_path'].tolist())
    if len(X) != len(df):
        # Training on misaligned image/label pairs would silently teach the
        # model wrong labels, so stop instead.
        raise ValueError(
            f"Loaded {len(X)} images for {len(df)} labelled rows; "
            "images and labels would be misaligned."
        )

    # Scale pixel values from [0, 255] to [0, 1] for better-conditioned training.
    X = X.astype("float32") / 255.0

    print(f"Training on {len(X)} data points...")

    # Encode labels
    le = LabelEncoder()
    y = le.fit_transform(df[attr_to_predict])
    y = to_categorical(y)

    # Split the dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Build CNN model
    print("Building the model...")
    model = build_cnn_model(input_shape=(64, 64, 3), num_classes=y.shape[1])

    # Train the model
    model.fit(X_train, y_train, epochs=epochs, batch_size=4, validation_split=0.1, verbose=2)

    # Evaluate the model
    print("Generating the metrics...")
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_true = np.argmax(y_test, axis=1)

    # Print metrics
    # Pass every encoded label explicitly: a class absent from the test split
    # would otherwise make target_names disagree with the labels that occur.
    class_ids = np.arange(len(le.classes_))
    class_names = [str(name) for name in le.classes_]
    print("Classification Report:")
    print(
        classification_report(
            y_true,
            y_pred_classes,
            labels=class_ids,
            target_names=class_names,
            zero_division=0,
        )
    )
    print(f"Accuracy: {accuracy_score(y_true, y_pred_classes) * 100:.2f}%")

In [4]:
# Run configuration for this cell: which CSV to read and which column to predict.
csv_path = "../data/cat_wise_csv/Kurtis_data.csv"  # Relative path; replace with your actual CSV path
attr_to_predict = 'color'  # Target column; change this to the desired attribute
# Train and evaluate for 100 epochs (the function's default is 10); metrics are printed below.
train_cnn_on_attribute(csv_path, attr_to_predict, epochs=100)

Training on 6629 data points...
Building the model...
Epoch 1/100
