In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pandas as pd
import numpy as np
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torchvision import transforms

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [48]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import pandas as pd
import numpy as np
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Step 1: Load the annotation table and split it into train/test.
df = pd.read_csv("train.csv")
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
# Work on independent copies so the column assignments below cannot be treated
# as writes into a slice of `df` (avoids pandas' SettingWithCopyWarning).
train_df = train_df.copy()
test_df = test_df.copy()
train_df.to_csv("train_split.csv", index=False)
test_df.to_csv("test_split.csv", index=False)

# Step 2: Prepare encoders.
# The encoders are fitted on the FULL table, not just the train split: a label
# value that only occurs in the test split would otherwise make
# `LabelEncoder.transform` raise ValueError ("previously unseen labels").
# Only the label vocabulary is taken from the full table here.
color_encoder = LabelEncoder().fit(df['color'])
type_encoder = LabelEncoder().fit(df['type'])

train_df['color'] = color_encoder.transform(train_df['color'])
train_df['type'] = type_encoder.transform(train_df['type'])
train_df.to_csv("train_split_encoded.csv", index=False)

test_df['color'] = color_encoder.transform(test_df['color'])
test_df['type'] = type_encoder.transform(test_df['type'])
test_df.to_csv("test_split_encoded.csv", index=False)

# Step 3: Transforms
# Photo pipeline: resize to 224x224, convert to a tensor, then standardise each
# RGB channel with the mean/std below (these are the values commonly used for
# ImageNet-pretrained models).
transform_image = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Mel-spectrogram pipeline: resize the spectrogram picture to 128x431 and
# convert it to a tensor (a grayscale image becomes a [1, H, W] tensor).
transform_mel_raw = transforms.Compose(
    [
        transforms.Resize(size=(128, 431)),
        transforms.ToTensor(),
    ]
)

def transform_mel(mel_tensor, target_shape=(128, 431)):
    """Resize a mel-spectrogram tensor and rescale it with min-max scaling.

    Args:
        mel_tensor: 3-D tensor laid out as (channels, H, W). A leading batch
            axis is added internally because bilinear ``F.interpolate`` works
            on 4-D input.
        target_shape: ``(height, width)`` of the returned spectrogram.

    Returns:
        Tensor of shape ``(channels, target_H, target_W)`` equal to
        ``(x - min) / (max - min + 1e-5)`` of the resized input; the small
        constant keeps the division finite when the input is constant.
    """
    batched = mel_tensor[None]  # (1, C, H, W)
    resized = F.interpolate(
        batched,
        size=target_shape,
        mode='bilinear',
        align_corners=False,
    )[0]  # drop the batch axis again -> (C, target_H, target_W)

    lowest = resized.min()
    highest = resized.max()
    return (resized - lowest) / (highest - lowest + 1e-5)

# Step 4: Dataset class
class FashionMultiModalDataset(Dataset):
    """Paired garment photo / mel-spectrogram dataset driven by a CSV file.

    Every CSV row must provide ``image_name`` plus the label columns
    ``color``, ``type``, ``condition``, ``pilling`` and ``smell``. The photo is
    read from ``image_dir/<image_name>`` and the spectrogram picture from
    ``mel_dir/<image stem>.png``.

    The module-level ``transform_image``, ``transform_mel_raw`` and
    ``transform_mel`` callables are applied to the loaded pictures.

    Args:
        csv_path: Path of the (already label-encoded) CSV file.
        image_dir: Directory containing the photos.
        mel_dir: Directory containing the spectrogram PNG files.
    """

    # Extensions stripped (case-insensitively, suffix only) from ``image_name``
    # to obtain the spectrogram file stem.
    _IMAGE_EXTENSIONS = ('.jpg', '.jpeg')

    def __init__(self, csv_path, image_dir, mel_dir):
        self.df = pd.read_csv(csv_path)
        self.image_dir = image_dir
        self.mel_dir = mel_dir

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.df)

    @classmethod
    def _mel_key(cls, file_name):
        """Return ``file_name`` without a trailing JPEG extension.

        Only a trailing extension is removed, so a ``.jpg`` occurring in the
        middle of a name is left alone, and ``.JPG`` / ``.jpeg`` are handled
        as well. Names without a known extension are returned unchanged.
        """
        lowered = file_name.lower()
        for extension in cls._IMAGE_EXTENSIONS:
            if lowered.endswith(extension):
                return file_name[:-len(extension)]
        return file_name

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            dict with keys ``image`` and ``mel`` (tensors produced by the
            module-level transforms), ``color`` / ``type`` / ``condition`` /
            ``pilling`` (long scalars) and ``smell`` (float32 scalar).
        """
        row = self.df.iloc[idx]
        file_name = str(row['image_name'])

        # Load image. The context manager closes the file handle as soon as
        # the pixels have been converted, instead of leaving it to the garbage
        # collector.
        image_path = f"{self.image_dir}/{file_name}"
        with Image.open(image_path) as raw_image:
            image = transform_image(raw_image.convert("RGB"))

        # Load mel-spectrogram picture as single-channel grayscale.
        mel_path = f"{self.mel_dir}/{self._mel_key(file_name)}.png"
        with Image.open(mel_path) as raw_mel:
            mel_tensor = transform_mel_raw(raw_mel.convert("L"))
        mel_tensor = transform_mel(mel_tensor)

        # Labels
        label_color = torch.tensor(row['color'], dtype=torch.long)
        label_type = torch.tensor(row['type'], dtype=torch.long)

        # `condition` and `pilling` are shifted down by one to become
        # zero-based class indices (0-4 for five classes).
        label_condition = torch.tensor(int(row['condition']) - 1, dtype=torch.long)
        label_pilling = torch.tensor(int(row['pilling']) - 1, dtype=torch.long)
        # `smell` is a float target because it is trained with BCEWithLogitsLoss.
        label_smell = torch.tensor(int(row['smell']), dtype=torch.float32)

        return {
            "image": image,
            "mel": mel_tensor,
            "color": label_color,
            "type": label_type,
            "condition": label_condition,
            "pilling": label_pilling,
            "smell": label_smell
        }

# Step 5: DataLoaders
# Both splits read photos from ./image and spectrogram pictures from ./mel.
train_dataset = FashionMultiModalDataset(
    csv_path="train_split_encoded.csv",
    image_dir="./image",
    mel_dir="./mel",
)
test_dataset = FashionMultiModalDataset(
    csv_path="test_split_encoded.csv",
    image_dir="./image",
    mel_dir="./mel",
)

# Only the training loader shuffles; the test loader keeps file order.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [49]:

import torch.nn as nn


class MultiModalModel(nn.Module):
    """Two-branch CNN with five classification heads.

    The image branch yields a 128-dim feature feeding the ``color`` and
    ``type`` heads; the audio (mel-spectrogram) branch yields a 64-dim feature
    feeding the ``condition``, ``pilling`` and ``smell`` heads.

    Args:
        num_colors: Number of colour classes.
        num_types: Number of garment-type classes.
        num_conditions: Number of condition classes (default 5).
        num_pilling: Number of pilling classes (default 5).
    """

    def __init__(self, num_colors, num_types, num_conditions=5, num_pilling=5):
        super().__init__()

        # Image encoder: [B,3,H,W] -> [B,128]
        self.image_encoder = self._make_encoder((3, 32, 64, 128))

        # Audio encoder: [B,1,H,W] -> [B,64]
        self.audio_encoder = self._make_encoder((1, 16, 32, 64))

        # Classifier heads
        self.classifier_color = nn.Linear(128, num_colors)
        self.classifier_type = nn.Linear(128, num_types)
        self.classifier_condition = nn.Linear(64, num_conditions)
        self.classifier_pilling = nn.Linear(64, num_pilling)
        self.classifier_smell = nn.Linear(64, 1)  # single logit (binary target)

    @staticmethod
    def _make_encoder(channels):
        """Build a flat conv stack: 3x3 conv + ReLU per stage, pooled between.

        Every stage but the last is followed by a 2x2 max-pool; the last stage
        is followed by global average pooling and a flatten, so the output is
        ``[B, channels[-1]]``. The layers are placed directly in one
        ``nn.Sequential`` so parameter names stay ``<encoder>.0``, ``.3``,
        ``.6`` and existing checkpoints keep loading.
        """
        stages = list(zip(channels[:-1], channels[1:]))
        layers = []
        for position, (c_in, c_out) in enumerate(stages):
            layers.append(nn.Conv2d(c_in, c_out, kernel_size=3, padding=1))
            layers.append(nn.ReLU())
            if position == len(stages) - 1:
                layers.append(nn.AdaptiveAvgPool2d((1, 1)))
            else:
                layers.append(nn.MaxPool2d(2))
        layers.append(nn.Flatten())
        return nn.Sequential(*layers)

    def forward(self, image, mel):
        """Return logits ``(color, type, condition, pilling, smell)``.

        ``smell`` has shape ``[B, 1]``; the other outputs have one column per
        class of the corresponding head.
        """
        img_feat = self.image_encoder(image)    # [B,128]
        mel_feat = self.audio_encoder(mel)      # [B,64]

        pred_color = self.classifier_color(img_feat)
        pred_type = self.classifier_type(img_feat)

        pred_condition = self.classifier_condition(mel_feat)
        pred_pilling = self.classifier_pilling(mel_feat)
        pred_smell = self.classifier_smell(mel_feat)

        return pred_color, pred_type, pred_condition, pred_pilling, pred_smell

In [50]:
def train_model(model, train_loader, test_loader, num_epochs=10, lr=1e-3, device='cuda'):
    """Jointly train all five heads of ``model`` with Adam.

    The loss is the unweighted sum of cross-entropy on the colour, type,
    condition and pilling heads and binary cross-entropy (with logits) on the
    smell head. One summary line is printed per epoch.

    Args:
        model: Module called as ``model(image, mel)`` and returning the tuple
            ``(color, type, condition, pilling, smell)`` of logits.
        train_loader: Iterable of batches; each batch is a dict with keys
            ``image``, ``mel``, ``color``, ``type``, ``condition``,
            ``pilling`` and ``smell``.
        test_loader: Currently unused; kept so existing callers keep working.
        num_epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate.
        device: Device the model and batches are moved to.

    Returns:
        The same ``model`` instance, trained in place.
    """
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn_ce = nn.CrossEntropyLoss()
    loss_fn_bce = nn.BCEWithLogitsLoss()

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0
        correct_color, correct_type, num_samples = 0, 0, 0

        for batch in train_loader:
            image = batch['image'].to(device)
            mel = batch['mel'].to(device)

            # Targets
            color = batch['color'].to(device)
            type_ = batch['type'].to(device)
            condition = batch['condition'].to(device)
            pilling = batch['pilling'].to(device)
            smell = batch['smell'].to(device).float()  # float target for BCE

            # Forward
            pred_color, pred_type, pred_condition, pred_pilling, pred_smell = model(image, mel)

            # Per-head losses
            loss_color = loss_fn_ce(pred_color, color)
            loss_type = loss_fn_ce(pred_type, type_)
            loss_condition = loss_fn_ce(pred_condition, condition)
            loss_pilling = loss_fn_ce(pred_pilling, pilling)
            # Flatten [B, 1] -> [B]. A bare `.squeeze()` would also drop the
            # batch axis when B == 1, and BCEWithLogitsLoss then rejects the
            # shape mismatch with the [1]-shaped target.
            loss_smell = loss_fn_bce(pred_smell.reshape(-1), smell)

            # Total loss
            loss = loss_color + loss_type + loss_condition + loss_pilling + loss_smell

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1
            # Count correct predictions per sample so a short final batch does
            # not get the same weight as a full one.
            correct_color += (pred_color.argmax(1) == color).sum().item()
            correct_type += (pred_type.argmax(1) == type_).sum().item()
            num_samples += color.size(0)

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        avg_loss = total_loss / max(num_batches, 1)
        avg_acc_color = correct_color / max(num_samples, 1)
        avg_acc_type = correct_type / max(num_samples, 1)

        print(f"Epoch {epoch+1}/{num_epochs} | Loss: {avg_loss:.4f} | Color Acc: {avg_acc_color:.3f} | Type Acc: {avg_acc_type:.3f}")

    print("‚úÖ Training complete.")
    return model

In [51]:
# Number of classes for the colour and type heads
num_colors = len(color_encoder.classes_)
num_types = len(type_encoder.classes_)

# Build the model
model = MultiModalModel(num_colors=num_colors, num_types=num_types)

# Train on the device selected in the first cell (GPU when available) instead
# of a hard-coded 'cpu'.
trained_model = train_model(model, train_loader, test_loader, num_epochs=1, lr=1e-3, device=device)

# Save the weights as CPU tensors so the checkpoint can be loaded on a machine
# without a GPU regardless of where training ran.
torch.save(
    {name: tensor.detach().cpu() for name, tensor in trained_model.state_dict().items()},
    "multimodal_model.pth",
)
print("‚úÖ ‡πÇ‡∏°‡πÄ‡∏î‡∏•‡∏ñ‡∏π‡∏Å‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡πÄ‡∏£‡∏µ‡∏¢‡∏ö‡∏£‡πâ‡∏≠‡∏¢‡∏ó‡∏µ‡πà multimodal_model.pth")

: 