# I. CÀI ĐẶT / IMPORT THƯ VIỆN / LOAD DATA


In [None]:
# !pip install datasets accelerate evaluate torchvision

In [None]:
import psutil
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms, models

from datasets import load_dataset, DatasetDict
from datasets import Features, Value, Image


In [None]:
# Check the available RAM and the number of logical CPUs on this machine
available_ram = psutil.virtual_memory().available
cpu_count = psutil.cpu_count(logical=True)
# The value is divided by 1024**3 before being printed with a "GB" label
print(f"Available RAM: {available_ram / (1024**3):.2f} GB")
print(f"CPU count: {cpu_count}")

Available RAM: 9.20 GB
CPU count: 2


## 1.1 Tải dataset từ Hugging Face

In [None]:
# Load the fashion product dataset through Hugging Face `datasets` and show its splits/columns
dataset_dict  = load_dataset("ashraq/fashion-product-images-small")
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['id', 'gender', 'masterCategory', 'subCategory', 'articleType', 'baseColour', 'season', 'year', 'usage', 'productDisplayName', 'image'],
        num_rows: 44072
    })
})


In [None]:
# Original data: dataset_dict["train"]
# This reference is taken before the price column is added and is used later to build the label mappings.
train_dataset_org = dataset_dict["train"]

In [None]:
# Take a look at one record
print(train_dataset_org[0])

{'id': 15970, 'gender': 'Men', 'masterCategory': 'Apparel', 'subCategory': 'Topwear', 'articleType': 'Shirts', 'baseColour': 'Navy Blue', 'season': 'Fall', 'year': 2011.0, 'usage': 'Casual', 'productDisplayName': 'Turtle Check Men Navy Blue Shirt', 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=60x80 at 0x79995303C890>}


# II. TIỀN XỬ LÝ & CHUẨN BỊ DỮ LIỆU

## 2.1 Thêm cột giá (price) minh hoạ

In [None]:
def assign_price(masterCategory, subCategory, articleType, year, season, usage):
    """Simulate a price for one product from its category fields and year.

    A (min, max) range is chosen as follows: the article-type range when the
    article type is listed, otherwise the master-category range (default
    (10, 100)) shifted by the sub-category adjustment (default (0, 0)).
    The range is then discounted by year (30% before 2015, 15% before 2018,
    nothing otherwise), one value is drawn uniformly from it with NumPy's
    global RNG, multiplied by 2500 and rounded to 0 decimals.

    `season` and `usage` are accepted but not used by the computation.
    """
    import numpy as np

    # (min, max) base range per master category
    by_master = {
        'Apparel': (10, 20),
        'Accessories': (5, 100),
        'Footwear': (20, 200),
        'Personal Care': (5, 300),
        'Free Items': (0, 5),
        'Sporting Goods': (100, 250),
        'Home': (100, 300),
    }
    # (min, max) amounts added to the base range per sub category
    by_sub = {
        'Topwear': (5, 50),
        'Watches': (30, 150),
        'Shoes': (20, 100),
        'Bags': (10, 70),
        'Fragrance': (5, 80),
        'Jewellery': (10, 200),
    }
    # (min, max) range that replaces the computed one for specific article types
    by_article = {
        'Shirts': (15, 70),
        'Watches': (50, 300),
        'Sports Shoes': (40, 200),
        'Sunglasses': (30, 150),
        'Perfumes': (25, 120),
    }

    master_lo, master_hi = by_master.get(masterCategory, (10, 100))
    sub_lo, sub_hi = by_sub.get(subCategory, (0, 0))
    fallback_range = (master_lo + sub_lo, master_hi + sub_hi)
    range_lo, range_hi = by_article.get(articleType, fallback_range)

    # Older products get a larger discount
    discount = 0.3 if year < 2015 else (0.15 if year < 2018 else 0)
    keep_ratio = 1 - discount

    drawn = np.random.uniform(range_lo * keep_ratio, range_hi * keep_ratio)
    return round(drawn * 2500, 0)

In [None]:
def add_price_column(example):
    """Add a simulated `price_in_vnd` field to one dataset record and return the record."""
    # Fields passed to assign_price, in the positional order it expects
    price_inputs = ("masterCategory", "subCategory", "articleType", "year", "season", "usage")
    example["price_in_vnd"] = assign_price(*[example[field] for field in price_inputs])
    return example

In [None]:
# Null check: flags records that contain a missing field (nothing is removed here)
def has_null(example):
    """Return True when at least one field of `example` is None."""
    for field_value in example.values():
        if field_value is None:
            return True
    return False

# Keep only the records flagged by has_null and report how many there are
missing_null = dataset_dict["train"].filter(has_null)
print(f"Số record có giá trị null: {len(missing_null)}")

Số record có giá trị null: 0


In [None]:
# Mapped dataset: add the simulated price column to every record.
# assign_price draws from NumPy's global RNG, so seed it first to make the
# generated prices repeatable (same seed value as the train/val/test split).
np.random.seed(42)
dataset_dict["train"] = dataset_dict["train"].map(add_price_column)

In [None]:
# Split into train/val/test: 10% of all rows go to test, then 10% of the
# remaining 90% (about 9% of all rows) go to val. Both splits use seed=42.
train_test_split = dataset_dict['train'].train_test_split(test_size=0.1, seed=42)
train_val_split = train_test_split['train'].train_test_split(test_size=0.1, seed=42)

In [None]:
# Gather the three splits under explicit names
final_dataset = DatasetDict({
    'train': train_val_split['train'],
    'val': train_val_split['test'],    # the "test" part of the second split is the validation set
    'test': train_test_split['test']   # the "test" part of the first split is the held-out test set
})

In [None]:
# Short aliases for the three splits; the following cells reassign these names after each transform
train_dataset = final_dataset["train"]
val_dataset   = final_dataset["val"]
test_dataset  = final_dataset["test"]

In [None]:
# Define the target schema (used to cast the splits)
features = Features({
    'id': Value('int64'),
    'gender': Value('string'),
    'masterCategory': Value('string'),
    'subCategory': Value('string'),
    'articleType': Value('string'),
    'baseColour': Value('string'),
    'season': Value('string'),
    'year': Value('int64'),           # the sample record printed above shows year as a float (2011.0)
    'usage': Value('string'),
    'productDisplayName': Value('string'),
    'image': Image(),
    'price_in_vnd': Value('int64')    # assign_price returns a value rounded to 0 decimals
})

In [None]:
# Cast every split to the schema defined in `features`
train_dataset = train_dataset.cast(features)
val_dataset   = val_dataset.cast(features)
test_dataset  = test_dataset.cast(features)

## 2.2 Mã hoá các cột phân loại, tạo cột brand (chỉ số hoá) từ 'productDisplayName' + 'gender' và chuẩn hoá cột số

In [None]:
def extract_brand(name, gender):
    """Extract the brand from a product display name.

    The brand is taken to be every word that precedes the first occurrence of
    the gender word (compared case-insensitively, on space-separated tokens),
    e.g. "Turtle Check Men Navy Blue Shirt" with gender "Men" -> "Turtle Check".
    A standalone "by" token inside that prefix is dropped.

    Args:
        name: product display name (string).
        gender: gender label of the product (string).

    Returns:
        The brand string. When the gender word does not appear in the name,
        the first token of the name is returned instead.
    """
    tokens = name.split(" ")
    tokens_lower = name.lower().split(" ")
    gender_lower = gender.lower()

    if gender_lower not in tokens_lower:
        return tokens[0]

    idx = tokens_lower.index(gender_lower)
    # Drop only the whole word "by". A substring replace would also cut it out
    # of brand words such as "Baby" or "Derby".
    brand_tokens = [token for token in tokens[:idx] if token != "by"]
    return " ".join(brand_tokens).strip()


In [None]:
def build_brand_mapping(dataset):
    """Build brand <-> id lookup tables from a dataset.

    Brands are extracted from the "productDisplayName" and "gender" columns
    with extract_brand, de-duplicated and sorted; ids follow the sorted order.

    Returns:
        (mapping, inv_mapping): brand -> id and id -> brand dictionaries.
    """
    distinct_brands = {
        extract_brand(name, gender)
        for name, gender in zip(dataset["productDisplayName"], dataset["gender"])
    }
    mapping = {}
    inv_mapping = {}
    for idx, brand in enumerate(sorted(distinct_brands)):
        mapping[brand] = idx
        inv_mapping[idx] = brand
    return mapping, inv_mapping

In [None]:
def build_mapping(dataset, col):
    """Build label <-> id lookup tables for one column.

    The distinct values returned by `dataset.unique(col)` are sorted and
    numbered from 0 in that order.

    Returns:
        (mapping, inv_mapping): label -> id and id -> label dictionaries.
    """
    labels = dataset.unique(col)
    labels.sort()
    mapping = dict(zip(labels, range(len(labels))))
    inv_mapping = {}
    for label, idx in mapping.items():
        inv_mapping[idx] = label
    return mapping, inv_mapping

In [None]:
# Build the mappings from the FULL original dataset (train_dataset_org), or as needed.
# Each call returns (label -> id, id -> label).
gender_map, inv_gender_map           = build_mapping(train_dataset_org, "gender")
master_category_map, inv_master_map  = build_mapping(train_dataset_org, "masterCategory")
sub_category_map, inv_sub_cat_map    = build_mapping(train_dataset_org, "subCategory")
article_type_map, inv_article_map    = build_mapping(train_dataset_org, "articleType")
base_colour_map, inv_base_colour_map = build_mapping(train_dataset_org, "baseColour")
usage_map, inv_usage_map             = build_mapping(train_dataset_org, "usage")
brand_map, inv_brand_map             = build_brand_mapping(train_dataset_org)

In [None]:
def update_categorical(example):
    """Replace the categorical strings of one record with integer ids and add a `brand` id.

    The record must still hold the original strings: mapping an already
    encoded record raises KeyError.
    """
    # Column -> lookup table, applied in this order
    encoders = (
        ("gender", gender_map),
        ("masterCategory", master_category_map),
        ("subCategory", sub_category_map),
        ("articleType", article_type_map),
        ("baseColour", base_colour_map),
        ("usage", usage_map),
    )
    for column, lookup in encoders:
        example[column] = lookup[example[column]]

    # "gender" now holds an id, so decode it back to the string extract_brand expects
    gender_name = inv_gender_map[example["gender"]]
    brand_name = extract_brand(example["productDisplayName"], gender_name)
    example["brand"] = brand_map.get(brand_name, -1)  # -1 when the brand is unknown
    return example

In [None]:
# Encode the categorical columns (and add the brand id) on every split
train_dataset = train_dataset.map(update_categorical)
val_dataset   = val_dataset.map(update_categorical)
test_dataset  = test_dataset.map(update_categorical)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Scaler for the "year" and "price_in_vnd" columns; it is fitted on the training split further down
min_max_scaler = MinMaxScaler()

def scale_dataset(ds, scaler):
    """Replace the "year" and "price_in_vnd" columns of `ds` with scaled values.

    The two columns are stacked into a (rows, 2) array, passed through
    `scaler.transform` (the scaler must already be fitted), and written back
    row by row through `ds.map(..., with_indices=True)`.

    Returns:
        The dataset returned by `ds.map`.
    """
    scaled_columns = ["year", "price_in_vnd"]
    raw = np.array([ds[name] for name in scaled_columns]).T
    scaled = scaler.transform(raw)

    def _scaled_row(ex, idx):
        # Column 0 is the scaled year, column 1 the scaled price
        year_scaled, price_scaled = scaled[idx][0], scaled[idx][1]
        return {"year": year_scaled, "price_in_vnd": price_scaled}

    return ds.map(_scaled_row, with_indices=True)

# Fit the scaler on the training split only, and keep the scaled training values
arr_train = np.array([train_dataset[col] for col in ["year","price_in_vnd"]]).T
scaled_train = min_max_scaler.fit_transform(arr_train)

In [None]:
def f_train(ex, idx):
    """Return the scaled year/price of training row `idx` (taken from `scaled_train`)."""
    scaled_year, scaled_price = scaled_train[idx][0], scaled_train[idx][1]
    return {"year": scaled_year, "price_in_vnd": scaled_price}


# Write the scaled values back into the training split
train_dataset = train_dataset.map(f_train, with_indices=True)

# Transform val and test with the scaler fitted on train
val_dataset  = scale_dataset(val_dataset, min_max_scaler)
test_dataset = scale_dataset(test_dataset, min_max_scaler)

In [None]:
# Manual season encoding
season_mapping = {'Spring': 0, 'Summer': 1, 'Fall': 2, 'Winter': 3}


def _encode_season(example):
    """Return the integer code of the record's season."""
    return {'season': season_mapping[example['season']]}


train_dataset = train_dataset.map(_encode_season)
val_dataset   = val_dataset.map(_encode_season)
test_dataset  = test_dataset.map(_encode_season)


In [None]:
# Drop the productDisplayName column if desired (keeps the datasets lighter);
# the brand id has already been derived from it in update_categorical
train_dataset = train_dataset.remove_columns(["productDisplayName"])
val_dataset   = val_dataset.remove_columns(["productDisplayName"])
test_dataset  = test_dataset.remove_columns(["productDisplayName"])


In [None]:
# Inspect the schema after encoding and scaling
train_dataset.features

{'id': Value(dtype='int64', id=None),
 'gender': Value(dtype='int64', id=None),
 'masterCategory': Value(dtype='int64', id=None),
 'subCategory': Value(dtype='int64', id=None),
 'articleType': Value(dtype='int64', id=None),
 'baseColour': Value(dtype='int64', id=None),
 'season': Value(dtype='int64', id=None),
 'year': Value(dtype='float64', id=None),
 'usage': Value(dtype='int64', id=None),
 'image': Image(mode=None, decode=True, id=None),
 'price_in_vnd': Value(dtype='float64', id=None),
 'brand': Value(dtype='int64', id=None)}

In [None]:
# Show one fully preprocessed training record
print("Train sample:", train_dataset[0])

Train sample: {'id': 32762, 'gender': 2, 'masterCategory': 1, 'subCategory': 38, 'articleType': 103, 'baseColour': 2, 'season': 1, 'year': 0.4166666666666572, 'usage': 2, 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=60x80 at 0x7999482E0CD0>, 'price_in_vnd': 0.15022603799024467, 'brand': 405}


# III. CHUẨN BỊ DataLoader

In [None]:
# Image transforms.
# The backbone is created with pretrained ImageNet weights, so the inputs are
# normalized with the ImageNet channel statistics those weights were trained with.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD  = [0.229, 0.224, 0.225]

# Training: resize, random horizontal flip (augmentation), tensor conversion, normalization
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])
# Validation/test: same pipeline without the random augmentation
val_test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

In [None]:
class FashionDataset(Dataset):
    """
    Torch dataset over a preprocessed record collection.

    Each item is a tuple (image, numeric_tensor, labels_dict):
      - image: the record's image converted to RGB, passed through `transform` when one is given
      - numeric_tensor: float tensor [season, year, price_in_vnd]
      - labels_dict: one long tensor per task (7 tasks)
    """

    # Columns packed into the numeric tensor (season is an integer code; year and price are scaled)
    _NUMERIC_COLUMNS = ("season", "year", "price_in_vnd")
    # Columns returned as classification targets, in the order of the labels dict
    _LABEL_COLUMNS = (
        "gender",
        "masterCategory",
        "usage",
        "subCategory",
        "articleType",
        "baseColour",
        "brand",
    )

    def __init__(self, hf_dataset, transform=None):
        self.dataset = hf_dataset
        self.transform = transform

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        record = self.dataset[idx]

        # 1) Image
        image = record["image"].convert("RGB")
        if self.transform:
            image = self.transform(image)

        # 2) Numeric features -> float tensor (add or remove columns as needed)
        numeric_features = torch.tensor(
            [record[column] for column in self._NUMERIC_COLUMNS],
            dtype=torch.float,
        )

        # 3) Labels (7 multi-task outputs)
        labels = {
            column: torch.tensor(record[column], dtype=torch.long)
            for column in self._LABEL_COLUMNS
        }

        return image, numeric_features, labels

In [None]:
# Create the Torch datasets; only the training set uses the augmenting transform
train_torch_dataset = FashionDataset(train_dataset, transform=train_transform)
val_torch_dataset   = FashionDataset(val_dataset,   transform=val_test_transform)
test_torch_dataset  = FashionDataset(test_dataset,  transform=val_test_transform)

In [None]:
# Batches of 32; only the training loader shuffles. num_workers=0 loads data in the main process.
train_loader = DataLoader(train_torch_dataset, batch_size=32, shuffle=True,  num_workers=0)
val_loader   = DataLoader(val_torch_dataset,   batch_size=32, shuffle=False, num_workers=0)
test_loader  = DataLoader(test_torch_dataset,  batch_size=32, shuffle=False, num_workers=0)

# Report the number of batches per epoch
print("Số batch train:", len(train_loader), "Số batch val:", len(val_loader))

Số batch train: 1116 Số batch val: 124


# IV. XÂY DỰNG CNN + MLP = FUSION MULTI-HEAD

In [None]:
# Number of classes per task (trailing values are those printed by the recorded run below)
num_gender         = len(gender_map)         # 5
num_master         = len(master_category_map)# 7
num_usage          = len(usage_map)          # 8
num_subcategory    = len(sub_category_map)   # 45
num_article        = len(article_type_map)   # 141
num_basecolour     = len(base_colour_map)    # 46
num_brand          = len(brand_map)          # 817 (depends on the dataset)

# Or print them to be sure:
print("num_gender", num_gender)
print("num_master", num_master)
print("num_usage",  num_usage)
print("num_subcategory", num_subcategory)
print("num_article", num_article)
print("num_basecolour", num_basecolour)
print("num_brand", num_brand)

num_gender 5
num_master 7
num_usage 8
num_subcategory 45
num_article 141
num_basecolour 46
num_brand 817


In [None]:
class MultiModalResNet(nn.Module):
    """
    CNN for the image + MLP for the numeric features (season, year, price_in_vnd).
    The two feature vectors are concatenated and fed to 7 linear heads, one per task.

    Args:
        num_gender, num_master, num_usage, num_sub, num_article, num_base, num_brand:
            number of classes of the corresponding head.
        pretrained: load ImageNet weights into the ResNet18 backbone when True.
        mlp_hidden: width of the two MLP layers applied to the numeric features.

    forward(images, numeric_data) returns a dict of logits keyed by
    "gender", "masterCategory", "usage", "subCategory", "articleType",
    "baseColour" and "brand".
    """
    def __init__(self,
                 num_gender,
                 num_master,
                 num_usage,
                 num_sub,
                 num_article,
                 num_base,
                 num_brand,
                 pretrained=True,
                 mlp_hidden=32):
        super().__init__()

        # 1) CNN backbone (ResNet18)
        #    Newer torchvision releases deprecate `pretrained=` in favour of
        #    `weights=`; fall back to the legacy argument when the weights
        #    enum does not exist in the installed version.
        weights_enum = getattr(models, "ResNet18_Weights", None)
        if weights_enum is not None:
            weights = weights_enum.IMAGENET1K_V1 if pretrained else None
            self.backbone = models.resnet18(weights=weights)
        else:
            self.backbone = models.resnet18(pretrained=pretrained)
        backbone_out = self.backbone.fc.in_features
        # Replace the final FC with Identity to get the feature vector [B, backbone_out]
        self.backbone.fc = nn.Identity()

        # 2) MLP for the numeric features
        #    3 inputs: (season, year, price_in_vnd) => mlp_hidden dimensions
        self.mlp = nn.Sequential(
            nn.Linear(3, mlp_hidden),
            nn.ReLU(),
            nn.Linear(mlp_hidden, mlp_hidden),
            nn.ReLU()
        )

        # 3) 7 fully-connected heads on top of the concatenated (CNN + MLP) vector.
        #    Attribute names are kept stable so saved state_dicts keep loading.
        fused_dim = backbone_out + mlp_hidden
        self.head_gender   = nn.Linear(fused_dim, num_gender)
        self.head_master   = nn.Linear(fused_dim, num_master)
        self.head_usage    = nn.Linear(fused_dim, num_usage)
        self.head_sub      = nn.Linear(fused_dim, num_sub)
        self.head_article  = nn.Linear(fused_dim, num_article)
        self.head_base     = nn.Linear(fused_dim, num_base)
        self.head_brand    = nn.Linear(fused_dim, num_brand)

    def forward(self, images, numeric_data):
        # CNN features from the images
        cnn_feats = self.backbone(images)  # shape [B, backbone_out]
        # MLP features from the numeric data
        mlp_feats = self.mlp(numeric_data) # shape [B, mlp_hidden]

        # Fusion
        fused = torch.cat([cnn_feats, mlp_feats], dim=1)  # [B, backbone_out + mlp_hidden]

        # 7 heads => dict of logits keyed by task name
        return {
            "gender":         self.head_gender(fused),
            "masterCategory": self.head_master(fused),
            "usage":          self.head_usage(fused),
            "subCategory":    self.head_sub(fused),
            "articleType":    self.head_article(fused),
            "baseColour":     self.head_base(fused),
            "brand":          self.head_brand(fused)
        }

In [None]:
# Pick the device for the model: GPU when CUDA is available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Instantiate the multi-task model with one head size per label mapping and move it to `device`
model_cnn = MultiModalResNet(
    num_gender=num_gender,
    num_master=num_master,
    num_usage=num_usage,
    num_sub=num_subcategory,
    num_article=num_article,
    num_base=num_basecolour,
    num_brand=num_brand,
    pretrained=True,
    mlp_hidden=32
).to(device)



# V. TRAINING LOOP

In [None]:
# Adam over all model parameters; StepLR multiplies the learning rate by 0.1 every 5 scheduler steps
optimizer = optim.Adam(model_cnn.parameters(), lr=1e-4)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

In [None]:
def multi_task_loss(outputs, labels):
    """Sum the cross-entropy losses of every task present in `labels`.

    Args:
        outputs: dict of logits keyed by task name.
        labels: dict of target class indices keyed by task name; its keys
            decide which entries of `outputs` are used.

    Returns:
        The unweighted sum of the per-task losses (0 when `labels` is empty).
    """
    return sum(
        F.cross_entropy(outputs[task], target)
        for task, target in labels.items()
    )

In [None]:
def train_one_epoch(model, dataloader, optimizer):
    """Run one training pass over `dataloader` and return the mean batch loss.

    Batches are moved to the module-level `device`; the loss is the summed
    multi-task cross-entropy from multi_task_loss.
    """
    model.train()
    running_loss = 0
    for images, numeric_data, labels_dict in dataloader:
        images, numeric_data = images.to(device), numeric_data.to(device)
        labels_dict = {task: target.to(device) for task, target in labels_dict.items()}

        optimizer.zero_grad()
        loss = multi_task_loss(model(images, numeric_data), labels_dict)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    # Average over the number of batches
    return running_loss / len(dataloader)

In [None]:
def evaluate(model, dataloader):
    """Evaluate `model` on `dataloader` without gradient tracking.

    Returns:
        (avg_loss, accuracies): the mean batch loss (summed multi-task
        cross-entropy averaged over the number of batches) and a dict with
        the accuracy of each of the 7 tasks over all samples.
    """
    model.eval()
    tasks = ["gender", "masterCategory", "usage", "subCategory", "articleType", "baseColour", "brand"]
    correct_counts = dict.fromkeys(tasks, 0)
    loss_sum = 0
    total_samples = 0

    with torch.no_grad():
        for images, numeric_data, labels_dict in dataloader:
            images = images.to(device)
            numeric_data = numeric_data.to(device)
            labels_dict = {task: target.to(device) for task, target in labels_dict.items()}

            outputs = model(images, numeric_data)
            loss_sum += multi_task_loss(outputs, labels_dict).item()

            total_samples += images.size(0)
            for task, target in labels_dict.items():
                # Predicted class = index of the largest logit
                hits = outputs[task].argmax(dim=1) == target
                correct_counts[task] += hits.sum().item()

    avg_loss = loss_sum / len(dataloader)
    accuracies = {
        task: n_correct / float(total_samples)
        for task, n_correct in correct_counts.items()
    }
    return avg_loss, accuracies

In [None]:
# for i in range(len(train_torch_dataset)):
#     try:
#         sample = train_torch_dataset[i]
#         # sample = (image, numeric_features, labels_dict)
#         image_shape = sample[0].size()
#         numeric_shape = sample[1].size()
#         print(f"Sample {i} OK => Image shape: {image_shape}, numeric shape: {numeric_shape}")
#     except Exception as e:
#         print(f"Sample {i} LỖI =>", e)
#         break


In [None]:
# Train: one training pass and one validation pass per epoch
num_epochs = 10
for epoch in range(num_epochs):
    train_loss = train_one_epoch(model_cnn, train_loader, optimizer)
    val_loss, val_accs = evaluate(model_cnn, val_loader)
    # The learning-rate scheduler is stepped once per epoch, after validation
    scheduler.step()

    # Report the losses and the per-task validation accuracy
    print(f"Epoch [{epoch+1}/{num_epochs}]")
    print(f"   Train Loss: {train_loss:.4f}")
    print(f"   Val   Loss: {val_loss:.4f}")
    for task, acc in val_accs.items():
        print(f"       Val {task} Acc: {acc*100:.2f}%")

Epoch [1/10]
   Train Loss: 7.6668
   Val   Loss: 5.5468
       Val gender Acc: 90.22%
       Val masterCategory Acc: 99.07%
       Val usage Acc: 90.52%
       Val subCategory Acc: 94.50%
       Val articleType Acc: 83.29%
       Val baseColour Acc: 64.31%
       Val brand Acc: 30.70%
Epoch [2/10]
   Train Loss: 4.9968
   Val   Loss: 4.7316
       Val gender Acc: 92.24%
       Val masterCategory Acc: 99.29%
       Val usage Acc: 91.56%
       Val subCategory Acc: 95.64%
       Val articleType Acc: 86.54%
       Val baseColour Acc: 67.00%
       Val brand Acc: 37.84%
Epoch [3/10]
   Train Loss: 4.1019
   Val   Loss: 4.3482
       Val gender Acc: 91.50%
       Val masterCategory Acc: 99.40%
       Val usage Acc: 91.56%
       Val subCategory Acc: 95.84%
       Val articleType Acc: 87.42%
       Val baseColour Acc: 68.11%
       Val brand Acc: 45.10%
Epoch [4/10]
   Train Loss: 3.4687
   Val   Loss: 4.1041
       Val gender Acc: 92.97%
       Val masterCategory Acc: 99.37%
       Val usa

# VI. ĐÁNH GIÁ TRÊN TEST

In [None]:
# Final evaluation on the held-out test split, reported per task
test_loss, test_accs = evaluate(model_cnn, test_loader)
print(f"[Test] Loss: {test_loss:.4f}")
for task, acc in test_accs.items():
    print(f"[Test] {task} Accuracy = {acc*100:.2f}%")

[Test] Loss: 3.4597
[Test] gender Accuracy = 93.06%
[Test] masterCategory Accuracy = 99.66%
[Test] usage Accuracy = 92.92%
[Test] subCategory Accuracy = 97.19%
[Test] articleType Accuracy = 89.36%
[Test] baseColour Accuracy = 71.14%
[Test] brand Accuracy = 59.64%


# VII. LƯU MODEL

In [None]:
# Save only the weights (state_dict); loading them requires rebuilding MultiModalResNet with the same head sizes
torch.save(model_cnn.state_dict(), "cnn_plus_mlp_multitask_model.pt")
print("Model weights saved to cnn_plus_mlp_multitask_model.pt")

Model weights saved to cnn_plus_mlp_multitask_model.pt


In [None]:
def find_best_pdname(
    df_all,
    pred_brand_str,
    pred_gender_str,
    pred_usage_str,
    pred_base_str,
    pred_article_str,
    pred_sub_str,
    pred_master_str
):
    """
    Find the row of df_all whose 7 attribute columns (brand, gender, usage,
    baseColour, articleType, subCategory, masterCategory) agree with the most
    predicted values, and return its productDisplayName.

    Args:
        df_all: DataFrame holding the 7 attribute columns and "productDisplayName".
        pred_*_str: predicted value for the corresponding column.

    Returns:
        (best_pdname, best_score): the display name of the best row and the
        number of matching attributes (0..7). Ties go to the first such row.
        (None, -1) is returned when df_all has no rows.
    """
    if len(df_all) == 0:
        return None, -1

    predicted = {
        "brand": pred_brand_str,
        "gender": pred_gender_str,
        "usage": pred_usage_str,
        "baseColour": pred_base_str,
        "articleType": pred_article_str,
        "subCategory": pred_sub_str,
        "masterCategory": pred_master_str,
    }

    # One vectorized comparison per column instead of a Python loop over rows
    scores = sum(
        (df_all[column] == value).astype(int)
        for column, value in predicted.items()
    )

    # argmax returns the first position holding the maximum, which keeps the
    # "first best row wins" behaviour of a strict `>` comparison
    best_pos = int(scores.to_numpy().argmax())
    best_pdname = df_all["productDisplayName"].iloc[best_pos]
    best_score = int(scores.iloc[best_pos])
    return best_pdname, best_score


In [None]:
# Lookup table: every product of the original dataset (image column dropped),
# plus the brand string extracted the same way as for brand_map.
df_all = train_dataset_org.remove_columns(["image"]).to_pandas()
df_all["brand"] = [
    extract_brand(name, gender)
    for name, gender in zip(df_all["productDisplayName"], df_all["gender"])
]

# Predict the 7 attribute ids for one test sample (index 0; change as needed).
model_cnn.eval()
sample_image, sample_numeric, _ = test_torch_dataset[0]
with torch.no_grad():
    sample_outputs = model_cnn(
        sample_image.unsqueeze(0).to(device),    # add the batch dimension
        sample_numeric.unsqueeze(0).to(device),
    )
pred_ids = {task: int(logits.argmax(dim=1).item()) for task, logits in sample_outputs.items()}

pred_brand_id   = pred_ids["brand"]
pred_gender_id  = pred_ids["gender"]
pred_usage_id   = pred_ids["usage"]
pred_base_id    = pred_ids["baseColour"]
pred_article_id = pred_ids["articleType"]
pred_sub_id     = pred_ids["subCategory"]
pred_master_id  = pred_ids["masterCategory"]

# Decode the ids back to strings with the inverse mappings built earlier
# (the inverse maps are named inv_article_map / inv_sub_cat_map).
pred_brand_str  = inv_brand_map[pred_brand_id]
pred_gender_str = inv_gender_map[pred_gender_id]
pred_usage_str  = inv_usage_map[pred_usage_id]
pred_base_str   = inv_base_colour_map[pred_base_id]
pred_article_str= inv_article_map[pred_article_id]
pred_sub_str    = inv_sub_cat_map[pred_sub_id]
pred_master_str = inv_master_map[pred_master_id]

pdname, score = find_best_pdname(
    df_all,
    pred_brand_str,
    pred_gender_str,
    pred_usage_str,
    pred_base_str,
    pred_article_str,
    pred_sub_str,
    pred_master_str
)

print("Best matched productDisplayName:", pdname)
print("Match score =", score)
