<a href="https://colab.research.google.com/github/RaphaelCarvalh/BootCampAVANTI_machine_learning/blob/ativ04-et02-analise-dataset/Et01_analise_dataset_arvore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Projeto Clothing Co-Parsing - Etapa 1 - Notebook: Extração de Features + Árvore de Decisão

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import cv2

from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt

# 1. Carregar dataset já limpo



In [7]:
# Load the already-cleaned dataset from the working directory.
df = pd.read_csv("df_clean.csv")
print(f"Total de imagens (antes): {len(df)}")

# Keep only the rows that belong to the 5 most frequent labels.
label_counts = df["label"].value_counts()
top5_classes = label_counts.nlargest(5).index
in_top5 = df["label"].isin(top5_classes)
df_top5 = df[in_top5]
print("Classes selecionadas:", list(top5_classes))
print(f"Total de imagens (top 5 classes): {len(df_top5)}")

FileNotFoundError: [Errno 2] No such file or directory: 'df_clean.csv'

# 2. Dataset PyTorch



In [None]:
class ClothingDataset(Dataset):
    """Map-style dataset that reads images from disk with OpenCV.

    Args:
        data: Table-like object exposing ``.values`` (e.g. a pandas
            DataFrame). Column 0 is read as the image path and column 1
            as the label; columns are accessed by position, not by name.
        transform: Optional callable applied to the RGB image array.
            When omitted, the raw array returned by OpenCV (converted to
            RGB) is yielded unchanged.
    """

    def __init__(self, data, transform=None):
        self.data = data.values
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, label)``.

        Raises:
            FileNotFoundError: if the image file is missing or cannot be
                decoded by OpenCV.
        """
        img_path, label = self.data[idx, 0], self.data[idx, 1]
        img = cv2.imread(img_path)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than inside cvtColor.
        if img is None:
            raise FileNotFoundError(
                f"Image is missing or unreadable: {img_path!r}"
            )
        # OpenCV decodes to BGR channel order; convert to RGB.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        if self.transform:
            img = self.transform(img)
        return img, label

# Preprocessing: convert the RGB array to PIL, resize to IMG_SIZE x IMG_SIZE,
# convert to a tensor and normalize each channel with the mean/std below
# (the values commonly used with ImageNet-pretrained torchvision models).
IMG_SIZE = 224
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# Build the dataset and loader on a smaller sample to keep the study fast.
SAMPLE_SIZE = 500
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement), so cap the request at the rows available.
subset_df = df_top5.sample(n=min(SAMPLE_SIZE, len(df_top5)), random_state=42)
dataset = ClothingDataset(subset_df, transform)
# shuffle=False keeps the batch order fixed across runs.
loader = DataLoader(dataset, batch_size=16, shuffle=False)



# 3. Carregar modelo pré-treinado como extrator

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# ResNet-18 with pretrained weights, used only as a feature extractor:
# swapping the final fully connected layer for Identity makes the model
# return the features that would have been fed to the classifier.
backbone = models.resnet18(weights="IMAGENET1K_V1")
backbone.fc = nn.Identity()
model = backbone.to(device)
model.eval()


Treino: 1678 | Validação: 210 | Teste: 210


# 4. Extrair embeddings

In [None]:
features, labels = [], []

# Inference only: disable gradient tracking while running the extractor.
with torch.no_grad():
    for imgs, lbls in loader:
        imgs = imgs.to(device)
        outputs = model(imgs)
        # Move each batch back to the CPU so it can be converted to NumPy later.
        features.append(outputs.cpu())
        # The DataLoader's default collation turns numeric labels into a
        # tensor; unpack it so `labels` holds plain Python values rather
        # than 0-d tensors. String labels arrive as a sequence and are
        # extended as-is.
        if isinstance(lbls, torch.Tensor):
            lbls = lbls.tolist()
        labels.extend(lbls)

# One row per image: concatenate the per-batch outputs along the first axis.
X = torch.cat(features).numpy()
y = labels

print(f"Features extraídas: {X.shape}, Labels: {len(y)}")

Train batches: 53, Val batches: 7, Test batches: 7


# 5. Treinar árvore de decisão

In [None]:
# Shallow tree (depth 3) with a fixed seed, fitted on the extracted features.
tree_params = {"max_depth": 3, "random_state": 42}
clf = DecisionTreeClassifier(**tree_params)
clf.fit(X, y)


# 6. Plotar árvore

In [None]:
# Synthetic names f0, f1, ... for the feature columns, and the fitted
# class values rendered as strings for the leaf annotations.
feature_labels = [f"f{i}" for i in range(X.shape[1])]
class_labels = list(map(str, clf.classes_))

plt.figure(figsize=(20, 10))
plot_tree(
    clf,
    max_depth=3,
    feature_names=feature_labels,
    class_names=class_labels,
    filled=True,
)
plt.show()