## Import dataset

In [None]:
# Remote repository holding the images and the temporary checkout location.
GITHUB_REPO = "https://github.com/fruits-360/fruits-360-100x100"
CLONE_DIR = "dataset/fruits-360-100x100"

# Local layout the rest of the notebook reads from: <ROOT_DIR>/Training and <ROOT_DIR>/Test.
ROOT_DIR = "dataset/fruit360"
TRAIN_DIR, TEST_DIR = (os.path.join(ROOT_DIR, split) for split in ("Training", "Test"))

def download_dataset():
    """Clone the dataset repository and move its splits under ROOT_DIR.

    The repository at GITHUB_REPO is cloned into CLONE_DIR, its ``Training``
    and ``Test`` folders are moved to TRAIN_DIR and TEST_DIR, and the
    temporary checkout is removed afterwards, even when a step fails.

    Raises:
        subprocess.CalledProcessError: if ``git clone`` exits with an error.
        FileNotFoundError: if the ``git`` executable cannot be found.
    """
    os.makedirs("dataset", exist_ok=True)
    # A checkout left behind by an interrupted run would make `git clone`
    # refuse to write into the existing directory, so remove it first.
    shutil.rmtree(CLONE_DIR, ignore_errors=True)
    try:
        # Only the working tree is needed; a shallow clone skips the history.
        subprocess.run(
            ["git", "clone", "--depth", "1", GITHUB_REPO, CLONE_DIR],
            check=True,
        )
        os.makedirs(ROOT_DIR, exist_ok=True)
        shutil.move(os.path.join(CLONE_DIR, "Training"), TRAIN_DIR)
        shutil.move(os.path.join(CLONE_DIR, "Test"), TEST_DIR)
    finally:
        # Never leave the temporary checkout around, whatever happened above.
        shutil.rmtree(CLONE_DIR, ignore_errors=True)

# Download only when the dataset root folder is missing.
if not os.path.exists(ROOT_DIR):
    download_dataset()

# Explicit errors instead of `assert`: asserts are stripped under `python -O`
# and give no hint about which folder is missing.
for _split_dir in (TRAIN_DIR, TEST_DIR):
    if not os.path.exists(_split_dir):
        raise FileNotFoundError(f"Dataset folder not found: {_split_dir}")

print(f"Train dir: {TRAIN_DIR}")
print(f"Test dir: {TEST_DIR}")

# Dataset

In [None]:
class FruitFolderDataset(Dataset):
    """
    Dataset to load and preprocess the fruit images from folder structure.

    Every sub-directory of ``root_dir`` is one class and every image file
    inside it is one sample. Class folders and image files are visited in
    sorted order, so sample indices do not depend on the order in which the
    filesystem happens to list entries.

    root_dir:  path to Training/ or Test/
    transform: optional callable applied to the RGB PIL image in
               ``__getitem__``; when None the PIL image is returned as is
    variety:   False -> macro label, the first word of the folder name
                        (Apple, Banana, ...)
               True  -> fine-grained label, the full folder name
                        (Apple Braeburn, ...)

    Attributes set by ``__init__``:
        samples:      list of (image_path, label_string) tuples
        labels:       sorted list of the distinct label strings
        label_to_idx: label string -> integer index into ``labels``
        idx_to_label: inverse of ``label_to_idx``
    """

    # File extensions (compared case-insensitively) that count as images.
    IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")

    def __init__(self, root_dir, transform=None, variety=False):
        self.root_dir = root_dir
        self.transform = transform
        self.variety = variety
        self.samples = []

        for class_name in sorted(os.listdir(root_dir)):
            class_dir = os.path.join(root_dir, class_name)
            if not os.path.isdir(class_dir):
                # Stray files next to the class folders are ignored.
                continue

            label_str = class_name if variety else class_name.split()[0]

            # os.listdir() returns entries in arbitrary order; sorting keeps
            # sample indices stable across machines and runs.
            for img_name in sorted(os.listdir(class_dir)):
                if img_name.lower().endswith(self.IMAGE_EXTENSIONS):
                    self.samples.append((os.path.join(class_dir, img_name), label_str))

        self.labels = sorted({lbl for _, lbl in self.samples})
        self.label_to_idx = {lbl: i for i, lbl in enumerate(self.labels)}
        self.idx_to_label = {i: lbl for lbl, i in self.label_to_idx.items()}

        print(f"{os.path.basename(root_dir)} -> {len(self.samples)} images, {len(self.labels)} classes")

    def __len__(self):
        """Return the number of samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(image, label_index)`` for the sample at position ``idx``."""
        img_path, label_str = self.samples[idx]

        # convert() returns a new, fully loaded image, so the underlying file
        # can be closed as soon as the conversion is done.
        with Image.open(img_path) as img_file:
            image = img_file.convert("RGB")

        if self.transform is not None:
            img = self.transform(image)
        else:
            img = image

        label_idx = self.label_to_idx[label_str]
        return img, label_idx


# VARS

In [None]:
# VARS
size = 8             # images are resized to size x size pixels
random_state = 42    # seed for the dataset splits and subsampling
# The cells below read the seed as RANDOM_STATE; define it here so they do not
# fail with a NameError. The lowercase name is kept for existing references.
RANDOM_STATE = random_state
batch = 100          # batch size used by every DataLoader

## 1. Full dataset (macro labels)

In [None]:
# Preprocessing applied to both the Training and Test images:
# resize to size x size, then convert to a tensor.
val_transform = T.Compose([T.Resize((size, size)), T.ToTensor()])

full_train_dataset = FruitFolderDataset(TRAIN_DIR, transform=val_transform, variety=False)
test_dataset = FruitFolderDataset(TEST_DIR, transform=val_transform, variety=False)

# 70% of the Training folder is used for training, the remainder for validation.
train_size = int(0.7 * len(full_train_dataset))
val_size = len(full_train_dataset) - train_size

train_dataset, val_dataset = random_split(
    dataset=full_train_dataset,
    lengths=[train_size, val_size],
    generator=torch.Generator().manual_seed(RANDOM_STATE),
)

for _title, _count in (
    ("Train size:", len(train_dataset)),
    ("Validation size:", len(val_dataset)),
    ("Test size:", len(test_dataset)),
    ("Classes:", len(full_train_dataset.label_to_idx)),
):
    print(_title, _count)

In [None]:
# Only the training loader reshuffles its samples; validation and test keep a fixed order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch, shuffle=False)

### 1.1 Full dataset (fine-grained labels)

In [None]:
# variety=True keeps the full class-folder name as the label.
full_train_dataset_fg = FruitFolderDataset(TRAIN_DIR, transform=val_transform, variety=True)
test_dataset_fg = FruitFolderDataset(TEST_DIR, transform=val_transform, variety=True)

# 70/30 train/validation split of the Training folder.
train_size_fg = int(0.7 * len(full_train_dataset_fg))
val_size_fg = len(full_train_dataset_fg) - train_size_fg

train_dataset_fg, val_dataset_fg = random_split(
    dataset=full_train_dataset_fg,
    lengths=[train_size_fg, val_size_fg],
    generator=torch.Generator().manual_seed(RANDOM_STATE),
)

# Only the training loader reshuffles its samples.
train_loader_fg = DataLoader(dataset=train_dataset_fg, batch_size=batch, shuffle=True)
val_loader_fg = DataLoader(dataset=val_dataset_fg, batch_size=batch, shuffle=False)
test_loader_fg = DataLoader(dataset=test_dataset_fg, batch_size=batch, shuffle=False)

_fg_summary = [
    "Fine-grained -> Train:", len(train_dataset_fg),
    "Val:", len(val_dataset_fg),
    "Test:", len(test_dataset_fg),
    "Classes:", len(full_train_dataset_fg.label_to_idx),
]
print(*_fg_summary)


## 2. DOMAIN FILTERING

In [None]:
# 2. DOMAIN FILTERING (fruit only, vegetables excluded)

# Macro labels treated as vegetables and therefore dropped from the fruit-only datasets.
vegetables = [
    'Beans', 'Beetroot', 'Cabbage', 'Carrot', 'Cauliflower',
    'Corn', 'Cucumber', 'Eggplant', 'Ginger', 'Kohlrabi',
    'Onion', 'Pepper', 'Potato', 'Tomato', 'Zucchini',
]

def filter_labels(dataset, drop_labels):
    """Return a copy of ``dataset`` without the samples labelled in ``drop_labels``.

    Args:
        dataset: object exposing ``samples`` as a list of
            ``(image_path, label_string)`` tuples, e.g. a FruitFolderDataset.
        drop_labels: iterable of label strings to exclude; a sample is dropped
            when its label string is equal to one of them.

    Returns:
        A shallow copy of ``dataset`` (same type, root_dir, transform and
        variety) whose ``samples``, ``labels``, ``label_to_idx`` and
        ``idx_to_label`` describe only the kept samples. ``dataset`` itself is
        not modified.
    """
    import copy  # local import so this cell does not depend on the import cell

    excluded = set(drop_labels)
    kept_samples = [sample for sample in dataset.samples if sample[1] not in excluded]
    print(f"Kept {len(kept_samples)} / {len(dataset.samples)} samples")

    # Shallow-copy the dataset instead of building a new FruitFolderDataset:
    # the constructor would rescan the whole folder from disk, print a summary
    # for the unfiltered data, and reset ``variety`` to False.
    filtered = copy.copy(dataset)
    filtered.samples = kept_samples
    # Label indices are rebuilt so they stay contiguous after the removal.
    filtered.labels = sorted({lbl for _, lbl in kept_samples})
    filtered.label_to_idx = {lbl: i for i, lbl in enumerate(filtered.labels)}
    filtered.idx_to_label = {i: lbl for lbl, i in filtered.label_to_idx.items()}
    return filtered

# Load each split with macro labels, then drop every sample labelled as a vegetable.
full_train_dataset_fruit = filter_labels(
    FruitFolderDataset(TRAIN_DIR, transform=val_transform, variety=False),
    vegetables,
)

test_dataset_fruit = filter_labels(
    FruitFolderDataset(TEST_DIR, transform=val_transform, variety=False),
    vegetables,
)

# 70/30 train/validation split of the filtered Training data.
train_size_fruit = int(0.7 * len(full_train_dataset_fruit))
val_size_fruit = len(full_train_dataset_fruit) - train_size_fruit

train_dataset_fruit, val_dataset_fruit = random_split(
    dataset=full_train_dataset_fruit,
    lengths=[train_size_fruit, val_size_fruit],
    generator=torch.Generator().manual_seed(RANDOM_STATE),
)

for _title, _count in (
    ("Train size:", len(train_dataset_fruit)),
    ("Validation size:", len(val_dataset_fruit)),
    ("Test size:", len(test_dataset_fruit)),
    ("Fruit-only classes:", len(full_train_dataset_fruit.label_to_idx)),
):
    print(_title, _count)

# Only the training loader reshuffles its samples.
train_loader_fruit = DataLoader(dataset=train_dataset_fruit, batch_size=batch, shuffle=True)
val_loader_fruit = DataLoader(dataset=val_dataset_fruit, batch_size=batch, shuffle=False)
test_loader_fruit = DataLoader(dataset=test_dataset_fruit, batch_size=batch, shuffle=False)


## 3. LOW SAMPLES

In [None]:
def create_subset_folder(dataset, subset_type='none',
                         samples_per_label=None,
                         subset_fraction=None,
                         total_samples=None,
                         random_state=RANDOM_STATE):
    """Return a randomly subsampled copy of ``dataset``.

    Args:
        dataset: object exposing ``samples`` as a list of
            ``(image_path, label_string)`` tuples, e.g. a FruitFolderDataset.
        subset_type: selection strategy.
            'per_label' -> up to ``samples_per_label`` random samples per label
            'fraction'  -> a random ``subset_fraction`` share of all samples
            'total'     -> up to ``total_samples`` random samples overall
            Any other value, or a strategy whose size argument is None, keeps
            every sample.
        samples_per_label: per-label cap for 'per_label'.
        subset_fraction: share of the dataset for 'fraction'.
        total_samples: overall cap for 'total'.
        random_state: seed of the torch generator that draws the permutations.

    Returns:
        A shallow copy of ``dataset`` (same type, root_dir, transform and
        variety) whose ``samples``, ``labels``, ``label_to_idx`` and
        ``idx_to_label`` describe only the selected samples. ``dataset``
        itself is not modified. Negative sizes select nothing.
    """
    import copy  # local import so this cell does not depend on the import cell

    rng = torch.Generator().manual_seed(random_state)
    n_samples = len(dataset.samples)

    def _random_prefix(population, take):
        # First `take` positions of a random permutation of range(population);
        # clamping at 0 keeps a negative size from slicing off the wrong end.
        return torch.randperm(population, generator=rng)[:max(0, take)].tolist()

    if subset_type == 'per_label' and samples_per_label is not None:
        # Group sample positions by label, in order of first appearance.
        label_to_indices = {}
        for idx, (_, label_str) in enumerate(dataset.samples):
            label_to_indices.setdefault(label_str, []).append(idx)

        selected_indices = []
        for idxs in label_to_indices.values():
            take = min(len(idxs), samples_per_label)
            selected_indices.extend(idxs[pos] for pos in _random_prefix(len(idxs), take))
        print(f"Subset with {samples_per_label} samples per label")

    elif subset_type == 'fraction' and subset_fraction is not None:
        selected_indices = _random_prefix(n_samples, int(n_samples * subset_fraction))
        print(f"Subset with {subset_fraction*100}% of the dataset")

    elif subset_type == 'total' and total_samples is not None:
        selected_indices = _random_prefix(n_samples, min(n_samples, total_samples))
        print(f"Subset with {total_samples} total samples")

    else:
        print("No subset applied")
        selected_indices = list(range(n_samples))

    # Shallow-copy the dataset instead of building a new FruitFolderDataset:
    # the constructor would rescan the whole folder from disk, print a summary
    # for the full data, and reset ``variety`` to False.
    subset = copy.copy(dataset)
    subset.samples = [dataset.samples[i] for i in selected_indices]
    subset.labels = sorted({lbl for _, lbl in subset.samples})
    subset.label_to_idx = {lbl: i for i, lbl in enumerate(subset.labels)}
    subset.idx_to_label = {i: lbl for lbl, i in subset.label_to_idx.items()}

    print(f"\nFinal subset: {len(subset.samples)} samples")
    print(f"Total labels: {len(subset.labels)}")
    return subset

# Example: 500 images per label on the macro dataset (all categories).
base_dataset = FruitFolderDataset(TRAIN_DIR, transform=val_transform, variety=False)
subset_dataset = create_subset_folder(
    base_dataset, subset_type='per_label', samples_per_label=500, random_state=RANDOM_STATE
)

# 70/30 train/validation split of the subsampled Training data.
train_size_sub = int(0.7 * len(subset_dataset))
val_size_sub = len(subset_dataset) - train_size_sub

train_dataset_sub, val_dataset_sub = random_split(
    dataset=subset_dataset,
    lengths=[train_size_sub, val_size_sub],
    generator=torch.Generator().manual_seed(RANDOM_STATE),
)

# The Test folder is loaded in full; only the training data is subsampled.
test_dataset_sub = FruitFolderDataset(TEST_DIR, transform=val_transform, variety=False)

for _title, _count in (
    ("\nTrain size:", len(train_dataset_sub)),
    ("Validation size:", len(val_dataset_sub)),
    ("Test size:", len(test_dataset_sub)),
):
    print(_title, _count)

# Only the training loader reshuffles its samples.
train_loader_sub = DataLoader(dataset=train_dataset_sub, batch_size=batch, shuffle=True)
val_loader_sub = DataLoader(dataset=val_dataset_sub, batch_size=batch, shuffle=False)
test_loader_sub = DataLoader(dataset=test_dataset_sub, batch_size=batch, shuffle=False)
