# How to do majority voting and filtering

So I get stuck on how to filter out non relevant concepts, turned out my filter only had 84 classes left while the paper clearly state 112 classes. 


# Data loading 

In [2]:
from pathlib import Path
from torch.utils.data import Dataset
from torchvision import transforms
from torchvision.io import read_image
import torch
import numpy as np
import pandas as pd

In [3]:
concept_names = []
consept_name_path=r"data\CUB_200_2011\atributes.txt"

with open(consept_name_path, 'r') as f:
    for line in f:
        concept_names.append(line.split()[1])

concept_names = np.array(concept_names)
concept_names

array(['has_bill_shape::curved_(up_or_down)', 'has_bill_shape::dagger',
       'has_bill_shape::hooked', 'has_bill_shape::needle',
       'has_bill_shape::hooked_seabird', 'has_bill_shape::spatulate',
       'has_bill_shape::all-purpose', 'has_bill_shape::cone',
       'has_bill_shape::specialized', 'has_wing_color::blue',
       'has_wing_color::brown', 'has_wing_color::iridescent',
       'has_wing_color::purple', 'has_wing_color::rufous',
       'has_wing_color::grey', 'has_wing_color::yellow',
       'has_wing_color::olive', 'has_wing_color::green',
       'has_wing_color::pink', 'has_wing_color::orange',
       'has_wing_color::black', 'has_wing_color::white',
       'has_wing_color::red', 'has_wing_color::buff',
       'has_upperparts_color::blue', 'has_upperparts_color::brown',
       'has_upperparts_color::iridescent', 'has_upperparts_color::purple',
       'has_upperparts_color::rufous', 'has_upperparts_color::grey',
       'has_upperparts_color::yellow', 'has_upperparts_color

In [4]:
class CUB(Dataset):
    def __init__(self, data_dir=Path('data/CUB_200_2011'), train=True, majority_voting=False, concept_threshold=0):
        super(CUB, self).__init__()

        self.data_dir = data_dir
        self.train = train

        # Hardcode the transformation
        self.transform = transforms.Compose([
            transforms.RandomResizedCrop(299) if train else transforms.Resize((299, 299)),
            transforms.RandomHorizontalFlip() if train else transforms.Lambda(lambda x: x),
            transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1) if train else transforms.Lambda(lambda x: x),
            transforms.ConvertImageDtype(torch.float32),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

        # Load the dataset files
        self.images = np.loadtxt(data_dir / 'images.txt', dtype=str)
        self.split = np.loadtxt(data_dir / 'train_test_split.txt', dtype=int)[:,1]
        self.concepts = self.make_concept_list()
        self.original_concepts = self.concepts.copy()
        self.class_labels = np.loadtxt(data_dir / 'image_class_labels.txt', dtype=int)

        # Determine the number of classes dynamically
        self.num_classes = len(np.unique(self.class_labels[:, 1]))

        # Apply preprocessing steps if specified
        if majority_voting:
            self.concepts = self.majority_voting()

        if concept_threshold > 0:
            self.concepts, self.concepts_idx = self.filter_concepts(concept_threshold, majority_voting)
        else:
            self.concepts_idx = np.arange(self.concepts.shape[1])

        # Split the dataset into train and test sets
        if self.train:
            self.images = self.images[self.split == 1]
            self.class_labels = self.class_labels[self.split == 1]
            self.concepts = torch.tensor(self.concepts[self.split == 1], dtype=torch.float32)
        else:
            self.images = self.images[self.split == 0]
            self.class_labels = self.class_labels[self.split == 0]
            self.concepts = torch.tensor(self.concepts[self.split == 0], dtype=torch.float32)

    def make_concept_list(self):
        """
        Create a numpy array of the concepts for each image.
        
        Returns:
            numpy.ndarray: Array of concepts for all images.
        """
        concept_file = self.data_dir / 'attributes' / 'image_attribute_labels.txt'
        concepts = np.loadtxt(concept_file, dtype=int,usecols=(0, 1, 2))
        num_images = len(self.images)
        num_concepts = len(concepts) // num_images
        return concepts[:, 2].reshape(num_images, num_concepts)

    def __len__(self):
        """Return the total number of images in the dataset."""
        return len(self.images)

    def __getitem__(self, idx):
        """
        Fetch the image, concepts, and label for a given index.
        
        Args:
            idx (int): Index of the data point to fetch.
        
        Returns:
            tuple: (image, concepts, label)
        """
        image_path = self.data_dir / 'images' / self.images[idx][1]
        image = read_image(str(image_path))

        # If image is grayscale, convert to 3 channels 
        if image.shape[0] == 1:
            image = image.repeat(3, 1, 1)

        if self.transform:
            image = self.transform(image)

        concepts = self.concepts[idx]

        # Create one-hot encoding of the class label
        label = torch.zeros(self.num_classes, dtype=torch.float32)
        label[self.class_labels[idx][1] - 1] = 1.0

        return image, concepts, label
    
    def majority_voting(self):
        """
        Apply majority voting to concepts based on class labels.
        This assigns the most common concept values for each class to all instances of that class.
        
        Returns:
            numpy.ndarray: Updated concepts after majority voting.
        """
        df = pd.DataFrame(self.concepts)
        df['class'] = self.class_labels[:,1]
        majority = df.groupby('class').mean().round().values
        return majority[self.class_labels[:,1] - 1]
    
    def filter_concepts(self, threshold, majority_voting):
        """
        Filter concepts based on their prevalence in the dataset.
        
        This function assumes that majority voting has already been applied if majority_voting is True.
        
        Args:
            threshold (float): Consept with less than trheshold prevalence will be removed.
                If majority_voting is True, concepts are grouped by class and the threshold is applied to the class prevalence in each class.
                If False, the threshold is applied to the overall prevalence of each concept in each image.
            majority_voting (bool): Indicates whether majority voting has been applied.
        
        Returns:
            numpy.ndarray: Filtered concepts, where each column represents a concept that meets the threshold criteria
            numpy.ndarray: Indices of the kept concepts
        """
        if majority_voting:
            df = pd.DataFrame(self.concepts)
            df['class'] = self.class_labels[:,1]
            prevalence = df.groupby('class').mean().sum(axis=0).values
            keep_concepts = prevalence >= threshold

        else:
            prevalence = self.concepts.sum(axis=0)
            keep_concepts = prevalence > threshold

        filtered_concepts = self.concepts[:, keep_concepts]
        kept_indices = np.arange(self.concepts.shape[1])[keep_concepts]

        # Add this print statement
        print(f"Number of concepts after filtering: {filtered_concepts.shape[1]} (originally {self.concepts.shape[1]})")

        return filtered_concepts, kept_indices

In [5]:
data_path = "data/CUB_200_2011"  # Replace with the actual path to your dataset

# Create train and validation datasets
train_dataset = CUB(data_dir=Path(data_path), train=True, majority_voting=True, concept_threshold=10)


print(f"Train dataset size: {len(train_dataset)}")


# Get a sample from the dataset
sample_image, sample_concepts, sample_label = train_dataset[0]

print(f"Sample concepts shape: {sample_concepts.shape}")
concepts_idx = np.array(train_dataset.concepts_idx, dtype=int)
print(f"consept idx : {concepts_idx}")
print(f"consepts_left : {concept_names[concepts_idx]}")

Number of concepts after filtering: 82 (originally 312)
Train dataset size: 5994
Sample concepts shape: torch.Size([82])
consept idx : [  1   4   6   7  10  14  20  21  23  25  29  30  35  36  38  44  45  50
  51  53  54  59  63  69  70  75  90  91 101 111 116 117 119 126 131 132
 145 149 151 153 157 158 163 164 178 179 183 187 188 193 194 203 208 209
 211 218 220 221 235 236 240 244 249 253 254 259 260 262 268 274 277 283
 289 293 294 298 299 304 305 308 310 311]
consepts_left : ['has_bill_shape::dagger' 'has_bill_shape::hooked_seabird'
 'has_bill_shape::all-purpose' 'has_bill_shape::cone'
 'has_wing_color::brown' 'has_wing_color::grey' 'has_wing_color::black'
 'has_wing_color::white' 'has_wing_color::buff'
 'has_upperparts_color::brown' 'has_upperparts_color::grey'
 'has_upperparts_color::yellow' 'has_upperparts_color::black'
 'has_upperparts_color::white' 'has_upperparts_color::buff'
 'has_underparts_color::grey' 'has_underparts_color::yellow'
 'has_underparts_color::black' 'has_und

# Their implantation  

In [6]:
import os
import random
import pickle
import argparse
from os import listdir
from os.path import isfile, isdir, join
from collections import defaultdict as ddict
import copy

N_ATTRIBUTES = 312
N_CLASSES = 200

In [7]:
# From data_prossing.py
def extract_data(data_dir):
    cwd = os.getcwd()
    data_path = join(cwd,data_dir,'images')
    val_ratio = 0.2

    path_to_id_map = dict() #map from full image path to image id
    with open(data_path.replace('images', 'images.txt'), 'r') as f:
        for line in f:
            items = line.strip().split()
            path_to_id_map[join(data_path, items[1])] = int(items[0])
    

    attribute_labels_all = ddict(list) #map from image id to a list of attribute labels
    attribute_certainties_all = ddict(list) #map from image id to a list of attribute certainties
    attribute_uncertain_labels_all = ddict(list) #map from image id to a list of attribute labels calibrated for uncertainty
    # 1 = not visible, 2 = guessing, 3 = probably, 4 = definitely
    uncertainty_map = {1: {1: 0, 2: 0.5, 3: 0.75, 4:1}, #calibrate main label based on uncertainty label
                        0: {1: 0, 2: 0.5, 3: 0.25, 4: 0}}
    with open(join(cwd, data_dir + '/attributes/image_attribute_labels.txt'), 'r') as f:
        for line in f:
            file_idx, attribute_idx, attribute_label, attribute_certainty = line.strip().split()[:4]
            attribute_label = int(attribute_label)
            attribute_certainty = int(attribute_certainty)
            uncertain_label = uncertainty_map[attribute_label][attribute_certainty]
            attribute_labels_all[int(file_idx)].append(attribute_label)
            attribute_uncertain_labels_all[int(file_idx)].append(uncertain_label)
            attribute_certainties_all[int(file_idx)].append(attribute_certainty)

    is_train_test = dict() #map from image id to 0 / 1 (1 = train)
    with open(join(cwd, data_dir + '/train_test_split.txt'), 'r') as f:
        for line in f:
            idx, is_train = line.strip().split()
            is_train_test[int(idx)] = int(is_train)
    print("Number of train images from official train test split:", sum(list(is_train_test.values())))

    train_val_data, test_data = [], []
    train_data, val_data = [], []
    folder_list = [f for f in listdir(data_path) if isdir(join(data_path, f))]
    folder_list.sort() #sort by class index
    for i, folder in enumerate(folder_list):
        folder_path = join(data_path, folder)
        classfile_list = [cf for cf in listdir(folder_path) if (isfile(join(folder_path,cf)) and cf[0] != '.')]
        #classfile_list.sort()
        for cf in classfile_list:
            img_id = path_to_id_map[join(folder_path+'/'+cf)] #may cause bug in linux
            img_path = join(folder_path, cf)
            metadata = {'id': img_id, 'img_path': img_path, 'class_label': i,
                      'attribute_label': attribute_labels_all[img_id], 'attribute_certainty': attribute_certainties_all[img_id],
                      'uncertain_attribute_label': attribute_uncertain_labels_all[img_id]}
            if is_train_test[img_id]:
                train_val_data.append(metadata)

                val_data.append(metadata)

                train_data.append(metadata)
            else:
                test_data.append(metadata)

    random.shuffle(train_val_data)
    split = int(val_ratio * len(train_val_data))
    train_data = train_val_data[split :]
    val_data = train_val_data[: split]
    print('Size of train set:', len(train_data))
    return train_data, val_data, test_data

train_data, val_data, test_data = extract_data(r"data\CUB_200_2011")

Number of train images from official train test split: 5994
Size of train set: 4796


In [38]:
#From generating_new_dataset.py

def get_class_attributes_data(min_class_count, out_dir, modify_data_dir='', keep_instance_data=False):
    """
    Use train.pkl to aggregate attributes on class level and only keep those that are predominantly 1 for at least min_class_count classes
    Transform data in modify_data_dir file using the class attribute statistics and save the new dataset to out_dir
    If keep_instance_data is True, then retain the original values of the selected attributes. Otherwise save aggregated class level attributes
    In our paper, we set min_class_count to be 10 and only use the following 112 attributes of indices 
    [1, 4, 6, 7, 10, 14, 15, 20, 21, 23, 25, 29, 30, 35, 36, 38, 40, 44, 45, 50, 51, 53, 54, 56, 57, 59, 63, 64, 69, 70, 72, 75, 80, 84, 90, 91, \
    93, 99, 101, 106, 110, 111, 116, 117, 119, 125, 126, 131, 132, 134, 145, 149, 151, 152, 153, 157, 158, 163, 164, 168, 172, 178, 179, 181, \
    183, 187, 188, 193, 194, 196, 198, 202, 203, 208, 209, 211, 212, 213, 218, 220, 221, 225, 235, 236, 238, 239, 240, 242, 243, 244, 249, 253, \
    254, 259, 260, 262, 268, 274, 277, 283, 289, 292, 293, 294, 298, 299, 304, 305, 308, 309, 310, 311]
    """
    data = pickle.load(open(r'data\CUB_processed\unfiltered\train.pkl', 'rb'))
    #data = train_data #Don't save files for Jubitor eksample
    class_attr_count = np.zeros((N_CLASSES, N_ATTRIBUTES, 2))
    for d in data:
        class_label = d['class_label']
        certainties = d['attribute_certainty']
        for attr_idx, a in enumerate(d['attribute_label']):
            if a == 0 and certainties[attr_idx] == 1: #not visible
                continue
            class_attr_count[class_label][attr_idx][a] += 1

    class_attr_min_label = np.argmin(class_attr_count, axis=2)
    class_attr_max_label = np.argmax(class_attr_count, axis=2)
    equal_count = np.where(class_attr_min_label == class_attr_max_label) #check where 0 count = 1 count, set the corresponding class attribute label to be 1
    class_attr_max_label[equal_count] = 1

    attr_class_count = np.sum(class_attr_max_label, axis=0)
    mask = np.where(attr_class_count >= min_class_count)[0] #select attributes that are present (on a class level) in at least [min_class_count] classes
    class_attr_label_masked = class_attr_max_label[:, mask]
    if keep_instance_data:
        collapse_fn = lambda d: list(np.array(d['attribute_label'])[mask])
    else:
        collapse_fn = lambda d: list(class_attr_label_masked[d['class_label'], :])
    return mask


def create_new_dataset(out_dir, field_change, compute_fn, datasets=['train', 'val', 'test'], data_dir=''):
    """
    Generic function that given datasets stored in data_dir, modify/ add one field of the metadata in each dataset based on compute_fn
                          and save the new datasets to out_dir
    compute_fn should take in a metadata object (that includes 'img_path', 'class_label', 'attribute_label', etc.)
                          and return the updated value for field_change
    """

    data = train_data
    new_data = []
    for d in data:
        new_d = copy.deepcopy(d)
        new_value = compute_fn(d)
        if field_change in d:
            old_value = d[field_change]
            assert (type(old_value) == type(new_value))
        new_d[field_change] = new_value
        new_data.append(new_d)
    return new_data



In [41]:
mask = get_class_attributes_data(10, "data/CUB_200_2011", r"data\CUB_200_2011", keep_instance_data=False)

print(len(mask))
print(mask)
print(concept_names[mask])
all_indexes = set(range(312))
existing_indexes_set = set(mask)
missing_indexes = list(all_indexes - existing_indexes_set)
print(f"Left out {concept_names[missing_indexes]}" )

110
[  1   4   6   7  10  14  15  20  21  23  25  29  30  35  36  38  44  45
  50  51  53  54  56  57  59  63  64  69  70  72  75  80  84  90  91  93
  99 101 106 110 111 116 117 119 125 126 131 132 134 145 149 151 153 157
 158 163 164 168 172 173 178 179 181 182 183 187 188 193 194 196 202 203
 208 209 211 212 213 218 220 221 235 236 238 239 240 242 243 244 249 253
 254 259 260 262 268 274 277 283 289 292 293 294 298 299 304 305 308 309
 310 311]
['has_bill_shape::dagger' 'has_bill_shape::hooked_seabird'
 'has_bill_shape::all-purpose' 'has_bill_shape::cone'
 'has_wing_color::brown' 'has_wing_color::grey' 'has_wing_color::yellow'
 'has_wing_color::black' 'has_wing_color::white' 'has_wing_color::buff'
 'has_upperparts_color::brown' 'has_upperparts_color::grey'
 'has_upperparts_color::yellow' 'has_upperparts_color::black'
 'has_upperparts_color::white' 'has_upperparts_color::buff'
 'has_underparts_color::grey' 'has_underparts_color::yellow'
 'has_underparts_color::black' 'has_underparts_

In [10]:
(np.array(mask)-1)

array([  0,   3,   5,   6,   9,  13,  14,  19,  20,  22,  24,  28,  29,
        34,  35,  37,  39,  43,  44,  49,  50,  52,  53,  55,  56,  58,
        62,  63,  68,  69,  71,  74,  79,  83,  89,  90,  92,  98, 100,
       103, 105, 109, 110, 115, 116, 118, 124, 125, 130, 131, 133, 144,
       148, 150, 151, 152, 156, 157, 162, 163, 167, 171, 172, 177, 178,
       180, 182, 186, 187, 192, 193, 195, 201, 202, 207, 208, 210, 211,
       212, 217, 219, 220, 226, 234, 235, 237, 238, 239, 242, 243, 245,
       248, 252, 253, 258, 259, 261, 267, 272, 273, 276, 282, 288, 291,
       292, 293, 297, 298, 303, 304, 307, 309, 310], dtype=int64)

In [11]:
new_data = get_class_attributes_data(10, "data/CUB_200_2011", r"data\CUB_200_2011", keep_instance_data=True)
concepts_len = len(new_data[0]["attribute_label"])
print(concepts_len)


IndexError: invalid index to scalar variable.