In [2]:
import numpy as np
import pandas as pd

# Load the raw attribute matrix.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, which can
# execute code; keep it only for trusted files (a plain numeric array does not need it).
attributes = np.load("../aml-2025-feathers-in-focus/attributes.npy", allow_pickle=True)
print(attributes)

# Wrap the array in a DataFrame with default integer row/column labels.
# (The former bare `attr_df` line here was a mid-cell expression and displayed nothing.)
attr_df = pd.DataFrame(attributes)

# Sum all the values in each column.
attr_sums = attr_df.sum(axis=0)
print(attr_sums)

[[0.0106384  0.0106384  0.00709227 ... 0.00918617 0.02526198 0.02066889]
 [0.         0.01133243 0.00944369 ... 0.00266542 0.02132333 0.05863916]
 [0.         0.         0.00742474 ... 0.         0.00885258 0.01770516]
 ...
 [0.         0.00334966 0.         ... 0.00556558 0.         0.15027069]
 [0.         0.11184146 0.         ... 0.08207164 0.05836206 0.01823814]
 [0.04378019 0.02814441 0.         ... 0.06022509 0.07695428 0.06189801]]
0       1.105248
1       6.394052
2       0.793697
3       1.312837
4       3.273524
         ...    
307     6.398792
308    16.072625
309     4.119036
310    12.399692
311    16.278639
Length: 312, dtype: float64


In [3]:
# Load the bird class names so their keys can be reformatted by the helpers below.
import numpy as np
# .item() unwraps the single object stored in the .npy file; the code below treats the
# result as a dict (it calls .items() on it).
# NOTE(review): allow_pickle=True can execute code from the file — only load trusted data.
bird_classes = np.load("../aml-2025-feathers-in-focus/class_names.npy", allow_pickle=True).item()

def split_lower(np_string_key: np.str_) -> str:
    """Return the name part of a ``<prefix>.<name>`` key, lowercased.

    Only the first dot is treated as the separator, so a name that itself
    contains dots is kept whole instead of being cut at its second dot.

    Args:
        np_string_key: Key to clean; converted with ``str()`` first, so both
            ``np.str_`` and plain ``str`` are accepted.

    Returns:
        Everything after the first dot, lowercased. If the key contains no
        dot, the whole key lowercased.
    """
    s = str(np_string_key)
    # partition() splits on the first '.' only; `sep` is empty when no dot exists.
    _prefix, sep, name = s.partition('.')
    return name.lower() if sep else s.lower()

def clean_bird_classes():
    """Build a dict of cleaned bird names mapped to their original values.

    Walks the module-level ``bird_classes`` dict, runs every key through
    ``split_lower`` (string conversion, prefix removal, lowercasing) and
    stores the untouched value under the cleaned key.

    Returns:
        dict: cleaned, lowercased name -> original value from ``bird_classes``.
    """
    cleaned_birds_dict = {}
    for raw_name, class_id in bird_classes.items():
        # Cleaned name becomes the new key; the value is carried over as-is.
        cleaned_birds_dict[split_lower(raw_name)] = class_id
    return cleaned_birds_dict
# Build the cleaned mapping once, keep it for later cells, and show it.
clean_birds_dict = clean_bird_classes()
print(clean_birds_dict)



{'black_footed_albatross': 1, 'laysan_albatross': 2, 'sooty_albatross': 3, 'groove_billed_ani': 4, 'crested_auklet': 5, 'least_auklet': 6, 'parakeet_auklet': 7, 'rhinoceros_auklet': 8, 'brewer_blackbird': 9, 'red_winged_blackbird': 10, 'rusty_blackbird': 11, 'yellow_headed_blackbird': 12, 'bobolink': 13, 'indigo_bunting': 14, 'lazuli_bunting': 15, 'painted_bunting': 16, 'cardinal': 17, 'spotted_catbird': 18, 'gray_catbird': 19, 'yellow_breasted_chat': 20, 'eastern_towhee': 21, 'chuck_will_widow': 22, 'brandt_cormorant': 23, 'red_faced_cormorant': 24, 'pelagic_cormorant': 25, 'bronzed_cowbird': 26, 'shiny_cowbird': 27, 'brown_creeper': 28, 'american_crow': 29, 'fish_crow': 30, 'black_billed_cuckoo': 31, 'mangrove_cuckoo': 32, 'yellow_billed_cuckoo': 33, 'gray_crowned_rosy_finch': 34, 'purple_finch': 35, 'northern_flicker': 36, 'acadian_flycatcher': 37, 'great_crested_flycatcher': 38, 'least_flycatcher': 39, 'olive_sided_flycatcher': 40, 'scissor_tailed_flycatcher': 41, 'vermilion_flycat

In [3]:
# Goal: check whether each bird class has unique attributes. This cell loads the attribute definitions needed for that.
import numpy as np
import pandas as pd

# Load attribute definitions
# attr_map:  attribute ID -> full attribute name (e.g. 15 -> "has_wing_color::black")
# group_map: group name   -> list of attribute IDs belonging to that group
attr_map = {}
group_map = {}

# Explicit encoding so parsing does not depend on the platform's default locale.
with open("../aml-2025-feathers-in-focus/attributes.txt", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue

        # Each line is "<id> <full name>", e.g. "15 has_wing_color::black";
        # maxsplit=1 keeps any further whitespace inside the name.
        attr_id_str, attr_full = line.split(maxsplit=1)

        attr_id = int(attr_id_str)
        attr_map[attr_id] = attr_full

        # The group is the part before "::", e.g. "has_beak_color".
        group = attr_full.split("::")[0]
        group_map.setdefault(group, []).append(attr_id)


In [4]:
# Top-2 binarization of attributes
# Convert continuous attributes to binary by keeping only the top 2 highest values per attribute group (e.g., bill shape, color) as 1s, setting others to 0 for each bird species

def top2binarization() -> pd.DataFrame:
    """Binarize attributes by keeping the two highest values per attribute group.

    For every group in the module-level ``group_map`` and every row of the
    attribute matrix, the (up to) two columns of that group with the largest
    values are set to 1 and the group's remaining columns to 0.

    Relies on the module-level ``attr_map`` (attribute ID -> name) and
    ``group_map`` (group name -> attribute IDs).

    Returns:
        pd.DataFrame: frame whose columns are renamed to the attribute names
        from ``attr_map`` and whose index starts at 1 (for joining with the
        bird classes later). Columns belonging to a group hold integer 0/1.
    """
    # NOTE(review): allow_pickle=True can execute code from the file — trusted data only.
    attributes = np.load("../aml-2025-feathers-in-focus/attributes.npy", allow_pickle=True)
    attributes_df = pd.DataFrame(attributes, columns=list(attr_map.keys()))
    attributes_df_binarytop2 = attributes_df.copy()

    for attr_ids in group_map.values():
        cols = [col for col in attr_ids if col in attributes_df.columns]

        # Skip groups none of whose columns are present.
        if not cols:
            continue

        group_values = attributes_df[cols].to_numpy()

        # Per row, positions (within the group) of the two largest values;
        # negating the values turns the ascending argsort into a descending one.
        top2_idx = np.argsort(-group_values, axis=1)[:, :2]

        # Start from all zeros and mark the selected positions of every row at once.
        binary = np.zeros(group_values.shape, dtype=np.int64)
        np.put_along_axis(binary, top2_idx, 1, axis=1)
        attributes_df_binarytop2[cols] = binary

    attributestop2_df_named = attributes_df_binarytop2.rename(columns=attr_map)
    # Set indices to start from 1 for joining with bird classes later.
    attributestop2_df_named.index = attributestop2_df_named.index + 1
    return attributestop2_df_named

    # attributestop2_df_named.sum(axis=0)
    
# Build the top-2 binarized frame and print its plain-text representation.
top2binarization_dataframe = top2binarization()
print(top2binarization_dataframe)

     has_bill_shape::curved_(up_or_down)  has_bill_shape::dagger  \
1                                      0                       0   
2                                      0                       0   
3                                      0                       0   
4                                      0                       0   
5                                      0                       0   
..                                   ...                     ...   
196                                    0                       1   
197                                    0                       0   
198                                    0                       0   
199                                    0                       1   
200                                    1                       0   

     has_bill_shape::hooked  has_bill_shape::needle  \
1                         0                       0   
2                         0                       0   
3                 

In [None]:
# Binarization of attributes with only highest value per group set to 1 (if multiple max values, set all to 1)

def top1multibinarization() -> pd.DataFrame:
    """Binarize attributes by marking the per-group maximum of every row.

    For every group in the module-level ``group_map`` and every row of the
    attribute matrix, each column whose value equals the row's maximum within
    that group is set to 1 (ties all get 1); the group's other columns are 0.

    Relies on the module-level ``attr_map`` (attribute ID -> name) and
    ``group_map`` (group name -> attribute IDs).

    Returns:
        pd.DataFrame: frame whose columns are renamed to the attribute names
        from ``attr_map`` and whose index starts at 1. Columns belonging to a
        group hold integer 0/1.
    """
    # NOTE(review): allow_pickle=True can execute code from the file — trusted data only.
    attributes = np.load("../aml-2025-feathers-in-focus/attributes.npy", allow_pickle=True)
    attributes_df = pd.DataFrame(attributes, columns=list(attr_map.keys()))
    attributes_df_binarytop1 = attributes_df.copy()

    for attr_ids in group_map.values():
        cols = [col for col in attr_ids if col in attributes_df.columns]

        # Skip groups none of whose columns are present.
        if not cols:
            continue

        group_values = attributes_df[cols].to_numpy()

        # keepdims=True gives shape (n_rows, 1) so the comparison broadcasts per row.
        row_max = group_values.max(axis=1, keepdims=True)

        # True where a value equals its row's group maximum; stored as 0/1 integers.
        # Assigning the full block replaces the columns, so no separate zeroing is needed.
        attributes_df_binarytop1[cols] = (group_values == row_max).astype(np.int64)

    attributeshighv_df_named = attributes_df_binarytop1.rename(columns=attr_map)
    # Shift the index to start from 1.
    attributeshighv_df_named.index = attributeshighv_df_named.index + 1
    return attributeshighv_df_named

# Build the max-per-group binarized frame; the bare name on the last line displays it.
top1multibinarization_dataframe = top1multibinarization()
top1multibinarization_dataframe

Unnamed: 0,has_bill_shape::curved_(up_or_down),has_bill_shape::dagger,has_bill_shape::hooked,has_bill_shape::needle,has_bill_shape::hooked_seabird,has_bill_shape::spatulate,has_bill_shape::all-purpose,has_bill_shape::cone,has_bill_shape::specialized,has_wing_color::blue,...,has_crown_color::pink,has_crown_color::orange,has_crown_color::black,has_crown_color::white,has_crown_color::red,has_crown_color::buff,has_wing_pattern::solid,has_wing_pattern::spotted,has_wing_pattern::striped,has_wing_pattern::multi-colored
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0
5,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
197,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
198,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
199,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0


In [18]:
def create_regular_bird_attributes_dataframe() -> pd.DataFrame:
    """
    Create a properly formatted DataFrame for bird attributes with named columns.
    Index: class label (integer, starting at 1), Columns: characteristics (e.g: has_bill_shape::dagger)

    Uses the module-level ``attributes`` array and ``attr_map``
    (attribute ID -> name); the attribute values themselves are not modified.

    Returns:
        pd.DataFrame: Clean DataFrame with named columns
    """
    # Label columns with the attribute IDs first, then translate IDs to names.
    baseattributes_df = pd.DataFrame(attributes, columns=list(attr_map.keys()))
    baseattributes_df_named = baseattributes_df.rename(columns=attr_map)

    # Shift the default 0-based index so it starts at 1.
    baseattributes_df_named.index = baseattributes_df_named.index + 1

    return baseattributes_df_named

# Build the un-binarized, named attribute frame; the bare name below displays it.
bird_attributes_df = create_regular_bird_attributes_dataframe()
bird_attributes_df
# bird_attributes_df.sum(axis=1)

Unnamed: 0,has_bill_shape::curved_(up_or_down),has_bill_shape::dagger,has_bill_shape::hooked,has_bill_shape::needle,has_bill_shape::hooked_seabird,has_bill_shape::spatulate,has_bill_shape::all-purpose,has_bill_shape::cone,has_bill_shape::specialized,has_wing_color::blue,...,has_crown_color::pink,has_crown_color::orange,has_crown_color::black,has_crown_color::white,has_crown_color::red,has_crown_color::buff,has_wing_pattern::solid,has_wing_pattern::spotted,has_wing_pattern::striped,has_wing_pattern::multi-colored
1,0.010638,0.010638,0.007092,0.003546,0.138299,0.065603,0.000000,0.005319,0.000000,0.000000,...,0.000000,0.005439,0.005439,0.228446,0.000000,0.000000,0.186020,0.009186,0.025262,0.020669
2,0.000000,0.011332,0.009444,0.000000,0.202095,0.041552,0.015110,0.005666,0.000000,0.000000,...,0.006291,0.000000,0.111144,0.008388,0.000000,0.046135,0.202572,0.002665,0.021323,0.058639
3,0.000000,0.000000,0.007425,0.000000,0.002475,0.000000,0.000000,0.074247,0.146020,0.000000,...,0.000000,0.000000,0.190411,0.012555,0.000000,0.010462,0.203609,0.000000,0.008853,0.017705
4,0.000000,0.000000,0.003861,0.000000,0.003861,0.013514,0.005792,0.073360,0.138998,0.004127,...,0.004885,0.000000,0.190531,0.000000,0.000000,0.000000,0.152750,0.006840,0.036478,0.043317
5,0.000000,0.035088,0.000000,0.000000,0.000000,0.000000,0.102458,0.070177,0.000000,0.000000,...,0.000000,0.000000,0.204036,0.002458,0.002458,0.000000,0.031640,0.002751,0.015132,0.158200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,0.000000,0.020881,0.000000,0.000000,0.000000,0.000000,0.208805,0.006425,0.000000,0.000000,...,0.000000,0.000000,0.041319,0.013281,0.000000,0.007378,0.222993,0.000000,0.002915,0.010202
197,0.003381,0.000000,0.003381,0.000000,0.008451,0.000000,0.098037,0.113250,0.005071,0.000000,...,0.003194,0.025553,0.009582,0.011179,0.051105,0.091031,0.036022,0.003431,0.039453,0.152665
198,0.000000,0.003350,0.000000,0.000000,0.003350,0.000000,0.093790,0.139011,0.001675,0.005359,...,0.000000,0.039706,0.000000,0.002941,0.008823,0.130882,0.085339,0.005566,0.000000,0.150271
199,0.000000,0.111841,0.000000,0.000000,0.000000,0.000000,0.093201,0.010167,0.000000,0.000000,...,0.000000,0.012767,0.173262,0.045595,0.000000,0.000000,0.056538,0.082072,0.058362,0.018238


In [8]:
# Display the raw attribute frame (default integer row and column labels) for comparison.
attr_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,302,303,304,305,306,307,308,309,310,311
0,0.010638,0.010638,0.007092,0.003546,0.138299,0.065603,0.000000,0.005319,0.000000,0.000000,...,0.000000,0.005439,0.005439,0.228446,0.000000,0.000000,0.186020,0.009186,0.025262,0.020669
1,0.000000,0.011332,0.009444,0.000000,0.202095,0.041552,0.015110,0.005666,0.000000,0.000000,...,0.006291,0.000000,0.111144,0.008388,0.000000,0.046135,0.202572,0.002665,0.021323,0.058639
2,0.000000,0.000000,0.007425,0.000000,0.002475,0.000000,0.000000,0.074247,0.146020,0.000000,...,0.000000,0.000000,0.190411,0.012555,0.000000,0.010462,0.203609,0.000000,0.008853,0.017705
3,0.000000,0.000000,0.003861,0.000000,0.003861,0.013514,0.005792,0.073360,0.138998,0.004127,...,0.004885,0.000000,0.190531,0.000000,0.000000,0.000000,0.152750,0.006840,0.036478,0.043317
4,0.000000,0.035088,0.000000,0.000000,0.000000,0.000000,0.102458,0.070177,0.000000,0.000000,...,0.000000,0.000000,0.204036,0.002458,0.002458,0.000000,0.031640,0.002751,0.015132,0.158200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.000000,0.020881,0.000000,0.000000,0.000000,0.000000,0.208805,0.006425,0.000000,0.000000,...,0.000000,0.000000,0.041319,0.013281,0.000000,0.007378,0.222993,0.000000,0.002915,0.010202
196,0.003381,0.000000,0.003381,0.000000,0.008451,0.000000,0.098037,0.113250,0.005071,0.000000,...,0.003194,0.025553,0.009582,0.011179,0.051105,0.091031,0.036022,0.003431,0.039453,0.152665
197,0.000000,0.003350,0.000000,0.000000,0.003350,0.000000,0.093790,0.139011,0.001675,0.005359,...,0.000000,0.039706,0.000000,0.002941,0.008823,0.130882,0.085339,0.005566,0.000000,0.150271
198,0.000000,0.111841,0.000000,0.000000,0.000000,0.000000,0.093201,0.010167,0.000000,0.000000,...,0.000000,0.012767,0.173262,0.045595,0.000000,0.000000,0.056538,0.082072,0.058362,0.018238


In [None]:
def create_normalized_bird_attributes_dataframe() -> pd.DataFrame:
    """Scale every row of the module-level ``attr_df`` by that row's maximum.

    Rows whose maximum is greater than 0 are divided by it, so their largest
    value becomes 1.0. Rows whose maximum is not greater than 0 are left
    unscaled (avoids division by zero). ``attr_df`` itself is not modified.

    NOTE(review): the result keeps ``attr_df``'s default labels (0-based index,
    integer column labels), unlike the other dataframe builders in this
    notebook, which use attribute names and a 1-based index — confirm this is
    what consumers of the exported data expect.

    Returns:
        pd.DataFrame: row-normalized copy of ``attr_df``.
    """
    row_max = attr_df.max(axis=1)
    # Divide by 1.0 where the max is not positive so those rows keep their values.
    divisor = row_max.where(row_max > 0, 1.0)
    # axis=0 aligns the divisor with the row index: each row is divided by its own max.
    return attr_df.div(divisor, axis=0)
# Sanity check: print the per-row maximum of the normalized frame.
print(create_normalized_bird_attributes_dataframe().max(axis=1))

0      1.0
1      1.0
2      1.0
3      1.0
4      1.0
      ... 
195    1.0
196    1.0
197    1.0
198    1.0
199    1.0
Length: 200, dtype: float64


In [22]:
# Choose which dataset variant to export and save it to "complete_bird_attributes.csv"


# Add the names of the birds to the attributes dataframe
# complete_bird_attributes = {}

# for key, value in clean_birds_dict.items():
#     complete_bird_attributes[key] = attributes_df_named.loc[value]
    
# complete_bird_attributes = pd.DataFrame.from_dict(complete_bird_attributes, orient="index")
# complete_bird_attributes.head()


# Choose the dataset variant to export by swapping in one of the dataframe-building
# functions defined above.
# rename_axis labels the existing index "class_key"; the index values are unchanged
# (they are the frame's own row labels, not bird names).
# NOTE(review): the normalized frame keeps attr_df's 0-based index, while the other
# builders shift their index to start at 1 — confirm which key the CSV consumers expect.
complete_bird_attributes = create_normalized_bird_attributes_dataframe().rename_axis("class_key")

# Export to csv; the index is written as the "class_key" column.
complete_bird_attributes.to_csv("complete_bird_attributes.csv")