In [1]:
import pandas as pd
import numpy as np

In [2]:



def pre_process_data(df):
    # Calculate the number of images per person for each dataset
    images_per_person_l = df.groupby('id')['image_name'].count()

    # Define bins and labels
    bins = [26, 29, 30, 35]  # Ensure bins are strictly increasing
    labels = ['27-29', '30', '31-35']  # Labels for the bins

    # Bin the data and count occurrences
    l_counts = pd.cut(images_per_person_l, bins=bins, labels=labels).value_counts(sort=False)

    # Combine the results into a single table
    comparison_table = pd.DataFrame({
        'Range of Images (per person)': labels,
        'Number of Persons': l_counts.values,
    })




In [3]:
def load_data():
    # Load data
    df_s = pd.read_csv('celeba_buffalo_s_reworked.csv')
    df_l = pd.read_csv('celeba_buffalo_l_reworked.csv')
    return df_s, df_l
    
df_s, df_l= load_data()

embedding_columns = ["embedding_"+str(i) for i in range(512)]
id_columns = ['id']
image_name_columns= ['image_name']
labels_columns = ['5_o_Clock_Shadow', 'Arched_Eyebrows',
       'Bags_Under_Eyes', 'Bald', 'Bangs', 'Big_Lips', 'Big_Nose',
       'Black_Hair', 'Blond_Hair', 'Blurry', 'Brown_Hair', 'Bushy_Eyebrows',
       'Chubby', 'Double_Chin', 'Eyeglasses', 'Goatee', 'Gray_Hair',
       'Heavy_Makeup', 'High_Cheekbones', 'Male', 'Mouth_Slightly_Open',
       'Mustache', 'Narrow_Eyes', 'No_Beard', 'Oval_Face', 'Pale_Skin',
       'Pointy_Nose', 'Receding_Hairline', 'Rosy_Cheeks', 'Sideburns',
       'Smiling', 'Straight_Hair', 'Wavy_Hair', 'Wearing_Earrings',
       'Wearing_Hat', 'Wearing_Lipstick', 'Wearing_Necklace',
       'Wearing_Necktie', 'Young']



In [4]:
# Retain only ID and label columns
df_s = df_s[['id'] + image_name_columns+labels_columns]
df_l = df_l[['id'] + image_name_columns+labels_columns]

# Merge datasets on 'id'
merged_df = pd.merge(df_s, df_l, on='id', suffixes=('_s', '_l'))

# Check for inconsistencies
inconsistent_ids = []
for label in labels_columns:
    mismatches = np.sum(df_s[f'{label}']  != df_l[f'{label}'])
    if mismatches:
        print(mismatches)
        print(label)
        break
        inconsistent_ids.append((label, mismatches['id'].tolist()))

# Print results
if inconsistent_ids:
    print("Inconsistencies found in the following labels:")
    for label, ids in inconsistent_ids:
        print(f" - {label}: {len(ids)} IDs with mismatches (IDs: {ids[:5]}{'...' if len(ids) > 5 else ''})")
else:
    print("All labels are consistent across the datasets for the same ID.")



All labels are consistent across the datasets for the same ID.


In [5]:


# Check for inconsistencies
inconsistent_ids_and_labels = {}

# Loop over each label to compare values
for label in labels_columns:
    # Compare values for the current label between both datasets
    for id_value in df_s['id'].unique():
        # Check if values are inconsistent for this id and label
        value_s = df_s[df_s['id'] == id_value][f'{label}']
        if len(np.unique(value_s))>1:

            try:
                inconsistent_ids_and_labels[label].append(id_value)
            except:
                inconsistent_ids_and_labels[label] = [id_value]





In [6]:
for i in labels_columns:
    if i not in  inconsistent_ids_and_labels.keys():
        print(i)
inconsistent_ids_and_labels['Male']


[1261,
 1264,
 1270,
 1318,
 1326,
 1335,
 1341,
 1382,
 1410,
 1455,
 1496,
 1569,
 1591,
 1607,
 1631,
 1639,
 1649,
 1725,
 1744,
 1805,
 1807,
 1809,
 1912,
 1946,
 2168,
 2316,
 2337,
 2451,
 3994,
 4017,
 4028,
 4138,
 4139,
 4151,
 4165,
 4204,
 4242,
 4256,
 4263,
 4290,
 4321,
 4326,
 4462,
 4493,
 4514,
 4539,
 4573,
 4593,
 4602,
 4637,
 4652,
 4715,
 5084,
 5125,
 5133,
 5170,
 5178,
 5224,
 5225,
 5238,
 5243,
 5346,
 5370,
 5447,
 5459,
 5488,
 5530,
 5543,
 5566,
 5569,
 5635,
 5696,
 5748,
 5805,
 5815,
 5921,
 5928,
 5951,
 6043,
 6046,
 6080,
 6112,
 6137,
 6142,
 6147,
 6163,
 6177,
 6181,
 6183,
 6225,
 6236,
 6241,
 6262,
 6286,
 6288,
 6296,
 6305,
 6311,
 6316,
 6318,
 6330,
 6335,
 6339,
 6367,
 6387,
 9152,
 10082,
 10090,
 10111,
 10173]

In [7]:
df_s[df_s['id'] == 10082][['id','image_name','Male']]

Unnamed: 0,id,image_name,Male
29147,10082,001365.jpg,-1
29148,10082,001646.jpg,1
29149,10082,004061.jpg,-1
29150,10082,017877.jpg,-1
29151,10082,017931.jpg,-1
29152,10082,018633.jpg,-1
29153,10082,025682.jpg,-1
29154,10082,032140.jpg,-1
29155,10082,035503.jpg,-1
29156,10082,043902.jpg,-1


In [8]:
# Print results
if inconsistent_ids_and_labels:
    print("Inconsistencies found:")
    print([k for k in inconsistent_ids_and_labels.keys()])

else:
    print("All labels are consistent across the datasets for the same ID.")



Inconsistencies found:
['5_o_Clock_Shadow', 'Arched_Eyebrows', 'Bags_Under_Eyes', 'Bald', 'Bangs', 'Big_Lips', 'Big_Nose', 'Black_Hair', 'Blond_Hair', 'Blurry', 'Brown_Hair', 'Bushy_Eyebrows', 'Chubby', 'Double_Chin', 'Eyeglasses', 'Goatee', 'Gray_Hair', 'Heavy_Makeup', 'High_Cheekbones', 'Male', 'Mouth_Slightly_Open', 'Mustache', 'Narrow_Eyes', 'No_Beard', 'Oval_Face', 'Pale_Skin', 'Pointy_Nose', 'Receding_Hairline', 'Rosy_Cheeks', 'Sideburns', 'Smiling', 'Straight_Hair', 'Wavy_Hair', 'Wearing_Earrings', 'Wearing_Hat', 'Wearing_Lipstick', 'Wearing_Necklace', 'Wearing_Necktie', 'Young']


In [9]:
df_l.head()

Unnamed: 0,id,image_name,5_o_Clock_Shadow,Arched_Eyebrows,Bags_Under_Eyes,Bald,Bangs,Big_Lips,Big_Nose,Black_Hair,...,Sideburns,Smiling,Straight_Hair,Wavy_Hair,Wearing_Earrings,Wearing_Hat,Wearing_Lipstick,Wearing_Necklace,Wearing_Necktie,Young
0,15,011256.jpg,1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,1,-1,-1,-1,1
1,15,013138.jpg,-1,-1,-1,-1,-1,1,-1,1,...,-1,-1,1,-1,-1,-1,-1,-1,-1,1
2,15,018122.jpg,1,1,-1,-1,-1,-1,-1,1,...,1,-1,1,-1,-1,-1,-1,-1,-1,1
3,15,026437.jpg,1,1,1,-1,-1,-1,-1,1,...,1,-1,1,-1,-1,-1,-1,-1,-1,1
4,15,029700.jpg,1,-1,-1,-1,-1,-1,-1,1,...,1,-1,1,-1,-1,-1,-1,-1,1,1


In [10]:

# Compare labels for each column
inconsistent_labels = {}
for label in labels_columns:
    # Check where the values differ for the same `id`
    mismatches = merged[merged[f'{label}_s'] != merged[f'{label}_l']]
    if not mismatches.empty:
        inconsistent_labels[label] = len(mismatches)

# Output results
if inconsistent_labels:
    print("Inconsistencies found in the following labels:")
    for label, count in inconsistent_labels.items():
        print(f"- {label}: {count} mismatches")
else:
    print("All labels are consistent across the datasets for matching IDs.")


NameError: name 'merged' is not defined