### Recognition Model: Data Cleaning Portion

In [28]:
import pandas as pd

# Open and read CSV, then close immediately
with open("celeba_features.csv", "r") as f:
    df = pd.read_csv(f)


def get_distrib(df):
    # Print counts of each column except 'caption'
    for column in df.columns:
        if column not in ["caption", "file_name"]:
            print(f"\n=== {column} ===")
            print(df[column].value_counts())


In [29]:
# Columns to adjust
target_columns = ["eye_color", "hair_color", "eyebrow_color", "skin_tone"]

for col in target_columns:
    if "brown" in df[col].values:
        counts = df[col].value_counts()

        if "brown" in counts.index and len(counts) > 1:
            second_largest = counts.iloc[1]  # 2nd-largest value
            brown_rows = df[df[col] == "brown"]

            if len(brown_rows) > second_largest:
                # keep only the needed number of 'brown'
                brown_keep = brown_rows.sample(n=second_largest, random_state=42)
                df = pd.concat([df[df[col] != "brown"], brown_keep])

# Reset index
df = df.reset_index(drop=True)


In [30]:
# Mapping for binning skin tones
skin_tone_map = {
    "fair": "fair",
    "pale": "fair",
    
    "light": "light",
    "light brown": "light",
    
    "medium brown": "medium",
    "olive": "medium",
    
    "dark": "dark",
    "brown": "dark",
    "dark brown": "dark",
    "tan": "dark",
}

# Apply mapping
df["skin_tone"] = df["skin_tone"].map(skin_tone_map)

# Drop rows where skin_tone is now NaN (the ones we marked REMOVE)
df = df.dropna(subset=["skin_tone"])

# Reset index
df = df.reset_index(drop=True)

In [31]:
# Current counts
counts = df["skin_tone"].value_counts()
print("Before:\n", counts)

# Find target size (largest minority group)
target_size = counts.drop("fair").max()

# Downsample 'fair' to match target_size
fair_rows = df[df["skin_tone"] == "fair"]
fair_sampled = fair_rows.sample(n=target_size, random_state=42)

# Keep all other rows + balanced fair
df_balanced = pd.concat([df[df["skin_tone"] != "fair"], fair_sampled])

# Shuffle and reset index
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

print("\nAfter:\n", df_balanced["skin_tone"].value_counts())


Before:
 skin_tone
fair      6867
dark       621
medium     611
light      450
Name: count, dtype: int64

After:
 skin_tone
dark      621
fair      621
medium    611
light     450
Name: count, dtype: int64


In [32]:
import pandas as pd

# Assuming df_balanced already exists

cols_to_filter = ["eye_color", "hair_color", "eyebrow_color"]

# 1) Keep only categories with >= 15 in each column
for col in cols_to_filter:
    counts = df_balanced[col].value_counts()
    valid_values = counts[counts >= 15].index
    df_balanced = df_balanced[df_balanced[col].isin(valid_values)]

# 2) Merge spellings and tiny buckets
for col in cols_to_filter:
    df_balanced[col] = df_balanced[col].replace("grey", "gray")

# eye_color: merge red -> brown
df_balanced["eye_color"] = df_balanced["eye_color"].replace("red", "brown")

# hair_color: fix/normalize
df_balanced["hair_color"] = df_balanced["hair_color"].replace({
    "brown and blonde": "brown_blonde",
    "blond": "blonde"
})

# Reset index
df_balanced = df_balanced.reset_index(drop=True)

# Show new counts
for col in cols_to_filter:
    print(f"\n=== {col} ===")
    print(df_balanced[col].value_counts())



=== eye_color ===
eye_color
brown         882
dark brown    633
blue          412
dark          168
gray           66
green          37
Name: count, dtype: int64

=== hair_color ===
hair_color
dark brown     817
black          528
blonde         352
brown          278
gray           108
dark            80
light brown     21
red             14
Name: count, dtype: int64

=== eyebrow_color ===
eyebrow_color
dark brown    1091
brown          416
blonde         310
black          183
gray           105
dark            93
Name: count, dtype: int64


In [33]:
get_distrib(df_balanced)


=== facing_direction ===
facing_direction
front    2144
side       54
Name: count, dtype: int64

=== eye_color ===
eye_color
brown         882
dark brown    633
blue          412
dark          168
gray           66
green          37
Name: count, dtype: int64

=== hair_color ===
hair_color
dark brown     817
black          528
blonde         352
brown          278
gray           108
dark            80
light brown     21
red             14
Name: count, dtype: int64

=== eyebrow_color ===
eyebrow_color
dark brown    1091
brown          416
blonde         310
black          183
gray           105
dark            93
Name: count, dtype: int64

=== skin_tone ===
skin_tone
dark      610
medium    601
fair      565
light     422
Name: count, dtype: int64


In [34]:
# Save to CSV
df_balanced.to_csv("celeba_features_final.csv", index=False)

print("✅ df_balanced saved to celeba_features_final.csv")


✅ df_balanced saved to celeba_features_final.csv
