In [6]:
import os
import pandas as pd
import re

# ----------------------------
# Define paths
# ----------------------------
ORIG_CSV_PATH = 'artifact_dataset/metadata.csv'
ORIG_IMAGES_DIR = os.path.join('artifact_dataset', 'images')
AUG_IMAGES_DIR = os.path.join('artifact_dataset', 'augmented_images')

# ----------------------------
# 1. Load and Clean Original Metadata
# ----------------------------
# Read the raw metadata table from disk.
df = pd.read_csv(ORIG_CSV_PATH)

# Coerce every text column used below to str and trim surrounding whitespace,
# so later string operations (regex parsing, path handling) see clean values.
for text_column in ('ArtifactType', 'Name', 'Age', 'Image'):
    df[text_column] = df[text_column].astype(str).str.strip()

# ----------------------------
# 2. Parse Age Values
# ----------------------------
# Compiled once at module level so the patterns are not looked up per row.
# A range is two integers separated by a hyphen, en dash (U+2013) or em dash
# (U+2014), with optional whitespace on either side of the separator.
_AGE_RANGE_RE = re.compile(r'(\d+)\s*[-\u2013\u2014]\s*(\d+)')
_AGE_SINGLE_RE = re.compile(r'(\d+)')


def parse_age(age_str):
    """
    Convert a textual age/date value into a single float.

    Parameters:
        age_str: The raw value, expected to be a string that *starts* with
            digits (matching is anchored at the beginning of the string).

    Returns:
        - The midpoint of the two numbers for a range such as "755-815 AD",
          "755 - 815 AD" or "755\u2013815 AD" (en/em dashes are accepted).
        - The number itself, as a float, for a single value like "500 AD".
        - None when the input is not a string or does not start with digits.

    Note:
        Only the leading digits are inspected; any trailing era marker
        (e.g. "AD"/"BC") is ignored, so "500 BC" and "500 AD" both yield 500.0.
        NOTE(review): confirm that ignoring the era is intended for this dataset.
    """
    if not isinstance(age_str, str):
        return None

    # Try the range form first; otherwise "755-815" would be read as just 755.
    range_match = _AGE_RANGE_RE.match(age_str)
    if range_match:
        start, end = range_match.groups()
        return (int(start) + int(end)) / 2.0

    single_match = _AGE_SINGLE_RE.match(age_str)
    if single_match:
        return float(single_match.group(1))

    return None

# Derive a numeric age for every row; unparseable values come back as missing.
df['Age_cleaned'] = df['Age'].apply(parse_age)
# Keep only the rows whose age could be parsed, as an independent copy so that
# later column assignments do not operate on a view of the unfiltered frame.
has_valid_age = df['Age_cleaned'].notna()
df = df.loc[has_valid_age].copy()

# ----------------------------
# 3. Prepare Image Paths
# ----------------------------
# (Assume the original CSV's "Image" column contains relative paths like "coin/457172.jpg")
# If needed, adjust them here. In this example, we'll leave them as-is.

# ----------------------------
# 4. Append Augmented Image Rows
# ----------------------------
# File extensions (lower-case) that count as augmented images.
aug_image_extensions = (".jpg", ".jpeg", ".png")

# Cache of image file names per artifact type. The listing of an augmented
# folder depends only on the artifact type, so it is read from disk once
# instead of once per metadata row.
aug_files_by_type = {}

aug_rows = []
# Loop through each row in the cleaned original DataFrame
for _, row in df.iterrows():
    artifact_type = row['ArtifactType']

    if artifact_type not in aug_files_by_type:
        aug_folder = os.path.join(AUG_IMAGES_DIR, artifact_type)
        if os.path.isdir(aug_folder):
            aug_files_by_type[artifact_type] = [
                file_name
                for file_name in os.listdir(aug_folder)
                if file_name.lower().endswith(aug_image_extensions)
            ]
        else:
            # No augmented folder for this artifact type: nothing to append.
            aug_files_by_type[artifact_type] = []

    # Base filename without directory or extension, e.g. "coin/457172.jpg" -> "457172"
    base_filename = os.path.splitext(os.path.basename(row['Image']))[0]
    # Augmented files are matched by the "<base>_aug_" filename prefix.
    aug_prefix = f"{base_filename}_aug_"

    for file_name in aug_files_by_type[artifact_type]:
        if file_name.startswith(aug_prefix):
            # Copy the original row's metadata and point "Image" at the augmented file.
            new_row = row.copy()
            new_row['Image'] = os.path.join(AUG_IMAGES_DIR, artifact_type, file_name)
            aug_rows.append(new_row)

# Create a DataFrame from the augmented rows (empty if none were found)
aug_df = pd.DataFrame(aug_rows)

# ----------------------------
# 5. Combine Original and Augmented Data
# ----------------------------
# Stack original and augmented rows, replace the raw "Age" text column with the
# parsed numeric one (renamed back to "Age"), then shuffle the rows with a
# fixed seed so the resulting order is reproducible.
combined_df = (
    pd.concat([df, aug_df], ignore_index=True)
    .drop(columns=['Age'])
    .rename(columns={'Age_cleaned': 'Age'})
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# ----------------------------
# 6. Save the Combined (Cleaned & Augmented) CSV
# ----------------------------
# Destination file for the combined (cleaned + augmented) metadata.
OUTPUT_CSV = 'cleaned_augmented_dataset.csv'

# Write without the DataFrame index; it is only a shuffled row counter.
combined_df.to_csv(path_or_buf=OUTPUT_CSV, index=False)

# Report where the file went and how many rows it holds.
print(f"Updated metadata CSV created as '{OUTPUT_CSV}'.")
print(f"Total rows (original + augmented): {len(combined_df)}")


Updated metadata CSV created as 'cleaned_augmented_dataset.csv'.
Total rows (original + augmented): 5556
