In [4]:
import os
import pandas as pd

# Build a combined metadata CSV: every row of the original metadata plus one
# extra row for each augmented image found on disk for that row's source image.
# An augmented image is a file named "<original base name>_aug_<anything>" with
# a .jpg/.jpeg/.png extension, located in AUG_IMAGES_DIR/<ArtifactType>/.

# Path to the original metadata CSV
ORIG_CSV_PATH = 'artifact_dataset/metadata.csv'

# Define folder paths for original and augmented images
ORIG_IMAGES_DIR = os.path.join('artifact_dataset', 'images')
AUG_IMAGES_DIR = os.path.join('artifact_dataset', 'augmented_images')

# File extensions (matched case-insensitively) accepted as augmented images
AUG_IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")

# Load the original metadata
orig_df = pd.read_csv(ORIG_CSV_PATH)

# Clean up whitespace in key columns
for column in ('ArtifactType', 'Name', 'Age', 'Image'):
    orig_df[column] = orig_df[column].str.strip()

# Create a list to store new rows for augmented images
aug_rows = []

# Augmented image filenames per artifact type. Each folder is listed only once
# instead of once per metadata row, and the listing is sorted because
# os.listdir() returns entries in arbitrary order; without sorting, the row
# order fed into the seeded shuffle below (and therefore the CSV written at the
# end) could differ between machines or filesystems.
aug_files_by_type = {}

# Loop over each row in the original metadata
for _, row in orig_df.iterrows():
    artifact_type = row['ArtifactType']
    orig_image_path = row['Image']  # Example: "coin/457172.jpg" or full relative path if provided

    # Augmented files are matched on the original's base filename (no extension)
    base_filename = os.path.splitext(os.path.basename(orig_image_path))[0]
    aug_prefix = f"{base_filename}_aug_"

    # Determine the augmented folder for this artifact type
    aug_folder = os.path.join(AUG_IMAGES_DIR, artifact_type)

    if artifact_type not in aug_files_by_type:
        if os.path.isdir(aug_folder):
            aug_files_by_type[artifact_type] = sorted(
                name
                for name in os.listdir(aug_folder)
                if name.lower().endswith(AUG_IMAGE_EXTENSIONS)
            )
        else:
            # No augmented folder for this type: the row contributes no extra rows
            aug_files_by_type[artifact_type] = []

    for file_name in aug_files_by_type[artifact_type]:
        if file_name.startswith(aug_prefix):
            # Copy of the original row, pointing at the augmented image instead
            new_row = row.copy()
            new_row['Image'] = os.path.join(aug_folder, file_name)
            aug_rows.append(new_row)

# Create a DataFrame from the augmented rows (if any)
aug_df = pd.DataFrame(aug_rows)

# Combine the original metadata with the augmented metadata
combined_df = pd.concat([orig_df, aug_df], ignore_index=True)

# Shuffle the combined DataFrame; the fixed seed keeps the shuffle repeatable
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the new metadata to a CSV file with the updated name
OUTPUT_CSV = 'cleaned_augmented_dataset.csv'
combined_df.to_csv(OUTPUT_CSV, index=False)

print(f"Updated metadata CSV created as '{OUTPUT_CSV}'.")
print(f"Total rows (original + augmented): {len(combined_df)}")


Updated metadata CSV created as 'cleaned_augmented_dataset.csv'.
Total rows (original + augmented): 5556
