In [8]:
import os
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img, array_to_img

# Define source and target directories
SOURCE_DIR = "artifact_dataset/images"         # Contains subfolders: coin, sculpture, inscription
AUGMENTED_DIR = "artifact_dataset/augmented_images"  # New folder to store augmented images

# Tunable settings, gathered in one place instead of being inlined in the loop
IMAGE_SIZE = (224, 224)                        # Every image is resized to this on load
AUGMENTATIONS_PER_IMAGE = 5                    # Number of augmented copies written per original
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")   # Compared against the lower-cased filename
AUGMENTATION_SEED = 42                         # Base seed so a re-run writes the same images

# Create the target directory if it doesn't exist
os.makedirs(AUGMENTED_DIR, exist_ok=True)

# Define your data augmentation parameters
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Counts images actually augmented; used to derive a distinct seed per image.
images_processed = 0

# Loop over each subfolder (artifact type). os.listdir order is arbitrary, so
# sort to keep the image -> seed assignment stable between runs.
for subfolder in sorted(os.listdir(SOURCE_DIR)):
    subfolder_path = os.path.join(SOURCE_DIR, subfolder)
    if not os.path.isdir(subfolder_path):
        continue

    # Mirror the class subfolder inside the augmented directory
    target_subfolder = os.path.join(AUGMENTED_DIR, subfolder)
    os.makedirs(target_subfolder, exist_ok=True)

    # Process each image in the subfolder
    for image_file in sorted(os.listdir(subfolder_path)):
        image_path = os.path.join(subfolder_path, image_file)
        if not (os.path.isfile(image_path) and image_file.lower().endswith(IMAGE_EXTENSIONS)):
            continue

        # Load and preprocess image
        img = load_img(image_path, target_size=IMAGE_SIZE)
        x = img_to_array(img)
        x = x.reshape((1,) + x.shape)  # Shape: (1, 224, 224, 3)

        # The output name keeps the original filename and extension; this does
        # not depend on the batch, so compute it once per image.
        base, ext = os.path.splitext(image_file)

        # Seed each flow so the augmented files are reproducible. The seed is
        # offset per image so different images do not all get an identical
        # sequence of transforms.
        # NOTE(review): the stride assumes the iterator advances its seed by one
        # per batch - confirm against the installed Keras version.
        image_seed = AUGMENTATION_SEED + images_processed * AUGMENTATIONS_PER_IMAGE
        images_processed += 1

        # flow() yields batches indefinitely; zip with range() stops after
        # exactly AUGMENTATIONS_PER_IMAGE batches without pulling an extra one.
        batches = datagen.flow(x, batch_size=1, seed=image_seed)
        for i, batch in zip(range(AUGMENTATIONS_PER_IMAGE), batches):
            new_filename = f"{base}_aug_{i}{ext}"
            new_filepath = os.path.join(target_subfolder, new_filename)
            array_to_img(batch[0]).save(new_filepath)

print("Data augmentation complete. Augmented images saved to:", AUGMENTED_DIR)


Data augmentation complete. Augmented images saved to: artifact_dataset/augmented_images


In [8]:
import pandas as pd
import os
import cv2
import albumentations as A
import numpy as np

# Load metadata
# The path is relative, so the CSV is looked up in the current working directory.
# `df` is the working frame that the rest of this cell reads and reassigns.
metadata_path = "updated_metadata_with_history.csv"
df = pd.read_csv(metadata_path)

# Function to clean the Age column
def _parse_year_token(token):
    """Parse one endpoint such as '300 BCE' or '700 AD'.

    Returns a ``(year, sign)`` tuple where ``year`` is the integer as written
    and ``sign`` is -1 for BCE/BC, +1 for AD/CE, or None when the token carries
    no era marker. Raises ValueError when the remaining text is not an integer.
    """
    text = token.strip()
    # "BCE" must be tested before "BC" and "CE", which are substrings of it.
    for marker, marker_sign in (("BCE", -1), ("BC", -1), ("AD", 1), ("CE", 1)):
        if marker in text:
            return int(text.replace(marker, "").strip()), marker_sign
    return int(text), None


def process_age(age_str):
    """Convert 'beginning_date-end_date AD/BCE' into a single numerical value.

    Accepted forms (years BCE are treated as negative numbers):
      * "500-700 AD"       -> 600.0
      * "300 BCE-100 BCE"  -> -200.0
      * "300-100 BCE"      -> -200.0  (a trailing BCE also applies to an
                                       unmarked start year)
      * "100 BCE-100 AD"   -> 0.0
      * "500 AD" / "500"   -> 500.0   (a single year is its own midpoint)

    Parameters
    ----------
    age_str : str or scalar
        Raw value from the Age column. Non-string scalars are converted with
        ``str()`` before parsing.

    Returns
    -------
    float or None
        Midpoint of the range as a float, or None when the value is missing or
        cannot be parsed (a warning is printed in the latter case).
    """
    if pd.isna(age_str):
        return None  # Drop missing age values

    try:
        # Only the first two '-'-separated parts are used as start and end.
        parts = str(age_str).split('-')
        start_year, start_sign = _parse_year_token(parts[0])

        if len(parts) == 1:
            # No range given: the single year is the value itself.
            return float((start_sign or 1) * start_year)

        end_year, end_sign = _parse_year_token(parts[1])

        if start_sign is None:
            # "300-100 BCE": the era written once at the end covers both years.
            # With AD/CE or no marker at all the start stays positive.
            start_sign = -1 if end_sign == -1 else 1
        if end_sign is None:
            end_sign = 1

        return (start_sign * start_year + end_sign * end_year) / 2  # Midpoint as single age value
    except ValueError as e:
        print(f"⚠️ Error processing age '{age_str}': {e}")
        return None

# Clean Age column
df["Age"] = df["Age"].apply(process_age)
# Drop rows with invalid age. The explicit copy makes the result an independent
# frame, so adding a column below cannot trigger pandas' SettingWithCopyWarning.
df = df.dropna(subset=["Age"]).copy()

# Merge historical fields into one comma-separated text column, skipping
# missing and empty values
historical_columns = ["Description", "Period", "Dynasty", "Object Date", "Culture"]
df["Historical Notes"] = df[historical_columns].apply(
    lambda row: ', '.join([str(val) for val in row if pd.notna(val) and val != ""]), axis=1
)

# Drop original separate columns
df = df.drop(columns=historical_columns)

# Save cleaned dataset
cleaned_dataset_path = "cleaned_dataset.csv"
df.to_csv(cleaned_dataset_path, index=False)
print(f"✅ Cleaned dataset saved as {cleaned_dataset_path}")

# ----- AUGMENTATION -----
# Define augmentations: 5 rotations (without black bars)
# Each limit is a (min, max) pair with min == max, which pins the rotation to a
# single angle; p=1 applies it every time. Borders are filled by reflection.
augmentations = [
    A.Rotate(limit=(90, 90), border_mode=cv2.BORDER_REFLECT_101, p=1),
    A.Rotate(limit=(180, 180), border_mode=cv2.BORDER_REFLECT_101, p=1),
    A.Rotate(limit=(270, 270), border_mode=cv2.BORDER_REFLECT_101, p=1),
    A.Rotate(limit=(30, 30), border_mode=cv2.BORDER_REFLECT_101, p=1),
    A.Rotate(limit=(-30, -30), border_mode=cv2.BORDER_REFLECT_101, p=1)
]

# Create output folder for augmented images
augmented_dir = "artifact_dataset/augmented_images"
os.makedirs(augmented_dir, exist_ok=True)

# Augment images using Image field directly
augmented_records = []
for idx, row in df.iterrows():
    img_path = row["Image"]  # Directly using the image path from metadata
    # A blank Image cell arrives as NaN (a float); os.path.exists would raise
    # TypeError on it, so treat any non-string value as a missing image.
    if not isinstance(img_path, str) or not os.path.exists(img_path):
        print(f"⚠️ Missing image: {img_path}")
        continue

    # Read image. cv2.imread signals an unreadable/corrupt file by returning
    # None rather than raising, which would otherwise crash cvtColor below.
    image = cv2.imread(img_path)
    if image is None:
        print(f"⚠️ Unreadable image: {img_path}")
        continue
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Output names are built from the source filename without its extension.
    # NOTE(review): files are written flat into augmented_dir, so two source
    # images with the same basename in different folders overwrite each other.
    base_name = os.path.splitext(os.path.basename(img_path))[0]

    for i, aug in enumerate(augmentations):
        augmented = aug(image=image)["image"]
        new_filename = f"{base_name}_aug{i+1}.jpg"
        new_path = os.path.join(augmented_dir, new_filename)

        # Save augmented image (converted back to OpenCV's BGR channel order).
        # imwrite returns False on failure; skip the record so the CSV never
        # points at a file that was not written.
        if not cv2.imwrite(new_path, cv2.cvtColor(augmented, cv2.COLOR_RGB2BGR)):
            print(f"⚠️ Failed to write image: {new_path}")
            continue

        # Append to new dataset
        augmented_records.append({
            "Image": new_path,  # Store full path of augmented image
            "Age": row["Age"],
            "Historical Notes": row["Historical Notes"]
        })

# Convert augmented data to DataFrame. Columns are named explicitly so the CSV
# keeps its header even when no image could be augmented.
augmented_df = pd.DataFrame(augmented_records, columns=["Image", "Age", "Historical Notes"])
augmented_dataset_path = "cleaned_augmented_dataset.csv"
augmented_df.to_csv(augmented_dataset_path, index=False)

print(f"✅ Augmented dataset saved as {augmented_dataset_path}")


✅ Cleaned dataset saved as cleaned_dataset.csv
✅ Augmented dataset saved as cleaned_augmented_dataset.csv
