In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import cv2
import os
import pickle

# ---------- 1. Load and Clean Metadata ----------

# Load the cleaned augmented dataset CSV
csv_path = "cleaned_augmented_dataset.csv"
df = pd.read_csv(csv_path)

# Coerce Age to a number; entries that cannot be parsed become NaN and those
# rows are dropped, since Age is later used as the training label.
df["Age"] = pd.to_numeric(df["Age"], errors="coerce")
df = df.dropna(subset=["Age"])

# Guarantee every note is a Python string: missing notes become "", and any
# value pandas parsed as a number (e.g. a note consisting only of a year) is
# cast back to text, because the tokenizer calls string methods on each entry.
df["Historical Notes"] = df["Historical Notes"].fillna("").astype(str)

# ---------- 2. Tokenize and Pad Text Data ----------

# Text-processing hyperparameters
max_len = 100      # fixed token length every note is brought to
max_words = 5000   # vocabulary size passed to the tokenizer

# Fit the vocabulary on the notes, then encode each note as a list of word ids
tokenizer = Tokenizer(oov_token="<OOV>", num_words=max_words)
tokenizer.fit_on_texts(df["Historical Notes"])
text_sequences = tokenizer.texts_to_sequences(df["Historical Notes"])

# Pad at the end ("post") so all sequences share the same length
padded_sequences = pad_sequences(
    text_sequences,
    padding="post",
    maxlen=max_len,
)

# Persist the fitted tokenizer so the same vocabulary can be reused later
with open("text_tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
print("✅ Tokenizer saved as text_tokenizer.pkl")

# ---------- 3. Prepare Data for tf.data.Dataset ----------

# Extract the three parallel arrays (row i of each describes the same sample).
# The "Image" column must hold a path that resolves from the working directory,
# e.g., "artifact_dataset/augmented_images/coin/457627_aug1.jpg"
texts = padded_sequences  # text was already tokenized and padded above
ages = df["Age"].values.astype("float32")
image_paths = df["Image"].values

# Function to load and preprocess an image
def load_and_preprocess_image(path):
    """Read one image from disk and return it as a normalized RGB array.

    Written to be called through ``tf.py_function``, so ``path`` is expected
    to be an eager string tensor (it must support ``.numpy()``), not a
    Python ``str``.

    Args:
        path: Scalar string tensor holding the UTF-8 encoded image path.

    Returns:
        A float32 ``np.ndarray`` of shape (224, 224, 3) with values in
        [0, 1]. When the file cannot be read, an all-zero image is returned
        and a ``RuntimeWarning`` naming the file is emitted.
    """
    import warnings  # function-scope import keeps this block self-contained

    # Convert path tensor to string
    path = path.numpy().decode("utf-8")
    # cv2.imread reports failure by returning None rather than raising
    img = cv2.imread(path)
    if img is None:
        # Keep the best-effort fallback, but make it visible: a blank image
        # paired with a real label silently degrades training otherwise.
        warnings.warn(
            f"Could not read image {path!r}; substituting a blank image.",
            RuntimeWarning,
        )
        img = np.zeros((224, 224, 3), dtype=np.uint8)
    else:
        # OpenCV decodes to BGR channel order; convert to RGB
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, (224, 224))
    # Normalize the image to [0,1]
    img = img.astype(np.float32) / 255.0
    return img

# Wrapper for tf.py_function
def process_sample(image_path, text_seq, age):
    """Convert one (path, token ids, age) record into a (features, label) pair.

    Args:
        image_path: String tensor with the image file path.
        text_seq: Token-id sequence for the sample's text.
        age: Label value for the sample.

    Returns:
        A tuple ``(features, age)`` where ``features`` is a dict with keys
        ``"image_input"`` and ``"text_input"``.
    """
    # Image decoding is plain Python/OpenCV code, so it is wrapped in py_function
    decoded = tf.py_function(load_and_preprocess_image, [image_path], tf.float32)
    # Declare the static shape explicitly so downstream layers see (224, 224, 3)
    decoded.set_shape((224, 224, 3))
    features = {"image_input": decoded, "text_input": text_seq}
    return features, age

# Build the input pipelines and an 80/20 train/validation split.
#
# The split is done at SAMPLE level, before batching: take()/skip() count
# dataset elements, so applying them to a batched dataset would count batches
# instead of samples and leave the validation set empty.
batch_size = 32
total_samples = len(df)
train_size = int(0.8 * total_samples)
val_size = total_samples - train_size

# Shuffle the lightweight (path, text, age) records once. A fixed seed together
# with reshuffle_each_iteration=False keeps the order identical on every pass,
# so the take()/skip() partitions below stay disjoint across epochs. Shuffling
# before map() also means the buffer holds paths, not decoded images.
samples = tf.data.Dataset.from_tensor_slices((image_paths, texts, ages))
samples = samples.shuffle(
    buffer_size=max(total_samples, 1),
    seed=42,
    reshuffle_each_iteration=False,
)

# Training split: reshuffled every epoch (within the split only), then decoded
train_dataset = (
    samples.take(train_size)
    .shuffle(buffer_size=max(train_size, 1), seed=42)
    .map(process_sample, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

# Validation split: fixed order, never mixed with training samples
val_dataset = (
    samples.skip(train_size)
    .map(process_sample, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

# Keep `dataset` defined as the full batched pipeline for any code that still
# refers to it by that name.
dataset = train_dataset.concatenate(val_dataset)

# ---------- 4. Build the Multi-Modal Model ----------

from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Input, GlobalAveragePooling2D, Dense, Dropout, Embedding, LSTM, Concatenate
from tensorflow.keras.models import Model

# Image branch: ResNet50 backbone -> global average pool -> dropout -> 128-d features
image_input = Input(shape=(224, 224, 3), name="image_input")
# Using ResNet50 as base; adjust weights, freeze layers as needed
# NOTE(review): the ImageNet weights were trained on inputs processed with
# keras.applications.resnet50.preprocess_input, while the image loader scales
# pixels to [0, 1] — confirm whether the two should be aligned.
base_model = ResNet50(weights="imagenet", include_top=False, input_tensor=image_input)
x = GlobalAveragePooling2D()(base_model.output)
x = Dropout(0.5)(x)
image_features = Dense(128, activation="relu")(x)

# Text branch: token ids -> 50-d embeddings -> LSTM -> 64-d features
text_input = Input(shape=(max_len,), name="text_input")
# `input_length` is intentionally not passed: the sequence length is already
# fixed by `text_input`, and the argument is deprecated in Keras 3.
x_text = Embedding(input_dim=max_words, output_dim=50)(text_input)
x_text = LSTM(64)(x_text)
text_features = Dense(64, activation="relu")(x_text)

# Fusion of image and text features
combined = Concatenate()([image_features, text_features])
x = Dense(64, activation="relu")(combined)
# Single linear unit: the model regresses one value per sample
output = Dense(1, activation="linear", name="age_output")(x)

multi_modal_model = Model(inputs=[image_input, text_input], outputs=output)
multi_modal_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
                            loss="mean_squared_error",
                            metrics=["mae"])
multi_modal_model.summary()

# ---------- 5. Train the Model ----------
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Both callbacks watch the validation loss: one stops training after 5 epochs
# without improvement, the other writes a checkpoint only when it improves.
callbacks = [
    EarlyStopping(
        monitor="val_loss",
        patience=5,
        restore_best_weights=True,
    ),
    ModelCheckpoint(
        "final_multimodal_model.keras",
        monitor="val_loss",
        save_best_only=True,
    ),
]

history = multi_modal_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10,
    callbacks=callbacks,
)

# Write the model as it stands after training (same path as the checkpoint)
multi_modal_model.save("final_multimodal_model.keras")
print("✅ Training complete. Final multi-modal model saved as final_multimodal_model.keras")


✅ Tokenizer saved as text_tokenizer.pkl




Epoch 1/10
[1m  3/145[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m17:00[0m 7s/step - loss: 937348.8125 - mae: 939.9341   