In [None]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from keras.applications import ResNet50
from keras.applications.resnet50 import preprocess_input
from keras.layers import Dense, GlobalAveragePooling2D
from keras.models import Model
from keras.preprocessing.image import ImageDataGenerator
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the training annotations from the working directory.
data_path = Path('.')
train_df = pd.read_csv(data_path / 'train.csv')

# Keep one row per distinct (image_path, State, Species) combination.
metadata_df = train_df[['image_path', 'State', 'Species']]
metadata_df = metadata_df.drop_duplicates()
metadata_df = metadata_df.reset_index(drop=True)

# Fit one integer label encoder per target column; the fitted encoders are
# kept so they can be pickled alongside the model later in the script.
state_encoder = LabelEncoder().fit(metadata_df['State'])
species_encoder = LabelEncoder().fit(metadata_df['Species'])
metadata_df['state_encoded'] = state_encoder.transform(metadata_df['State'])
metadata_df['species_encoded'] = species_encoder.transform(metadata_df['Species'])

# Class counts size the softmax heads and the one-hot vectors.
num_states = len(state_encoder.classes_)
num_species = len(species_encoder.classes_)

# Store a one-hot vector per row (one array object in each cell).
state_one_hot = to_categorical(metadata_df['state_encoded'], num_classes=num_states)
species_one_hot = to_categorical(metadata_df['species_encoded'], num_classes=num_species)
metadata_df['state_categorical'] = list(state_one_hot)
metadata_df['species_categorical'] = list(species_one_hot)

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
train_meta_df, val_meta_df = train_test_split(
    metadata_df,
    test_size=0.2,
    random_state=42,
)

# Backbone: ResNet50 with ImageNet weights and no classification top, taking
# 224x224 RGB inputs.
base_model = ResNet50(include_top=False, weights='imagenet', input_shape=(224, 224, 3))

# Freeze every backbone layer so only the two new heads are trained.
for backbone_layer in base_model.layers:
    backbone_layer.trainable = False

# Collapse the backbone's spatial feature map into one vector per image.
pooled_features = GlobalAveragePooling2D()(base_model.output)

# Two independent softmax heads share the pooled features.
state_output = Dense(num_states, activation='softmax', name='state_output')(pooled_features)
species_output = Dense(num_species, activation='softmax', name='species_output')(pooled_features)

model = Model(inputs=base_model.input, outputs=[state_output, species_output])

# Both heads use the same loss and metric, keyed by output-layer name.
head_names = ('state_output', 'species_output')
model.compile(
    optimizer='adam',
    loss={head: 'categorical_crossentropy' for head in head_names},
    metrics={head: 'accuracy' for head in head_names},
)

# Create data generators
# The frozen ResNet50 backbone was trained on ImageNet images run through the
# matching `preprocess_input`; the previous identity function fed it raw
# 0-255 pixels, which does not match what the pretrained weights expect.
# NOTE: any inference code using the saved model must apply the same
# `preprocess_input` to its images.
train_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

def multi_output_generator(generator, df, batch_size, img_size=(224, 224)):
    """Yield endless ``(images, targets)`` batches for the two-headed model.

    Args:
        generator: Object exposing ``flow_from_dataframe`` (here an
            ``ImageDataGenerator``).
        df: DataFrame providing the ``image_path``, ``state_encoded`` and
            ``species_encoded`` columns.
        batch_size: Number of images requested per batch.
        img_size: Target size passed to ``flow_from_dataframe`` for resizing.

    Yields:
        A tuple of the image batch and a dict mapping ``'state_output'`` and
        ``'species_output'`` to one-hot label arrays, sized by the
        module-level ``num_states`` and ``num_species``.
    """
    gen = generator.flow_from_dataframe(
        dataframe=df,
        directory='.',
        x_col='image_path',
        y_col=['state_encoded', 'species_encoded'],
        class_mode='multi_output',
        target_size=img_size,
        batch_size=batch_size
    )
    while True:
        X, y = next(gen)
        # With class_mode='multi_output' the targets arrive as a list holding
        # one label array per entry of y_col (in that order), not as a 2-D
        # array, so `y[:, 0]` would raise TypeError. Unpack the list instead.
        state_labels, species_labels = y
        yield X, {
            'state_output': to_categorical(state_labels, num_classes=num_states),
            'species_output': to_categorical(species_labels, num_classes=num_species),
        }

batch_size = 32
train_generator = multi_output_generator(train_datagen, train_meta_df, batch_size)
val_generator = multi_output_generator(val_datagen, val_meta_df, batch_size)

# Train the model
# The generators never terminate, so the number of batches per epoch has to be
# given explicitly. Round up rather than down: floor division yields 0 steps
# when a split holds fewer rows than one batch, and otherwise never schedules
# the final partial batch within the epoch.
train_steps = int(np.ceil(len(train_meta_df) / batch_size))
val_steps = int(np.ceil(len(val_meta_df) / batch_size))

history = model.fit(
    train_generator,
    steps_per_epoch=train_steps,
    epochs=10,
    validation_data=val_generator,
    validation_steps=val_steps,
)

# Persist the trained network, then pickle each fitted label encoder so the
# integer predictions can be mapped back to the original label values.
model.save('metadata_model.h5')
for encoder_file, fitted_encoder in (
    ('state_encoder.pkl', state_encoder),
    ('species_encoder.pkl', species_encoder),
):
    with open(encoder_file, 'wb') as f:
        pickle.dump(fitted_encoder, f)

print('Metadata prediction model trained and saved successfully!')