In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Add
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import pickle

In [None]:
# Extract image features with a pre-trained CNN (InceptionV3 in this notebook)
def load_image_features(image_paths, model, batch_size=32):
    """Run every image through ``model`` and collect the predicted features.

    Images are resized to 299x299, converted to arrays, normalised with the
    InceptionV3 ``preprocess_input`` and predicted in batches. Predicting in
    batches avoids one ``model.predict`` call (and one progress bar) per image.

    Args:
        image_paths: Iterable of image file paths.
        model: Object exposing a Keras-style ``predict(batch, verbose=0)``.
        batch_size: Number of images sent to ``model.predict`` at once.

    Returns:
        dict mapping each path to its feature array. Every value keeps a
        leading batch axis of length 1, so ``features[path][0]`` is the
        feature of that single image.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    paths = list(image_paths)
    image_features = {}
    for start in range(0, len(paths), batch_size):
        chunk = paths[start:start + batch_size]
        batch = np.stack([
            preprocess_input(img_to_array(load_img(path, target_size=(299, 299))))
            for path in chunk
        ])
        predictions = model.predict(batch, verbose=0)
        for offset, path in enumerate(chunk):
            # Slice (not index) so the stored array keeps its batch axis.
            image_features[path] = predictions[offset:offset + 1]
    return image_features

# Load the image captions
def load_captions(filename):
    """Read a tab-separated caption file into ``{image_id: [caption, ...]}``.

    Every non-blank line must have the form ``<image_id><TAB><caption>``.
    Only the first tab separates the two fields, so a caption may itself
    contain tabs. Blank lines are skipped.

    Args:
        filename: Path of the caption file; it is read as UTF-8.

    Returns:
        dict mapping each image id, exactly as written in the file, to the
        list of its captions in file order.

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    # NOTE(review): the notebook reads 'Flickr8k.token.txt'; if its ids carry
    # a '#<n>' suffix, every key holds one caption. Ids are kept verbatim
    # here - confirm whether that suffix should be stripped.
    captions = {}
    with open(filename, 'r', encoding='utf-8') as f:
        for line_number, raw_line in enumerate(f, start=1):
            line = raw_line.rstrip('\n')
            if not line.strip():
                continue
            image_id, separator, caption = line.partition('\t')
            if not separator:
                raise ValueError(
                    'line %d of %s has no tab separator: %r'
                    % (line_number, filename, line)
                )
            captions.setdefault(image_id, []).append(caption)
    return captions



In [None]:
# Build the word -> integer and integer -> word lookup tables
def create_word_mapping(captions):
    """Build vocabulary lookup tables from caption text.

    Args:
        captions: Either a dict ``{image_id: [caption, ...]}`` (a single
            string per id is also accepted) or an iterable of caption
            strings. For a dict only the values are used; joining the dict
            itself would build the vocabulary from the image ids.

    Returns:
        Tuple ``(word_to_int, int_to_word)``. Ids start at 1, leaving 0 free
        (the notebook uses 0 as the padding value). Words are numbered in
        sorted order so the mapping is identical across runs.
    """
    if isinstance(captions, dict):
        caption_texts = []
        for entry in captions.values():
            if isinstance(entry, str):
                caption_texts.append(entry)
            else:
                caption_texts.extend(entry)
    else:
        caption_texts = list(captions)

    vocabulary = sorted({word for text in caption_texts for word in text.split()})
    word_to_int = {word: idx for idx, word in enumerate(vocabulary, start=1)}
    int_to_word = {idx: word for word, idx in word_to_int.items()}
    return word_to_int, int_to_word

# Encode the captions as padded integer sequences
def preprocess_captions(captions, word_to_int, max_length):
    """Turn every caption into a fixed-length sequence of word ids.

    Args:
        captions: dict ``{image_id: [caption, ...]}``; captions are visited
            in dict order, then list order.
        word_to_int: dict mapping each word to its integer id. A word missing
            from it raises ``KeyError``.
        max_length: Length every sequence is padded (at the end) to.

    Returns:
        The result of ``pad_sequences`` over all encoded captions, with one
        row per caption.
    """
    encoded = [
        [word_to_int[token] for token in text.split()]
        for texts in captions.values()
        for text in texts
    ]
    return pad_sequences(encoded, maxlen=max_length, padding='post')



In [None]:

# Load the image paths and the captions
image_paths = [...]  # List of image file paths
image_captions = load_captions('Flickr8k.token.txt')

# Flatten {image_id: [caption, ...]} into one list of caption strings so the
# vocabulary and the maximum length are computed from the caption text.
all_captions = [
    caption
    for caption_list in image_captions.values()
    for caption in caption_list
]
word_to_int, int_to_word = create_word_mapping(all_captions)
max_caption_length = max(len(caption.split()) for caption in all_captions)

# Encode the captions
preprocessed_captions = preprocess_captions(image_captions, word_to_int, max_caption_length)

# Load the pre-trained InceptionV3. pooling='avg' makes it return one
# 2048-value vector per image, which is what the captioning model's
# Input(shape=(2048,)) expects; without it the output is a 4-D feature map.
image_model = InceptionV3(include_top=False, weights='imagenet', pooling='avg')

# Extract the image features and save them to disk
image_features = load_image_features(image_paths, image_model)
with open('image_features.pkl', 'wb') as f:
    pickle.dump(image_features, f)

# Split the data into training and validation sets
# NOTE(review): train_test_split needs image_paths[i] to be the image that
# preprocessed_captions[i] describes, with both the same length. Captions are
# one row per caption in image_captions order - confirm image_paths is built
# to match (repeat a path once for each of its captions).
train_image_paths, val_image_paths, train_captions, val_captions = train_test_split(
    image_paths, preprocessed_captions, test_size=0.2, random_state=42
)


In [None]:

# Build the image-captioning model
def build_captioning_model(vocab_size, max_length, embedding_dim=256, units=256, image_feature_dim=2048):
    """Build a two-input model that predicts the next caption word.

    An image-feature vector is projected by a Dense layer, a word-id sequence
    is embedded and summarised by an LSTM, the two vectors are added, and a
    softmax layer scores every vocabulary entry.

    Args:
        vocab_size: Number of embedding rows and of output classes.
        max_length: Length of the word-id sequence input.
        embedding_dim: Size of each word embedding.
        units: Width of the image projection and of the LSTM state. Both
            must match because their outputs are added.
        image_feature_dim: Length of the image-feature vector input.

    Returns:
        An uncompiled ``Model`` with inputs ``[image_features, word_ids]``
        and an output of shape ``(batch, vocab_size)``.
    """
    # Image-feature branch
    image_input = Input(shape=(image_feature_dim,))
    image_branch = Dense(units, activation='relu')(image_input)

    # Word-sequence branch. mask_zero=True makes the LSTM skip id 0, which
    # the notebook uses only as padding (word ids start at 1). The deprecated
    # `input_length` argument is dropped; the Input layer fixes the length.
    caption_input = Input(shape=(max_length,))
    caption_branch = Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True)(caption_input)
    caption_branch = LSTM(units)(caption_branch)

    # Merge both branches and score the next word
    merged = Add()([image_branch, caption_branch])
    output = Dense(vocab_size, activation='softmax')(merged)

    return Model(inputs=[image_input, caption_input], outputs=output)

# Model hyper-parameters
vocab_size = len(word_to_int) + 1  # +1 because word ids start at 1; id 0 is the padding value
embedding_dim = 256  # not passed to build_captioning_model in this cell
max_length = max_caption_length
epochs = 10
batch_size = 64

# Build and compile the model
caption_model = build_captioning_model(vocab_size, max_length)
caption_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Batches per epoch. Ceiling division keeps the final partial batch and
# avoids 0 steps when a split holds fewer than batch_size samples; it also
# matches the number of batches the generator yields per pass over the data.
train_steps_per_epoch = (len(train_image_paths) + batch_size - 1) // batch_size
val_steps_per_epoch = (len(val_image_paths) + batch_size - 1) // batch_size

# Load the image features saved by the extraction cell.
# pickle can execute code while loading: only open files this notebook wrote.
with open('image_features.pkl', 'rb') as f:
    image_features = pickle.load(f)


In [None]:

# Data generators
def data_generator(image_paths, captions, image_features, word_to_int, vocab_size, batch_size, max_length):
    """Yield ``((image_features, caption_prefix), next_word)`` batches forever.

    Each caption of ``k`` words is expanded into ``k - 1`` samples: the first
    ``t`` words (zero-padded at the end to ``max_length``) are the input and
    word ``t + 1``, one-hot encoded, is the target. This matches a model
    whose caption input has shape ``(max_length,)`` and whose output has
    shape ``(vocab_size,)``.

    Args:
        image_paths: Sequence of keys into ``image_features``;
            ``image_paths[i]`` must be the image described by ``captions[i]``.
        captions: 2-D array-like of word ids, padded at the end with 0.
        image_features: dict mapping a path to an array whose last axis is
            the feature axis. Any other axes (batch, spatial) are averaged
            away, so ``(1, d)`` and ``(1, h, w, d)`` both give a ``(d,)``
            vector.
        word_to_int: Unused; kept so existing calls keep working.
        vocab_size: Width of the one-hot targets.
        batch_size: Number of captions expanded into each yielded batch.
        max_length: Width of the padded caption-prefix input.

    Yields:
        ``((images, prefixes), targets)`` with float32 ``images`` of shape
        ``(n, d)``, int32 ``prefixes`` of shape ``(n, max_length)`` and
        float32 ``targets`` of shape ``(n, vocab_size)``, where ``n`` is the
        number of samples produced by the captions of the batch. The inputs
        are a tuple rather than a list because Keras may read a list as a
        single stacked tensor.

    Raises:
        ValueError: If ``batch_size`` < 1, if ``image_paths`` and
            ``captions`` differ in length, or if a full pass over the data
            produces no sample (the original looped forever in that case).
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')
    captions = np.asarray(captions)
    if len(image_paths) != len(captions):
        raise ValueError(
            'image_paths and captions must have the same length, got %d and %d'
            % (len(image_paths), len(captions))
        )

    while True:
        produced_any = False
        for start in range(0, len(image_paths), batch_size):
            batch_paths = image_paths[start:start + batch_size]
            batch_captions = captions[start:start + batch_size]

            images, prefixes, next_words = [], [], []
            for path, caption in zip(batch_paths, batch_captions):
                feature = np.asarray(image_features[path], dtype='float32')
                # Collapse every axis except the last into one mean vector.
                feature = feature.reshape(-1, feature.shape[-1]).mean(axis=0)

                tokens = np.trim_zeros(caption, 'b')  # drop the end padding
                for cut in range(1, len(tokens)):
                    history = tokens[:cut][-max_length:]
                    prefix = np.zeros(max_length, dtype='int32')
                    prefix[:len(history)] = history
                    images.append(feature)
                    prefixes.append(prefix)
                    next_words.append(int(tokens[cut]))

            if not next_words:
                # No caption of this batch has two or more words.
                continue

            targets = np.zeros((len(next_words), vocab_size), dtype='float32')
            targets[np.arange(len(next_words)), next_words] = 1.0
            produced_any = True
            yield (np.stack(images), np.stack(prefixes)), targets

        if not produced_any:
            raise ValueError('data_generator received no caption with at least two words')

# One endless batch generator per split
train_data_generator = data_generator(
    image_paths=train_image_paths,
    captions=train_captions,
    image_features=image_features,
    word_to_int=word_to_int,
    vocab_size=vocab_size,
    batch_size=batch_size,
    max_length=max_length,
)
val_data_generator = data_generator(
    image_paths=val_image_paths,
    captions=val_captions,
    image_features=image_features,
    word_to_int=word_to_int,
    vocab_size=vocab_size,
    batch_size=batch_size,
    max_length=max_length,
)

# Train the model; the checkpoint callback writes 'caption_model.h5' and is
# configured with save_best_only=True.
history = caption_model.fit(
    train_data_generator,
    validation_data=val_data_generator,
    epochs=epochs,
    steps_per_epoch=train_steps_per_epoch,
    validation_steps=val_steps_per_epoch,
    callbacks=[
        ModelCheckpoint('caption_model.h5', save_best_only=True),
    ],
)

# Save the model as it stands after the last epoch
caption_model.save('caption_model_final.h5')