# 0 Import

In [1]:
import torch
import torch.nn as nn
import os
import sys
import numpy as np

sys.path.append(os.path.abspath(os.path.join('..')))
from utils.build_dataset import build_datasets

# 1 VGG model



## 1.1 Import model

In [2]:
from torchvision import models 

vgg_model = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1)
vgg_model.classifier = torch.nn.Identity()
vgg_model.eval()

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

## 1.2 Feature extractor

In [3]:
def extract_features(batch, model):
    with torch.no_grad():
        features = model(batch)
    return features

# 2 RNN

## 2.1 Vocabulary

### 2.1.1 Tokenizer

In [4]:
import string

def tokenize(text: str):
    text = text.lower()

    translator = str.maketrans("","", string.punctuation + string.digits + "\t\r\n")
    text = text.translate(translator)
    text = "<START> " + text + " <END>"
    return [x for x in text.split(" ") if x != ""]


### 2.1.2 Build vocabulary over datasets

In [5]:

# clean captions and add <START> and <END> tokens
def build_tokeninzed_vocabulary(dataloader):
    vocabulary, mxlen = dict(), 0

    for _, captions, img_name in dataloader:
        for id in range(len(img_name)):
            to_append = []
            for i in range(5):
                to_append.append(' '.join(tokenize(captions[i][id])))
                mxlen = max(mxlen, len(to_append[-1].split(" ")))

            img_id = img_name[id].split("/")[-1]
            vocabulary[img_id] = to_append
    
    return vocabulary, mxlen


### 2.1.3 Fill vocabulary

In [6]:
train_dataloader, val_dataloader, test_dataloader, dataloader_train_small, dataloader_val_small = build_datasets()

# train_voc, mxlen1 = build_tokeninzed_vocabulary(train_dataloader)
# val_voc, mxlen2 = build_tokeninzed_vocabulary(val_dataloader)
# test_voc, mxlen3 = build_tokeninzed_vocabulary(test_dataloader)
small_train_voc, mxlen4 = build_tokeninzed_vocabulary(dataloader_train_small)
small_val_voc, mxlen5 = build_tokeninzed_vocabulary(dataloader_val_small)
mxlen = max(mxlen4, mxlen5)

corpus = dict()
corpus["<PAD>"] = 0

def fill_corpus(corpus, voc):
    for lst in voc.values():
        for sentence in lst:
            for word in sentence.split(" "):
                if word not in corpus:
                    corpus[word] = len(corpus)

fill_corpus(corpus, small_train_voc)
fill_corpus(corpus, small_val_voc)

# print(corpus)

## 2.1.4 Pad word with PAD tag

In [7]:
def pad_word(token, mxlen):
    ret = ["<PAD>" for i in range(mxlen - len(token))]
    for word in token: ret.append(word)
    return ret

## 2.2 Embeddings

### 2.2.1 Load glove embeddings

In [8]:
def load_glove_embeddings(glove_file, embedding_dim=100):
    embeddings_index = {}
    
    with open(glove_file, 'r', encoding="utf-8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = vector
    
    print(f"Loaded {len(embeddings_index)} word vectors.")
    return embeddings_index

# Load GloVe embeddings (using 100-dimensional vectors as an example)
glove_file = "../data/glove/glove.6B.100d.txt"
glove_embeddings = load_glove_embeddings(glove_file, 100)

Loaded 400000 word vectors.


### 2.2.2 Create embedding mapping

In [9]:

def create_embedding_matrix(vocab, glove_embeddings, embedding_dim=100):
    vocab_size = len(vocab)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    
    for word, idx in vocab.items():
        embedding_vector = glove_embeddings.get(word)
        if embedding_vector is not None: # word found
            embedding_matrix[idx] = embedding_vector
        else: # word not found -> random init
            embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))
    
    return embedding_matrix

embedding_matrix = create_embedding_matrix(corpus, glove_embeddings, 100)
embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32) # to torch.tensor

## 2.3 Model definition

In [23]:
class ImageCaptionModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, embedding_matrix, max_length, feature_size=4096, hidden_size=256):
        super(ImageCaptionModel, self).__init__()

        # Feature extractor per le immagini
        self.image_fc = nn.Linear(25088, hidden_size)  # Da 4096 a 256 dimensioni
        self.image_dropout = nn.Dropout(0.5)

        # Modello di embedding per il testo
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False  # Fissa l'embedding (non addestrabile)
        
        # LSTM per il testo
        self.text_dropout = nn.Dropout(0.5)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)

        # Decoder (combinazione di features immagine + testo)
        self.fc1 = nn.Linear(hidden_size * 2, hidden_size)
        self.fc2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, image_features, captions):
        # Elaborazione delle feature delle immagini
        img_feature = self.image_dropout(torch.relu(self.image_fc(image_features)))

        # Elaborazione del testo (didascalie)
        embedded_captions = self.embedding(captions)  # Ottieni embedding pre-addestrati
        embedded_captions = self.text_dropout(embedded_captions)
        
        lstm_out, _ = self.lstm(embedded_captions)  # Output LSTM

        # Prendi l'ultimo stato dell'LSTM (ultimo token della sequenza)
        text_feature = lstm_out[:, -1, :]

        # Concatenazione delle features dell'immagine e del testo
        combined = torch.cat((img_feature, text_feature), dim=1)

        # Passaggio attraverso il decoder fully connected
        output = torch.relu(self.fc1(combined))
        output = self.fc2(output)  # Previsione finale (vettore delle dimensioni del vocabolario)

        return torch.log_softmax(output, dim=1)  # Usare log_softmax per la cross-entropy loss

# 3 Training

## 3.1 Training function definition

In [24]:
def train_model(model, train_dataloader, criterion, optimizer, num_epochs=10):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.train()

    for epoch in range(num_epochs):
        running_loss = 0.0
        
        for images, captions_list, _ in train_dataloader:
            images = images.to(device)
            
            caption_tokenized = []
            for captions in captions_list:
                five_cap = []
                for caption in captions:
                    word_cap = []
                    for word in pad_word(tokenize(caption), mxlen):
                        word_cap.append(corpus[word])
                    five_cap.append(word_cap)
                caption_tokenized.append(five_cap)
            
            caption_tokenized = torch.tensor(caption_tokenized).to(device)
            
            image_features = extract_features(images, vgg_model)

            for img_idx, captions in enumerate(caption_tokenized):
                img_feature = image_features[img_idx].unsqueeze(0).to(device)

                for caption in captions:
                    input_caption = caption[:-1].to(device) # remove <END> token
                    target_caption = caption[1:].to(device)  # skip <START> token

                    optimizer.zero_grad() # init gradient zero

                    output = model(img_feature, input_caption)
                    loss = criterion(output, target_caption)

                    loss.backward()
                    optimizer.step()

                    running_loss += loss.item()

        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_dataloader)}")

In [25]:
cap_model = ImageCaptionModel(len(corpus), embedding_dim=100, embedding_matrix=embedding_matrix, max_length=mxlen)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

cap_model.to(device)
vgg_model.to(device)

train_model(
    cap_model, 
    dataloader_train_small, 
    nn.CrossEntropyLoss(), 
    torch.optim.Adam(cap_model.parameters(), lr=0.001), 
    num_epochs=3
)

  self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))


IndexError: too many indices for tensor of dimension 2