In [1]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import csv
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from torchtext.data import get_tokenizer
import string
import math
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from tqdm import tqdm

In [2]:

# Choix de device

# device = 'cpu'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Définition de START et END pour éviter des fautes de frappe :

START = "<start>"
END = "<end>"

MAX_VOCAB_SIZE = 1000
MAX_SEQ_LENGTH = 25

BATCH_SIZE = 16

EPOCHS = 20

DATASET_PATH = 'dataset/Flicker8k_Dataset/'

In [4]:
#
url_flicker_train = 'dataset.csv';
def flicker_source(fName):
    """
    Une fonction qui charge un jeu de données, et renvoie un générateur
    permettant de le parcourir.

    Résultat:
        un générateur renvoyant des couples de string "nom d'image", "label de l'image"
    """
    images = []
    descriptions = []
    similarities = []
    with open(fName, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter=',')
        next(reader) # skip the first row (column headers)
        for row in reader:
            texte, image, similarity = row
            images.append(image)
            descriptions.append(texte)
            similarities.append(similarity)
        return images, descriptions, similarities

def flicker_train():
    return flicker_source(url_flicker_train)

Préaparation récupération des données du dataset <br>
Il est consituer de 30000 paires image/description avec une similitude de 1 <br>
Et 30000 paires image/description avec une similitude de 0 avec les paire qui ont été tirées aléatoirement dans le dataset d'origine

In [5]:
training_images, training_descriptions, train_similarities = flicker_train()

In [6]:
# Charger le tokenizer BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Charger le modèle BERT
model = BertModel.from_pretrained('bert-base-uncased')

model.to(device)
def extract_text_features(description):
    tokens = tokenizer.encode(description, add_special_tokens=True)

    # Conversion des tokens en tenseur PyTorch
    tokens_tensor = torch.tensor([tokens]).to(device)

    # Encodage du texte avec le modèle BERT
    with torch.no_grad():
        outputs = model(tokens_tensor)

    # Récupération de la représentation du texte
    last_hidden_state = outputs[0]
    return torch.mean(last_hidden_state, dim=1).squeeze()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


On génére les features pour les 60000 descriptions

In [7]:
description_features = []
for description in tqdm(training_descriptions, desc="Traitement en cours", unit="s"):
    description = description.replace(START, '')
    description = description.replace(END, '')
    description_features.append(extract_text_features(description))
    
# print(description_features)

Traitement en cours: 100%|██████████| 60000/60000 [07:07<00:00, 140.23s/s]


In [8]:
 print(training_descriptions[4],description_features[4])

<start> Several different colored dogs running in a snowy field <end> tensor([-1.1355e-01, -9.7810e-02, -3.4551e-01,  9.0662e-02,  4.7795e-01,
        -1.0824e-01,  5.1027e-02,  5.1762e-01, -3.8062e-01,  6.2132e-02,
        -2.0322e-01, -7.5869e-01, -2.7041e-01,  5.3970e-01, -5.8855e-02,
         3.1045e-01, -2.0346e-01,  2.4298e-01,  1.3010e-01,  2.7687e-01,
        -2.3603e-01, -3.1119e-01, -3.6811e-01, -4.0019e-02,  5.4555e-01,
        -7.0280e-02,  3.5638e-01,  4.9211e-01, -1.7466e-01,  7.2691e-02,
         1.8868e-01, -6.1337e-02,  1.3752e-01,  2.0360e-02,  2.3796e-01,
         2.7234e-02, -1.6427e-02, -2.2346e-01, -4.7358e-01,  2.9290e-01,
        -2.5355e-01,  1.2835e-01,  1.9095e-02, -6.7741e-02,  7.4996e-02,
        -4.9261e-01,  3.4347e-01, -3.2652e-01,  5.4677e-01,  1.0841e-01,
        -5.4180e-01,  3.8495e-01, -9.1684e-02,  1.8326e-01, -3.1477e-02,
         7.9344e-01,  9.3264e-02, -4.1636e-01, -1.6365e-01, -8.8513e-02,
         5.8924e-01, -2.4082e-01, -1.6141e-01, -5.2632

In [9]:
model = models.resnet18(pretrained=True)
model.to(device)

def get_image_feature(path):
    img = Image.open(path)
    preprocess = transforms.Compose([
       transforms.Resize(256),
       transforms.CenterCrop(224),
       transforms.ToTensor(),
       transforms.Normalize(mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225])
    ])
    img_tensor = preprocess(img).to('cuda')
    img_tensor.unsqueeze_(0)

    # Utilisez le modèle pour extraire les features
    with torch.no_grad():
        features = model.conv1(img_tensor)
        features = model.bn1(features)
        features = model.relu(features)
        features = model.maxpool(features)

        features = model.layer1(features)
        features = model.layer2(features)
        features = model.layer3(features)
        features = model.layer4(features)
        features = model.avgpool(features)
        
    return features



In [10]:
features_images = []
for training_image in tqdm(training_images, desc="Traitement en cours", unit="s"):
    features_images.append(get_image_feature(DATASET_PATH + training_image))

Traitement en cours: 100%|██████████| 60000/60000 [08:28<00:00, 117.95s/s]


In [11]:
print(features_images[4].flatten())

tensor([0.9556, 0.9599, 0.8676, 1.0039, 0.9231, 0.8512, 0.8949, 1.1562, 0.9777,
        0.9736, 0.9122, 0.9040, 0.8448, 0.9061, 0.9255, 0.9709, 0.9136, 1.3323,
        0.8999, 0.8524, 0.8731, 1.1234, 0.9508, 0.9225, 0.9188, 0.9454, 0.9588,
        0.9864, 0.8425, 0.9353, 0.8861, 0.8583, 0.8379, 0.7688, 0.9163, 0.9177,
        0.9181, 0.9003, 0.8640, 0.8563, 0.8954, 0.8394, 0.9701, 0.9806, 0.8066,
        0.8787, 0.8869, 1.0629, 0.8944, 0.8379, 0.9146, 0.9598, 0.8804, 0.8707,
        0.9027, 0.9286, 1.0970, 0.9469, 1.0280, 0.9272, 0.9165, 1.0446, 0.9985,
        0.9341, 0.8445, 1.0708, 0.8201, 0.9653, 0.9943, 0.8317, 0.8847, 0.8043,
        0.8890, 0.8678, 0.9745, 0.9167, 0.8904, 0.8578, 0.8749, 0.8696, 0.8499,
        1.0301, 0.9445, 1.1747, 0.9033, 0.9459, 0.9882, 0.8696, 0.9982, 0.7786,
        0.8335, 0.9399, 0.8588, 0.9296, 0.8640, 0.8855, 0.9158, 0.8299, 0.9602,
        0.8547, 0.9612, 1.0683, 1.0615, 0.9995, 0.8955, 0.8042, 0.9921, 0.9567,
        0.8228, 0.9124, 0.8597, 0.8571, 

In [12]:
torch.cat((description_features[0], features_images[0].flatten())).shape
concat_features = []
for i in tqdm(range(len(features_images)), desc="Traitement en cours", unit="s"):
    concat_features.append(torch.cat((description_features[i], features_images[i].flatten())))

Traitement en cours: 100%|██████████| 60000/60000 [00:01<00:00, 39853.08s/s]


In [13]:
print(concat_features[4])

tensor([-0.1136, -0.0978, -0.3455,  ...,  0.9936,  1.0316,  0.8956],
       device='cuda:0')


L'objectif de ce modèle est de sortir un taux de similarité compris entre 0 et 1 pour les paire images/descriptions

In [15]:
# Définir une classe Dataset pour vos données d'entraînement
class ImageDescDataset(Dataset):
    def __init__(self, features, similarities):
        self.features = features
        self.similarities = similarities
        
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        feature = self.features[idx]
        similarity = self.similarities[idx]
        return feature, similarity

# Définir votre modèle
class SimilarityModel(nn.Module):
    def __init__(self, input_size):
        super(SimilarityModel, self).__init__()
        hidden_size = input_size // 2
        self.fc1 = nn.Linear(input_size, hidden_size)
        
        input_size = hidden_size
        hidden_size = input_size // 2
        self.fc2 = nn.Linear(input_size, hidden_size)
        
        input_size = hidden_size
        hidden_size = input_size // 2
        self.fc3 = nn.Linear(input_size, hidden_size)
        
        input_size = hidden_size
        self.fc4 = nn.Linear(input_size, 1)
        
    def forward(self, features):
        x = features
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.relu(x)
        x = self.fc3(x)
        x = torch.relu(x)
        x = self.fc4(x)
        x = torch.sigmoid(x)
        return x

In [16]:
train_similarities = [float(x) for x in train_similarities]
train_similarities = torch.tensor(train_similarities).to(device)

In [18]:
# Définir votre fonction de perte et l'optimiseur
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

features = concat_features
similarities = torch.ones((len(concat_features))).to(device)
train_dataset = ImageDescDataset(features[:24000], train_similarities[:24000])
val_dataset = ImageDescDataset(features[24000:27000], train_similarities[24000:27000])
test_dataset = ImageDescDataset(features[27000:], train_similarities[27000:])

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

model = SimilarityModel(input_size=concat_features[0].shape[0]).to(device)

for epoch in range(EPOCHS):
     # Boucle d'entraînement
    for i, (feature, similarity) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(feature)
        loss = criterion(outputs, similarity)
        loss.backward()
        optimizer.step()
        # Afficher la perte toutes les 50 itérations
        if (i+1) % 1 == 0:
            print(f'Epoch {epoch+1}, iteration {i+1}, loss = {loss.item()}')
    
    # Boucle de validation
    with torch.no_grad():
        total_loss = 0
        for feature, similarity in val_loader:
            outputs = model(feature)
            loss = criterion(outputs, similarity)
            total_loss += loss.item() * feature.size(0)
        val_loss = total_loss / len(val_dataset)
        print(f'Epoch {epoch+1}, validation loss = {val_loss}')
        
# Tester les performances de votre modèle sur l'ensemble de test
with torch.no_grad():
    total_loss = 0
    for feature, similarity in test_loader:
        outputs = model(feature)
        loss = criterion(outputs, similarity)
        total_loss += loss.item() * feature.size(0)
    test_loss = total_loss / len(test_dataset)
    print(f'Test loss = {test_loss}')

Epoch 1, iteration 1, loss = 0.24733543395996094
Epoch 1, iteration 2, loss = 0.2500622570514679
Epoch 1, iteration 3, loss = 0.2473144680261612
Epoch 1, iteration 4, loss = 0.2500506043434143
Epoch 1, iteration 5, loss = 0.25187647342681885
Epoch 1, iteration 6, loss = 0.2510131895542145
Epoch 1, iteration 7, loss = 0.25192925333976746
Epoch 1, iteration 8, loss = 0.2500564455986023
Epoch 1, iteration 9, loss = 0.24819374084472656
Epoch 1, iteration 10, loss = 0.2491045743227005
Epoch 1, iteration 11, loss = 0.2500595450401306
Epoch 1, iteration 12, loss = 0.24830374121665955
Epoch 1, iteration 13, loss = 0.24913427233695984
Epoch 1, iteration 14, loss = 0.24911510944366455
Epoch 1, iteration 15, loss = 0.2482355535030365
Epoch 1, iteration 16, loss = 0.2509803771972656
Epoch 1, iteration 17, loss = 0.2527487874031067
Epoch 1, iteration 18, loss = 0.2500535845756531
Epoch 1, iteration 19, loss = 0.2491278350353241
Epoch 1, iteration 20, loss = 0.24734936654567719
Epoch 1, iteration 21