In [1]:

# Choix de device

# device = 'cpu'
device = 'cuda'

In [2]:
# Définition de START et END pour éviter des fautes de frappe :

START = "<start>"
END = "<end>"

MAX_VOCAB_SIZE = 1000
MAX_SEQ_LENGTH = 25

In [3]:

import pickle
import random
from collections import Counter
from functools import reduce
from itertools import islice

import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.functional as F
import torch.nn as nn
import torch.nn.functional as F
import torch.optim
import torch.utils.data as D
import torchtext
from numpy.random import choice
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data.dataloader import DataLoader
from torchtext.data import get_tokenizer
from torchtext.data.metrics import bleu_score
from torchtext.utils import download_from_url
from torchtext.vocab import Vectors, Vocab
import torchtext.vocab as V
import csv
import nltk
from nltk.corpus import stopwords
import string
import math
from transformers import BertTokenizer, BertModel
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from math import log
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
def preprocess_text(captions):
    nltk.download('stopwords')
    STOPWORDS = set(stopwords.words('english'))
    nltk.download('wordnet')
    lemmatizer = WordNetLemmatizer()

    # Tokenization des captions
    tokenizer = get_tokenizer("basic_english")
    tokenized_captions = []
    descriptions = []
    for caption in captions:
        description = caption.lower()
        description = description.replace(START, '')
        description = description.replace(END, '')
        description = description.translate(str.maketrans('', '', string.punctuation))
        description = description.split()
        description = [lemmatizer.lemmatize(word) for word in description if word not in STOPWORDS]
        description = ' '.join(description)
        descriptions.append(description)
    return descriptions

In [5]:
#
url_flicker_train = 'flickr_8k_train_dataset.txt';
def flicker_source(fName):
    """
    Une fonction qui charge un jeu de données, et renvoie un générateur
    permettant de le parcourir.

    Résultat:
        un générateur renvoyant des couples de string "nom d'image", "label de l'image"
    """
    images = []
    descriptions = []
    with open(fName, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader) # skip the first row (column headers)
        for row in reader:
            image, texte = row
            images.append(image)
            descriptions.append(texte)
        return images, descriptions

def flicker_train():
    return flicker_source(url_flicker_train)

In [6]:
training_images, training_descriptions = flicker_train()
captions = preprocess_text(training_descriptions)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
corpus = [word for caption in captions for word in caption.split(' ')]
print(math.ceil(len(set(corpus))*0.33))

2138


In [8]:
def get_top_words(descriptions, max_features=100):

    # Créer un vecteur de TF-IDF
    vectorizer = TfidfVectorizer(max_features=max_features)
    X = vectorizer.fit_transform(corpus)

    # Récupérer les mots les plus importants
    return list(vectorizer.vocabulary_.keys())

In [9]:
vocab = get_top_words(captions, math.ceil(len(set(corpus))*0.25))

In [10]:
descriptions = []
for caption in captions:
    valid_words = []
    for word in caption.split(' '):
        if word in vocab:
            valid_words.append(word)
    descriptions.append(" ".join(valid_words))

In [25]:
# Charger le tokenizer BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Charger le modèle BERT
model = BertModel.from_pretrained('bert-base-uncased')

# Utiliser le GPU si disponible
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to('cuda')

def extract_text_features(description):
    tokens = tokenizer.encode(description, add_special_tokens=True)

    # Conversion des tokens en tenseur PyTorch
    tokens_tensor = torch.tensor([tokens]).to('cuda')

    # Encodage du texte avec le modèle BERT
    with torch.no_grad():
        outputs = model(tokens_tensor)

    # Récupération de la représentation du texte
    last_hidden_state = outputs[0]
    return torch.mean(last_hidden_state, dim=1).squeeze()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
description_features = []
for description in descriptions:
    description_features.append(extract_text_features(description))
    
print(description_features.shape)

In [26]:
description_features2 = []
for description in descriptions:
    description_features2.append(extract_text_features(description))

In [33]:
print(description_features2)

tensor([ 1.3747e-01,  2.7786e-01, -1.3468e-01, -2.2327e-01,  5.9852e-02,
         1.5798e-01,  4.8881e-01,  2.0686e-01, -1.1174e-01,  5.6133e-02,
         4.1083e-02, -2.4741e-01, -2.0081e-01,  4.5355e-01, -3.8095e-01,
        -5.7409e-03,  6.5971e-02,  3.5838e-01,  2.3705e-01,  6.4220e-02,
        -7.8322e-02, -2.1235e-01, -2.6422e-01, -1.8834e-01,  4.4314e-01,
         2.1382e-01,  1.6894e-01,  4.7405e-01, -1.1095e-01, -1.5089e-01,
         2.3129e-01, -3.2842e-01, -9.5386e-02,  3.1948e-01,  5.0823e-01,
        -1.5343e-02, -1.2638e-01, -1.0039e-01, -4.5734e-01,  1.0978e-01,
        -7.2037e-02, -1.0122e-01, -3.0927e-01,  2.3023e-01, -6.4082e-02,
         2.5440e-02, -9.3549e-02, -3.5182e-02,  3.9963e-01, -4.4188e-01,
        -4.1990e-01,  1.3455e-01, -5.1730e-02,  8.4842e-02, -3.1853e-01,
         2.1371e-01,  3.3211e-01,  1.8126e-02, -2.7001e-01, -3.9065e-01,
         1.1179e-01, -1.7751e-01,  1.6040e-01, -9.5923e-02, -2.2220e-01,
         2.3655e-01, -6.1176e-02,  5.0100e-01, -3.4

In [13]:
caption_features = []
for indexed_caption in indexed_captions:
    caption_features.append(extract_text_features(indexed_caption))

NameError: name 'indexed_captions' is not defined

In [None]:
print(torch.tensor(caption_features).shape)