In [1]:
import string
import numpy as np
from PIL import Image
import os
from pickle import dump, load

In [2]:
from keras.applications.xception import xception, preprocess_input
from keras.applications import Xception
from keras.preprocessing.image import load_img, img_to_array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers.merge import add
from keras.models import Model, load_model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout

# petite bibliothèque pour voir le processus des boucles
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

Using TensorFlow backend.


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

  from pandas import Panel


In [3]:
# Chargement d'un fichier texte en mémoire
def load_doc(filename):
    #Ouverture du fichier en lecture seule
    file = open (filename, 'r')
    text = file.read()
    file.close()
    return text

In [4]:
#obtenir toutes les images avec leurs légendes
def all_img_captions(filename):
    file = load_doc(filename)
    captions = file.split('\n')
    descriptions = {}
    for caption in captions[:-1]:
        img, caption = caption.split('\t')
        if img [:-2] not in descriptions:
            descriptions[img[:-2]] = []
        else:
            descriptions[img[:-2]].append(caption)
            
    return descriptions

In [5]:
#Nettoyage des données, suppression des signes de ponctuation et des mots contenant des nombres
def cleaning_text(captions):
    table = str.maketrans('', '' , string.punctuation)
    for img,caps in captions.items():
        for i, img_caption in enumerate(caps):
            
            img_caption.replace("-", " ")
            desc = img_caption.split()
            
            #convertit en minuscules
            desc = [word.lower() for word in desc]
            
            #supprimer la ponctuation de chaque token
            desc = [word.translate(table) for word in desc]
            
            #remove hanging 's and a
            desc = [word for word in desc if (len(word)>1)]
            
            #remove tokens with numbers in them
            desc = [word for word in desc if (word.isalpha())]
            
            #reconvertir en chaîne
            img_caption = ' '.join(desc)
            captions[img][i] = img_caption
            
    return captions

In [6]:
def text_vocabulary(descriptions):
    #construire le vocabulaire de tous les mots uniques
    vocab = set()
    
    for key in descriptions.keys():
        [vocab.update(d.split()) for d in descriptions[key]]
        
    return vocab

In [7]:
#Toutes les descriptions dans un seul fichier
def save_descriptions(descriptions, filename):
    lines = list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + '\t' + desc)
    data = "\n".join(lines)
    file = open (filename, "w")
    file.write(data)
    file.close()

In [8]:
#définissez ces chemins en fonction du dossier de projet dans votre système
dataset_text = "/home/richmond/Desktop/PROJETS/Image Caption Generator/Flickr8k_text"

dataset_images = "/home/richmond/Desktop/PROJETS/Image Caption Generator/Flickr8k_Dataset/Flicker8k_Dataset"

In [9]:
caption = load_doc("Flickr8k_text/" + "Flickr8k.token.txt")

In [10]:
for caption in caption[:-1]:
    print(caption)
    break

1


In [11]:
descriptions = {}
descriptions['2']= []

In [12]:
file = load_doc("Flickr8k_text/" + "Flickr8k.token.txt")
captions = file.split('\n')
descriptions = {}

In [13]:
#nous préparons nos données textuelles
filename = "Flickr8k_text/" + "Flickr8k.token.txt"

#chargement du fichier contenant toutes les données

#les mapper dans le dictionnaire de descriptions img à 5 légendes
descriptions = all_img_captions (filename)
print("Longueur des descriptions = ", len(descriptions))

#nettoyage des descriptions
clean_descriptions = cleaning_text(descriptions)

#construire la description
vocabulary = text_vocabulary(clean_descriptions)
print("Longueur des vocabulaires", len(vocabulary))

#enregistrer chaque description dans un fichier
save_descriptions(clean_descriptions, "descriptions.txt")

Longueur des descriptions =  8092
Longueur des vocabulaires 7951


In [14]:
# Extrayons maintenant les fonctionnalités de notre modèle xception
def extract_features(directory):
        model = Xception( include_top=False, pooling='avg' )
        features = {}
        for img in tqdm(os.listdir(directory)):
            filename = directory + "/" + img
            image = Image.open(filename)
            image = image.resize((299,299))
            image = np.expand_dims(image, axis=0)
            #image = preprocess_input(image)
            image = image/127.5
            image = image - 1.0
            feature = model.predict(image)
            features[img] = feature
        return features
#2048 feature vector
features = extract_features(dataset_images)
dump(features, open("features.p","wb"))

HBox(children=(IntProgress(value=0, max=8091), HTML(value='')))




In [15]:
features = load(open("features.p", "rb"))

In [16]:
# charger les donnees
def load_photos (filename):
    file = load_doc(filename)
    photos = file.split("\n")[:-1]
    return photos

In [17]:
def load_clean_descriptions(filename, photos):
    #Chargement de clean_descriptions
    file = load_doc(filename)
    descriptions = {}
    for line in file.split("\n"):
        
        words = line.split()
        if len (words) < 1:
            continue
            
        image, image_caption = words[0], words[1:]
        
        if image in photos:
            if image not in descriptions:
                descriptions[image] = []
            desc = '<start>' + " ".join(image_caption) + '<end>'
            descriptions[image].append(desc)
            
    return descriptions

In [18]:
def load_features(photos):
    #loading all features
    all_features = load(open("features.p", "rb"))
    #selectinng only needed features
    features = {k:all_features[k] for k in photos}
    return features

filename = dataset_text + "/" + "Flickr_8k.trainImages.txt"

train_imgs = load_photos(filename)
train_descriptions = load_clean_descriptions("descriptions.txt", train_imgs)
train_features = load_features(train_imgs)

In [19]:
#Cconverting dictionary to clean list of descriptions
def dict_to_list(descriptions):
    all_desc = []
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc

#creating tokenizer class
# this will vectorise text corpus
#  each integerwill represent token in dictionary

from keras.preprocessing.text import Tokenizer

def create_tokenizer(descriptions):
    desc_list = dict_to_list(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(desc_list)
    return tokenizer

# give each word an index, and store that into tokenizer.p pickle file
tokenizer = create_tokenizer(train_descriptions)
dump(tokenizer, open('tokenizer.p', 'wb'))
vocab_size = len(tokenizer.word_index) + 1
vocab_size

#calcule longueur maximale des descriptions
def max_lenght(descriptions):
    desc_list = dict_to_list(descriptions)
    return max(len(d.split()) for d in desc_list)

max_lenght = max_lenght(descriptions)
max_lenght

32

In [20]:
#créer des paires de séquences d'entrée-sortie à partir de la description de l'image

#générateur de données, utilisé par model.fit_generator()
def data_generator(descriptions, features, tokenizer, max_lenght):
    while 1:
        for key, descriptions_list in descriptions.items():
            #récupérer des fonctionnalités photo
            feature = features[key][0]
            #input_image, input_sequence, output_word = create_sequences(tokenizer, max_lenght, descriptions_list, feature)
            #yield [[input_image, input_sequence], output_word]
            
        def create_sequences(tokenizer, max_lenght, desc_list, feature):
            X1, X2, y = list(), list(), list()
            
            #parcourir chaque description de l'image
            for desc in desc_list:
                #encoder la séquence
                seq = tokenizer.texts_to_sequences([desc])[0]
                
                #diviser une séquence en plusieurs paires X, y
                for i in range (1, len(seq)):
                    #divisé en paire d'entrée et de sortie
                    in_seq, out_seq = seq[:i], seq[i]
                    
                    #séquence d'entrée des pads
                    in_seq = pad_sequences([in_seq], maxlen = max_lenght) [0]
                    
                    #encoder la séquence de sortie
                    out_seq = to_categorical([out_seq], num_classes = vocab_size)[0]
                    
                    #store
                    X1.append(feature)
                    X2.append(in_seq)
                    y.append(out_seq)
                
                return np.array(X1), np.array(X2), np.array(y)
        input_image, input_sequence, output_word = create_sequences(tokenizer, max_lenght, descriptions_list, feature)
        yield [[input_image, input_sequence], output_word]            
    #vous pouvez vérifier la forme de l'entrée et de la sortie de votre modèle
    [a,b],c = next(data_generator (train_descriptions, features, tokenizer, max_lenght))
    
    a.shape, b.shape, c.shape
    #((47, 2048), (47, 32), (47, 7577))

In [21]:
from keras.utils import plot_model
from keras.engine import input_layer

#définir le modèle de sous-titrage
def define_model(vocab_size, max_lenght):
    
    #fonctionnalités du modèle CNN pressées de 2048 à 256 nœuds
    inputs1 = Input(shape = (2048, ))
    fe1 = Dropout(0.5) (inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    
    #Modèle de séquence LSTM
    inputs2 = Input(shape= (max_lenght,))
    se1 = Embedding(vocab_size, 256, mask_zero=True) (inputs2)
    se2 = Dropout(0.5) (se1)
    se3 = LSTM(256)(se2)
    
    #Fusion des deux modèles
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu') (decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    
    #attachez-les ensemble [image, seq] [word]
    model = Model(inputs = [inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    #résumer le modèle
    print(model.summary())
    plot_model(model, to_file='model.png', show_shapes=True)
    
    return model

In [None]:
#former notre modèle
print('Dataset: ', len(train_imgs))
print('Descriptions: train =', len(train_descriptions))
print('Photos: train =', len(train_features))
print('vocabulary Size:', vocab_size)
print('Description Lenght:', max_lenght)

model = define_model(vocab_size, max_lenght)
epochs = 100



steps = len(train_descriptions)

#faire un répertoire des modèles pour sauvegarder nos modèles
if not os.path.exists("models"):
    os.mkdir("models")

for i in range(epochs):
    generator = data_generator(train_descriptions, train_features, tokenizer, max_lenght)
    model.fit_generator(generator, epochs=1, steps_per_epoch = steps, verbose = 1)
    model.save("models/model_" + str(i) + ".h5")