**DATA PREPARATION**

In [55]:
import os
from pickle import load, dump
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical, plot_model
import matplotlib.pyplot as plt
from keras.models import Model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout
from keras.layers import Add
from keras.callbacks import ModelCheckpoint
import string
import numpy as np


def extract_features(directory):
    model = VGG16()
    model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
    print(model.summary())

    features = {}
    for name in os.listdir(directory):
        filename = directory + '\\' + name
        image = load_img(filename, target_size=(224, 224))
        image = img_to_array(image)
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        image = preprocess_input(image)
        feature = model.predict(image, verbose=0)
        image_id = name.split('.')[0]
        features[image_id] = feature
        # print('>', name)
    return features


dataset_img = "..\\train data\\images(6)"
features = extract_features(dataset_img)
print('Extracted features: ', len(features))
dump(features, open('features(6).p', 'wb'))

Model: "model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0   

In [83]:
def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

dataset_image_path = '..\\train data\\'
doc = load_doc(dataset_image_path + "captions(6).txt")
# print(doc.split('\n'))

*(Необязательно)*

In [84]:
def dump_imgIDs(doc, filename, start, end):
    imgIDs_list = []
    for line in doc.split('\n')[start:end]:
        imgID = line.split(",")[0]
        imgIDs_list.append(imgID)
    imgIDs = "\n".join(imgIDs_list)
    file = open(filename, 'w')
    file.write(imgIDs)
    
dump_imgIDs(doc, "trainImages(6).txt", 0, 15)
dump_imgIDs(doc, "testImages(6).txt", 15, 30)

In [85]:
def load_descriptions(doc):
    mapping = {}
    for line in doc.split('\n'):
        tokens = line.split('.')
        if len(tokens) < 2:
            continue
        image_filename, image_desc = tokens[0], tokens[1:]
        image_id = image_filename.split('.')[0]
        image_desc = ''.join(image_desc)
        if image_id not in mapping:
            mapping[image_id] = []
        mapping[image_id].append(image_desc)
    return mapping

descriptions = load_descriptions(doc)
print('Loaded: ', len(descriptions))
# print(descriptions)

Loaded:  6


In [86]:
def clean_descriptions(descriptions):
    table = str.maketrans('', '', string.punctuation)
    for img, cap in descriptions.items():
        for num_part, caption_part in enumerate(cap):

            caption_part.replace('-', ' ')
            desc = caption_part.split()

            desc = [word.lower() for word in desc]
            desc = [word.translate(table) for word in desc]
            desc = [word for word in desc if(len(word) > 1)]
            desc = [word for word in desc if(word.isalpha())]

            caption_part = ' '.join(desc)
            descriptions[img][num_part] = caption_part

    return descriptions

descriptions = clean_descriptions(descriptions)
# print(descriptions)

In [87]:
def to_vocabulary(descriptions):
    all_desc = set()
    for value in descriptions.values():
        [all_desc.update(d.split()) for d in value]
    return all_desc

vocab = to_vocabulary(descriptions)
# print(vocab)
print('Vocabulary size:', len(vocab))

Vocabulary size: 118


In [91]:
def save_descriptions(descriptions, filename):
    lines = []
    for image_id, image_desc in descriptions.items():
        for desc in image_desc:
            lines.append(image_id + '\t' + desc)
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()
    
save_descriptions(descriptions, 'descriptions(6).txt')

**DEVELOPING DEEP LEARNING MODEL**

**Loading Data**

In [92]:
def load_set(filename):
    doc = load_doc(filename)
    dataset = []
    for line in doc.split('\n'):
        if len(line) < 1:
            continue
        identifier = line.split('.')[0]
        dataset.append(identifier)
    return set(dataset)

# trainImages = dataset + 'trainImages(test).txt'
train = load_set('trainImages(6).txt')
print('Training images loaded:', len(train))
print(train)

Training images loaded: 3
{'1000268201_693b08cb0e', '1001773457_577c3a7d70', '1002674143_1b742ab4b8'}


In [94]:
def load_clean_descriptions(filename, dataset):
    doc = load_doc(filename)
    descriptions = {}
    for line in doc.split('\n'):
        tokens = line.split('\t')
        image_filename, image_desc = tokens[0], tokens[1:]
        image_id = image_filename.split('.')[0]
        if image_id in dataset:
            if image_id not in descriptions:
                descriptions[image_id] = []
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            descriptions[image_id].append(desc)
    return descriptions

train_descriptions = load_clean_descriptions('descriptions(6).txt', train)
print("Training descriptions loaded:", len(train_descriptions))
# print(train_descriptions)

Training descriptions loaded: 3


In [95]:
def load_photo_features(filename, dataset):
    all_features = load(open(filename, 'rb'))
    features = {k: all_features[k] for k in dataset}
    return features

train_features = load_photo_features('features(6).p', train)
print('Features loaded:', len(train_features.keys()))
# for i, feature in enumerate(features.values()):
#     print(f'@{i} - {feature}')

Features loaded: 3


In [117]:
def dict_to_list(descriptions):
    all_desc = []
    for values in descriptions.values():
        [all_desc.append(x) for x in values]
    return all_desc

def create_tokenizer(descriptions):
    lines = dict_to_list(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size:', vocab_size)
dump(tokenizer, open('tokenizer(6).p', 'wb'))

Vocabulary size: 76


*Вариант 1*

In [73]:
def create_sequences(tokenizer, max_length, descriptions, photos, vocab_size):
    x1, x2, y = list(), list(), list()
    # walk through each image identifier
    for key, desc_list in descriptions.items():
        # walk through each description for the image
        for desc in desc_list:
            # encode the sequence
            seq = tokenizer.texts_to_sequences([desc])[0]
            # split one sequence into multiple X,y pairs
            for i in range(1, len(seq)):
                # split into input and output pair
                in_seq, out_seq = seq[:i], seq[i]
                # pad input sequence
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                # encode output sequence
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                # store
                x1.append(photos[key][0])
                x2.append(in_seq)
                y.append(out_seq)
    return array(x1), array(x2), array(y)

def max_length(descriptions):
    lines = dict_to_list(descriptions)
    return max(len(d.split()) for d in lines)

max_length = max_length(train_descriptions)
print('Maximum sentence length:', max_length)

Maximum sentence length: 20


*Вариант 2*

In [97]:
def create_sequences(tokenizer, max_length, desc_list, photo, vocab_size):
    x1, x2, y = [], [], []
    for desc in desc_list:
        seq = tokenizer.texts_to_sequences([desc])[0]
        for i in range(1, len(seq)):
            input_seq, output_seq = seq[:i], seq[i]
            input_seq = pad_sequences([input_seq], maxlen=max_length)[0]
            output_seq = to_categorical([output_seq], num_classes=vocab_size)[0]
            x1.append(photo)
            x2.append(input_seq)
            y.append(output_seq)
    return np.array(x1), np.array(x2), np.array(y)

def max_length(descriptions):
    lines = dict_to_list(descriptions)
    return max(len(d.split()) for d in lines)

max_length = max_length(train_descriptions)
print('Maximum sentence length:', max_length)

Maximum sentence length: 20


**Defining the Model**

In [109]:
def define_model(vocab_size, max_length):
    # Извлечение особенностей
    inputs1 = Input(shape=(4096,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    # Процессор последовательностей
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    # Декодер
    decoder1 = Add()([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    # Связывание всего в единое
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    # Подведение итогов
    print(model.summary())
    # plot_model(model, to_file='models/model.png', show_shapes=True)
    return model

**Fitting the Model**

In [110]:
x1train, x2train, ytrain = create_sequences(tokenizer, max_length, train_descriptions, train_features, vocab_size)

# filename = 'testImages(6).txt'
# test = load_set(filename)
# print('Dataset:', len(test))
# test_descriptions = load_clean_descriptions('descriptions(6).txt', test)
# print('Descriptions: test=', len(test_descriptions))
# test_features = load_photo_features('features(6).p', test)
# print('Photos: test=', len(test_features))

# x1test, x2test, ytest = create_sequences(tokenizer, max_length, test_descriptions, test_features, vocab_size)

In [111]:
def data_generator(descriptions, photos, tokenizer, max_length, vocab_size):
    while 1:
        for key, desc_list in descriptions.items():
            photo = photos[key][0]
            in_img, in_seq, out_word = create_sequences(tokenizer, max_length, desc_list, photo, vocab_size)
            yield [in_img, in_seq], out_word

generator = data_generator(train_descriptions, train_features, tokenizer, max_length, vocab_size)
inputs, outputs = next(generator)
print(inputs[0].shape)
print(inputs[1].shape)
print(outputs.shape)


(52, 4096)
(52, 20)
(52, 76)


In [113]:
model = define_model(vocab_size, max_length)
epochs = 20
steps = len(train_descriptions)
for i in range(epochs):
    generator =  data_generator(train_descriptions, train_features, tokenizer, max_length, vocab_size)
    history = model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    
    # plt.plot(history.history['accuracy'])
    # plt.plot(history.history['val_accuracy'])
    # plt.title('model accuracy')
    # plt.ylabel('accuracy')
    # plt.xlabel('epoch')
    # plt.legend(['train', 'test'], loc='upper left')
    # plt.show()
    # # summarize history for loss
    # plt.plot(history.history['loss'])
    # plt.plot(history.history['val_loss'])
    # plt.title('model loss')
    # plt.ylabel('loss')
    # plt.xlabel('epoch')
    # plt.legend(['train', 'test'], loc='upper left')
    # plt.show()
    
    model.save('models(6)/model_' + str(i+1) + '.h5')

Model: "model_9"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_16 (InputLayer)          [(None, 20)]         0           []                               
                                                                                                  
 input_15 (InputLayer)          [(None, 4096)]       0           []                               
                                                                                                  
 embedding_3 (Embedding)        (None, 20, 256)      19456       ['input_16[0][0]']               
                                                                                                  
 dropout_6 (Dropout)            (None, 4096)         0           ['input_15[0][0]']               
                                                                                            

In [116]:
from nltk.translate.bleu_score import corpus_bleu


def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

def generate_desc(model, tokenizer, photo, max_length):
    input_text  = 'startseq'
    for i in range(max_length):
        seq = tokenizer.texts_to_sequences([input_text])[0]
        seq = pad_sequences([seq], maxlen=max_length)
        yhat = model.predict([photo, seq], verbose=0)
        yhat = np.argmax(yhat)
        word = word_for_id(yhat, tokenizer)
        if word is None:
            break
        input_text += ' ' + word
        if word == 'endseq':
            break
    return input_text


In [120]:
from tensorflow.keras.models import load_model
from pickle import load


def evaluate_model(model, descriptions, photos, tokenizer, max_length):
    actual, predicted = [], []
    for key, desc_list in descriptions.items():
        yhat = generate_desc(model, tokenizer, photos[key], max_length)
        references = [d.split() for d in desc_list]
        actual.append(references)
        predicted.append(yhat.split())
    print('BLEU-1:', corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2:', corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3:', corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4:', corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))
    
testImages = 'testImages(6).txt'
test = load_set(testImages)
print('Dataset:', len(test))
test_descriptions = load_clean_descriptions('descriptions(6).txt', test)
print('Descriptions: test=', len(test_descriptions))
test_features = load_photo_features('features(6).p', test)
print('Photos: test=', len(test_features))
tokenizer = load(open('tokenizer(6).p', 'rb'))
model = load_model('models(6)/model_20.h5')
max_length = 20

evaluate_model(model, test_descriptions, test_features, tokenizer, max_length)

Dataset: 3
Descriptions: test= 3
Photos: test= 3
BLEU-1: 0.3807936770679397
BLEU-2: 0.11687506392880498
BLEU-3: 1.2924959128760296e-93
BLEU-4: 4.632058777330975e-155


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
