# Main Library

In [62]:
import string
import os
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt

# from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.xception import Xception
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding, Input
from keras.layers import add

import pickle

## Loading Data

In [2]:
path = "D:\\Courses language programming\\6_Deep Learning\\Image Caption Generator\\text-image caption"
token = "caption.token.txt"

file_name = path + "\\" + token

def Readfile(path):
    file = open(path, "r")
    info = file.read()
    file.close()
    
    return info

def Analysis(path):
    data = Readfile(file_name)
    data = data.split("\n")

    discription = {}
    for recoud in data:
        img, caption = recoud.split("\t")
        img = img[:-2]
        if img in discription:
            discription[img].append(caption)
        else:
            discription[img] = [caption]
    
    return discription

data = Analysis(file_name)

In [3]:
def preprocessing(caption):    
    punc = str.maketrans("", "", string.punctuation)

    c = [word.lower() for word in caption.split() if (len(word) > 1) and (word.isalpha())]
    c = [word.translate(punc) for word in c]
    
    return " ".join(c)
    
def clean_text(data):
    for image, caption in data.items():
        for indx, caption in enumerate(caption):
            data[image][indx] = preprocessing(caption)
            
    return data



cleaning_data = clean_text(data)

In [4]:
def generate_resposity(data):
    repostery_voc = set()

    for img in data.keys():
        [repostery_voc.update(caption.split()) for caption in data[img]]
        
    return repostery_voc


def write_file(path, data):
    lines = []
    for img, caption in data.items():
        for caption in caption:
            lines.append(img + "\t" + caption)
    lines = "\n".join(lines)
    
    file = open(path, 'w')
    file.write(lines)
    file.close()
    
write_file(r"D:\Courses language programming\6_Deep Learning\Image Caption Generator\text-image caption\cleaned_text.txt", cleaning_data)

rep_voc = generate_resposity(cleaning_data)

In [5]:
len(rep_voc)

8357

In [6]:
path_image = r"D:\Courses language programming\6_Deep Learning\Image Caption Generator\images"
model = Xception(include_top = False, pooling = "avg")

def feature_extraction(path, model):
    features = {}
    for imageName in os.listdir(path):
        completa_path = image_file = path_image + "\\" + imageName
        
        image = Image.open(completa_path)

        image = image.resize((299, 299))
        image =  np.expand_dims(image, axis=0)

        image = image / 127.5
        image = image - 1

        feature = model.predict(image)
        features[imageName] = feature
        
    return features


In [7]:
feature = feature_extraction(path_image, model)

pickle.dump(feature, open(r"D:\Courses language programming\6_Deep Learning\Image Caption Generator\text-image caption\image_feature.bin", "wb"))


KeyboardInterrupt



In [8]:
feature = pickle.load(open(r"D:\Courses language programming\6_Deep Learning\Image Caption Generator\text-image caption\image_feature.bin", "rb"))

In [9]:
train_image_path = r"D:\Courses language programming\6_Deep Learning\Image Caption Generator\text-image caption\caption.trainImages.txt"
clean_token = r"D:\Courses language programming\6_Deep Learning\Image Caption Generator\text-image caption\cleaned_text.txt"
feature_path = r"D:\Courses language programming\6_Deep Learning\Image Caption Generator\text-image caption\image_feature.bin"

In [10]:
def load_image(path):
    data = Readfile(path)
    data = data.split("\n")
    
    return data

train_image = load_image(train_image_path)

def load_token(path, images):
    data = Readfile(path)
    lines = data.split("\n")
    
    token = {}
    for line in lines:
        image, caption = line.split("\t")
        if image in images:
            if image not in token:
                token[image] = []
            token[image].append("<start> " + caption + " <end>")
            
    return token
            
load_token = load_token(clean_token, train_image)

def load_feature(path, images):
    feature = pickle.load(open(path, "rb"))
    selected_features = [{image:feature[image] for image in images if image in feature}]
    
    return selected_features

train_feature = load_feature(feature_path, train_image)

In [11]:

def fetsh_Data(data):
    captions = []
    for caps in data.values():
        [captions.append(cap) for cap in caps]
        
    return captions
def create_tokenizer(captions):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(captions)
    return tokenizer

def longenst_captions(caption):
    return max(len(cap.split()) for cap in caption)

captions = fetsh_Data(load_token)
tokenizer = create_tokenizer(captions)
voc_size = len(tokenizer.word_index) + 1
max_cap = longenst_captions(captions)

In [14]:
def build_sequance(tokenizer, max_cap, voc_size, feature, caption):
    input_1, input_2, output = [], [], []
    for cap in captions:
        seq = tokenizer.texts_to_sequences([cap])[0]
        for index in range(len(seq)):
            in_seq = seq[:index] 
            in_seq = pad_sequences([in_seq], maxlen=max_cap, padding="post")
            output_w = seq[index]
            output_w = to_categorical([output_w], num_classes=voc_size)[0]
            input_1.append(feature)
            input_2.append(in_seq)
            output.append(output_w)
    return np.array(input_1), np.array(input_2), np.array(output)

def data_generator(tokenizer, feature, data, train_token, max_cap, voc_size):
    while True:
        for img, caption in data.items():
            if img in feature:
                f = feature[img][0]
                input_img, input_seq, output_word = build_sequance(tokenizer, max_cap, voc_size, f, caption)
                yield([[input_img, input_seq], output_word])
                

In [18]:
[[in_img, in_seq], out_word] = next(data_generator(tokenizer, feature, data, load_token, max_cap, voc_size))

In [39]:
in_img.shape, in_seq.shape, out_word.shape

feature_size = in_img.shape[1]

In [44]:
def Build_model(num_features, longest_caption, output_size):
    # CNN Model
    input_img = Input(shape=(num_features,))
    cnn_layer1 = Dropout(0.5)(input_img)
    cnn_layer2 = Dense(256, activation="relu")(cnn_layer1)
    
    # LSTM Model
    input_seq = Input(shape=(longest_caption,))
    lstm_layer1 = Embedding(output_size, 256, mask_zero=True)(input_seq)
    lstm_layer2 = Dropout(0.5)(lstm_layer1)
    lstm_layer3 = LSTM(256)(lstm_layer2)
    
    #Merging Model
    merging_model = add([cnn_layer2, lstm_layer3])
    final_model = Dense(256, activation="relu")(merging_model)
    
    output = Dense(output_size, activation="softmax")(final_model)
    
    model = Model(inputs = [input_img, input_seq], outputs=output)
    
    model.compile(loss="categorical_crossentopy", optimizer="adam")
    
    return model

In [45]:
model = Build_model(feature_size, max_cap, voc_size)

In [47]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_15 (InputLayer)       [(None, 33)]                 0         []                            
                                                                                                  
 input_14 (InputLayer)       [(None, 2048)]               0         []                            
                                                                                                  
 embedding_1 (Embedding)     (None, 33, 256)              1856512   ['input_15[0][0]']            
                                                                                                  
 dropout_7 (Dropout)         (None, 2048)                 0         ['input_14[0][0]']            
                                                                                            

In [48]:
# steps = len(load_token)

# for i in range(16):
#     generator = data_generator(tokenizer, feature, data, load_token, max_cap, voc_size)
#     model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)
#     model.save(r"D:\Courses language programming\6_Deep Learning\Image Caption Generator"+str(i)+".h5")

In [63]:
def image_feature(path, model):
    try:    
        image = Image.open(path)
        plt.imshow(image)
        plt.axis("off")
        image = image.resize((299, 299))
        image =  np.expand_dims(image, axis=0)
        image = image / 127.5
        image = image - 1.0
        imgfeature = model.predict(image)
        
        return imgfeature
    except:
        print("Cannot read image")
        return None

def get_word(index, tokenizer):
    return list(tokenizer.word_index)[index-1]

def generate_caption(model, tokenizer, imgFeature, longenst_cap):
    output_size = "start"
    for i in range(longenst_cap):
        seq = tokenizer.texts_to_sequences([output_size])[0]
        seq = pad_sequences([seq], maxlen=longenst_cap)
        
        predict = model.predict([imgFeature, seq])
        index = np.argmax(predict)
        word = get_word(index, tokenizer)
        
        if word == "end":
            break
        output_seq += " "+word
    return output_seq

In [64]:
pickle.dump(tokenizer, open(r"D:\Courses language programming\6_Deep Learning\Image Caption Generator\text-image caption\tokenizer", "wb"))

In [61]:
cnn_model = Xception(include_top=False, pooling="avg")

img_path = r""

cap_model = load_model(r"")

tokenizer = pickle.load(open(r"D:\Courses language programming\6_Deep Learning\Image Caption Generator\text-image caption\tokenizer", "rb"))

In [None]:
img_feature = image_feature(img_path, cnn_model)

In [None]:
generate_caption(cap_model, tokenizer, img_feature, max_cap)