**Model Evaluation Metric**

BLEU (Bilingual Evaluation Understudy)

BLEU, or the Bilingual Evaluation Understudy, is a score for comparing a candidate translation of text to one or more reference translations.

Although developed for translation, it can be used to evaluate text generated for a suite of natural language processing tasks.

BLEU measures the closeness of a machine translation to human reference translations, taking translation length, word choice, and word order into consideration. It is used for machine translation, abstractive text summarization, image captioning, and speech recognition.

**Importing Necessary Libraries**

In [1]:
import tensorflow as tf
import keras
import sys, time, os, warnings
import numpy as np
import pandas as pd
from collections import Counter
# Suppress all Python warnings for the rest of the notebook.
warnings.filterwarnings("ignore")
# Print the interpreter and framework versions used for this run.
print("python {}".format(sys.version))
# `keras` is deleted right after its version is printed; later cells
# re-import what they need from the `keras` package explicitly.
print("keras version {}".format(keras.__version__));del keras
print("tensorflow version {}".format(tf.__version__))

# Configure the GPU session for this notebook (TF1-style API via tf.compat.v1).
config = tf.compat.v1.ConfigProto()
# NOTE(review): presumably lets this process use up to 95% of GPU memory — confirm.
config.gpu_options.per_process_gpu_memory_fraction = 0.95
# NOTE(review): presumably restricts TensorFlow to GPU device "0" — confirm.
config.gpu_options.visible_device_list = "0"
# Install a session built from the configuration above as the Keras backend session.
tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session(config=config))

def set_seed(sd=123):
    """Seed NumPy, Python's ``random`` module and TensorFlow with ``sd``.

    Args:
        sd: Integer seed passed to all three random number generators.
    """
    import random as rn

    import numpy as np
    import tensorflow as tf

    ## numpy random seed
    np.random.seed(sd)
    ## core python's random number
    rn.seed(sd)
    ## tensorflow's random number
    # TensorFlow 2.x exposes the global seed setter as tf.random.set_seed;
    # `from tensorflow import set_random_seed` raises ImportError there.
    tf.random.set_seed(sd)


python 3.9.4 (tags/v3.9.4:1f2e308, Apr  6 2021, 13:40:21) [MSC v.1928 64 bit (AMD64)]
keras version 2.5.0
tensorflow version 2.5.0-rc3


**Downloading and importing Dataset in Notebook**

In [2]:
## The location of the caption file
# Absolute path on the author's machine; adjust when running elsewhere.
dir_Flickr_txt ="C:/Users/Omshree/Desktop/NLP/Flickr_Data/Flickr_Data/Flickr_TextData/Flickr8k.token.txt"

## The location of the Flickr8K_ Photos
# Absolute path on the author's machine; adjust when running elsewhere.
dir_Flickr_jpg ="C:/Users/Omshree/Desktop/NLP/Flickr_Data/Flickr_Data/Images"


# Every directory entry is listed (no extension filter is applied) and counted.
jpgs = os.listdir(dir_Flickr_jpg)
print("The number of jpg files in Flicker8K: {}".format(len(jpgs)))

The number of jpg files in Flicker8K: 8091


**Preliminary Analysis**

Importing Caption Data

Load the caption text and store it in a pandas DataFrame, `df_txt`.

In [3]:
## read in the Flickr caption data
# The context manager closes the file even if reading raises.
with open(dir_Flickr_txt,'r') as file:
    text = file.read()

# Each useful line looks like "<filename>#<caption index>\t<caption>".
datatxt = []
for line in text.split('\n'):
    col = line.split('\t')
    # Lines without a tab (e.g. the trailing empty line) carry no caption.
    if len(col) == 1:
        continue
    # Split "<filename>#<index>" into its two parts.
    w = col[0].split("#")
    datatxt.append(w + [col[1].lower()])
df_txt = pd.DataFrame(datatxt,columns=["filename","index","caption"])

uni_filenames = np.unique(df_txt.filename.values)
print("The number of unique file names : {}".format(len(uni_filenames)))
print("The distributation of numbers of captions for each image :")
# Histogram of "captions per image": {captions per image: number of images}.
Counter(Counter(df_txt.filename.values).values())

The number of unique file names : 8092
The distributation of numbers of captions for each image :


Counter({5: 8092})

**Example Picture with Captions**

In [None]:
from keras.preprocessing.image import load_img, img_to_array
import matplotlib.pyplot as plt

# Display the first `npic` pictures of the dataset next to their captions.
npic = 5
# 224 matches the image size used by every other cell of this notebook
# (the earlier value 244 was a transposition typo).
npix = 224
target_size = (npix,npix,3)

# `count` is the 1-based subplot slot in an npic x 2 grid:
# odd slots hold a picture, even slots hold its captions.
count = 1
fig = plt.figure(figsize=(10,20))
for jpgfnm in uni_filenames[:npic]:
    filename = dir_Flickr_jpg + '/' + jpgfnm
    # All captions recorded for this file name.
    captions = list(df_txt["caption"].loc[df_txt["filename"]==jpgfnm].values)
    image_load = load_img(filename, target_size=target_size)

    # Left column: the picture, without axis ticks.
    ax = fig.add_subplot(npic,2,count,xticks=[],yticks=[])
    ax.imshow(image_load)
    count += 1

    # Right column: one text line per caption on an otherwise empty axis.
    ax = fig.add_subplot(npic,2,count)
    plt.axis('off')
    ax.plot()
    ax.set_xlim(0,1)
    ax.set_ylim(0,len(captions))
    for i, caption in enumerate(captions):
        ax.text(0,i,caption,fontsize=20)
    count += 1
plt.show()

In [None]:
def df_word(df_txt):
    """Build a word-frequency table from the ``caption`` column of ``df_txt``.

    Captions are split on whitespace. The vocabulary size is printed, and a
    DataFrame with columns ``word`` and ``count`` is returned, ordered by
    descending count with a fresh 0..n-1 index.
    """
    tokens = [token
              for sentence in df_txt.caption.values
              for token in sentence.split()]
    frequencies = Counter(tokens)
    # The number of distinct tokens equals the number of Counter keys.
    print('Vocabulary Size: %d' % len(frequencies))
    table = pd.DataFrame(list(frequencies.items()), columns=['word', 'count'])
    table = table.sort_values(by='count', ascending=False)
    return table.reset_index()[["word", "count"]]
dfword = df_word(df_txt)
dfword.head(3)

In [None]:
topn = 50

def plthist(dfsub, title="The Top 50 Most Frequently Appearing Words"):
    """Show a bar chart of the ``count`` column of ``dfsub``.

    Bars are positioned at ``dfsub.index`` and labelled with the ``word``
    column (rotated 90 degrees); ``title`` is drawn above the chart.
    """
    bar_positions = dfsub.index
    font_size = 20

    plt.figure(figsize=(20,3))
    plt.bar(bar_positions, dfsub["count"])
    plt.title(title, fontsize=font_size)
    plt.xticks(bar_positions, dfsub["word"], rotation=90, fontsize=font_size)
    plt.yticks(fontsize=font_size)
    plt.show()

plthist(dfword.iloc[:topn,:], title="The Top 50 Most Frequently Appearing Words")
plthist(dfword.iloc[-topn:,:], title="The least 50 Most Frequently Appearing Words")

In [None]:
import string
text_original = " hi. I am Omshree !, my roll no is 08"
def remove_punctuation(text_original):
    """Return ``text_original`` without any character listed in ``string.punctuation``."""
    kept_chars = [ch for ch in text_original if ch not in string.punctuation]
    return "".join(kept_chars)
text_no_punctuation = remove_punctuation(text_original)

def remove_single_character(text):
    """Drop every whitespace-separated word of length 1 from ``text``.

    Returns the remaining words, each preceded by a single space (so a
    non-empty result starts with a space, exactly as before).
    """
    # str.join builds the result in one pass instead of repeated `+=`.
    return "".join(" " + word for word in text.split() if len(word) > 1)
text_len_more_than1 = remove_single_character(text_no_punctuation)

def remove_numeric(text,printTF=False):
    """Keep only the purely alphabetic words of ``text``.

    Args:
        text: Input string; it is split on whitespace.
        printTF: When true, print each word together with its ``isalpha()`` result.

    Returns:
        The kept words, each preceded by a single space.
    """
    kept_words = []
    for word in text.split():
        is_alpha = word.isalpha()
        if printTF:
            print(" {:10} : {:}".format(word, is_alpha))
        if is_alpha:
            kept_words.append(word)
    # Join once at the end instead of growing a string with `+=`.
    return "".join(" " + word for word in kept_words)
text_no_numeric = remove_numeric(text_len_more_than1,printTF=True)
print(text_no_numeric)

In [None]:
def text_clean(text_original):
    """Clean one caption: strip punctuation, then 1-character words, then non-alphabetic words."""
    cleaned = text_original
    # The order matters: each step consumes the previous step's output.
    for step in (remove_punctuation, remove_single_character, remove_numeric):
        cleaned = step(cleaned)
    return cleaned

# Clean every caption with a single column assignment.
# The previous `df_txt["caption"].iloc[i] = ...` was chained assignment:
# pandas may write to a temporary copy (SettingWithCopyWarning) and, with
# copy-on-write enabled, leaves `df_txt` unchanged.
df_txt["caption"] = df_txt["caption"].apply(text_clean)

In [None]:
dfword = df_word(df_txt)
plthist(dfword.iloc[:topn,:], title="The top 50 frequently appearing words")
plthist(dfword.iloc[-topn:,:], title="The least 50 most frequently appearing words")

In [None]:
from copy import copy
def add_start_end_seq_token(captions):
    """Wrap each caption as ``'startseq <caption> endseq'`` and return the results as a list."""
    return ['startseq ' + body + ' endseq' for body in captions]
# Work on a copy so the token-wrapped captions live in a separate frame.
df_txt0 = copy(df_txt)
# Replace the caption column with the 'startseq ... endseq' wrapped versions.
df_txt0["caption"] = add_start_end_seq_token(df_txt["caption"])
# NOTE(review): this is not the last expression of the cell, so the notebook
# does not display its result.
df_txt0.head(5)
# From here on only df_txt0 is available; df_txt is removed from the namespace.
del df_txt

In [None]:
# Build the full VGG16 architecture (classifier head included) without
# downloading weights.
modelvgg = tf.keras.applications.VGG16(include_top=True, weights=None)
## Load the locally saved weights
# NOTE(review): relative "../input/..." path, unlike the absolute Windows
# paths used for the dataset — confirm it resolves on the machine in use.
modelvgg.load_weights("../input/vgg16-weights/vgg16_weights_tf_dim_ordering_tf_kernels.h5")
modelvgg.summary()

In [None]:
# Use VGG16 as a feature extractor: rebuild the model so that its output is
# the second-to-last layer instead of the final classification layer.
# The former `modelvgg.layers.pop()` was removed: in tf.keras `Model.layers`
# returns a new list on every access, so popping from it changed nothing.
modelvgg = tf.keras.Model(inputs=modelvgg.inputs, outputs=modelvgg.layers[-2].output)
## show the deep learning model
modelvgg.summary()

In [None]:
from keras.preprocessing.image import load_img, img_to_array
from keras.applications.vgg16 import preprocess_input
from collections import OrderedDict

# Map every image file name to its flattened VGG16 feature vector.
images = OrderedDict()
npix = 224
target_size = (npix,npix,3)
# The unused `data = np.zeros((len(jpgs),npix,npix,3))` buffer was removed:
# it was never read and requested roughly 9.7 GB of float64 for 8091 images.
for name in jpgs:
    # Load one image from file, resized to the network input size
    filename = dir_Flickr_jpg + '/' + name
    image = load_img(filename, target_size=target_size)
    # convert the image pixels to a numpy array
    image = img_to_array(image)
    # apply the VGG16-specific input preprocessing
    nimage = preprocess_input(image)

    # Add a leading batch axis of size 1, then store the flattened prediction.
    y_pred = modelvgg.predict(nimage.reshape( (1,) + nimage.shape[:3]))
    images[name] = y_pred.flatten()

In [6]:
images['19212715_20476497a3.jpg'].shape

NameError: name 'images' is not defined

In [None]:
len(images)

In [None]:
# Keep only the rows whose caption index is "0" (one caption per image).
df_txt0 = df_txt0.loc[df_txt0["index"].values == "0",: ]

# Positions of the remaining rows for which an image feature was extracted.
keepindex = [i for i, fnm in enumerate(df_txt0.filename) if fnm in images]

# File names, captions and features, all in the same row order.
fnames = df_txt0["filename"].iloc[keepindex].values
dcaptions = df_txt0["caption"].iloc[keepindex].values
dimages = np.array([images[fnm] for fnm in fnames])

**4096 features for all 8091 images**

In [None]:
dimages.shape

**list of captions for each image**

In [None]:
dcaptions[:5]

Tokenizer

the maximum number of words in dictionary

In [None]:

from keras.preprocessing.text import Tokenizer

# Upper bound on the number of words the tokenizer keeps.
nb_words = 8000
# `num_words` is the current keyword; `nb_words` is the Keras 1 spelling,
# accepted by Keras 2 only with a rename warning.
tokenizer = Tokenizer(num_words=nb_words)
tokenizer.fit_on_texts(dcaptions)
# +1 because word indices start at 1; index 0 is left for padding.
vocab_size = len(tokenizer.word_index) + 1
print("Vocabulary size : {}".format(vocab_size))
# Each caption becomes a list of integer word indices.
dtexts = tokenizer.texts_to_sequences(dcaptions)

In [None]:
dtexts[:5]

**split the dataset in ratio 6:2:2 (train:valid:test)**

In [None]:
prop_test, prop_val = 0.2, 0.2
N = len(dtexts)
Ntest, Nval = int(N*prop_test), int(N*prop_val)

def split_test_val_train(dtexts, Ntest,Nval):
    """Slice ``dtexts`` into (test, validation, train) parts, in that order.

    The first ``Ntest`` items are the test part, the next ``Nval`` items the
    validation part, and everything after that the training part.
    """
    val_stop = Ntest + Nval
    test_part = dtexts[:Ntest]
    val_part = dtexts[Ntest:val_stop]
    train_part = dtexts[val_stop:]
    return test_part, val_part, train_part

dt_test, dt_val, dt_train = split_test_val_train(dtexts,Ntest,Nval)
di_test, di_val, di_train = split_test_val_train(dimages,Ntest,Nval)
fnm_test, fnm_val, fnm_train = split_test_val_train(fnames,Ntest,Nval)

In [None]:
maxlen = np.max([len(text) for text in dtexts])

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

def preprocessing(dtexts, dimages):
    """Expand (caption, image feature) pairs into next-word training samples.

    For every caption and every split position ``i`` (1 <= i < len(caption)),
    one sample is produced: the first ``i`` tokens padded to the global
    ``maxlen``, the image feature, and the one-hot encoding (over the global
    ``vocab_size``) of token ``i``.

    Returns:
        (Xtext, Ximage, ytext) as numpy arrays with one row per sample.
    """
    n_pairs = len(dtexts)
    print("# captions/images = {}".format(n_pairs))
    assert n_pairs == len(dimages)

    prefix_rows, feature_rows, target_rows = [], [], []
    for tokens, feature in zip(dtexts, dimages):
        for cut in range(1, len(tokens)):
            padded_prefix = pad_sequences([tokens[:cut]], maxlen=maxlen).flatten()
            next_word = to_categorical(tokens[cut], num_classes=vocab_size)

            prefix_rows.append(padded_prefix)
            feature_rows.append(feature)
            target_rows.append(next_word)

    Xtext, Ximage, ytext = (
        np.array(rows) for rows in (prefix_rows, feature_rows, target_rows)
    )
    print(" {} {} {} ".format(Xtext.shape,Ximage.shape,ytext.shape))
    return Xtext, Ximage, ytext

Xtext_train, Ximage_train, ytext_train = preprocessing(dt_train,di_train)
Xtext_val, Ximage_val, ytext_val = preprocessing(dt_val,di_val)
# pre-processing is not necessary for testing data
#Xtext_test, Ximage_test, ytext_test = preprocessing(dt_test,di_test)

In [None]:
Xtext_train[:14]

In [86]:
Ximage_train.shape

(49631, 4096)

In [None]:
ytext_train.shape

**Model**

In [None]:

Ximage_train.shape[1]

In [None]:
from tensorflow.keras import layers
print(vocab_size)

# Size of the learned word-embedding vectors.
dim_embedding = 64

## image branch: project the image feature vector to 256 units
input_image = layers.Input(shape=(Ximage_train.shape[1],))
fimage = layers.Dense(256,activation='relu',name='ImageFeature')(input_image)

## sequence model
input_txt = layers.Input(shape=(maxlen,))

## the embedding layer maps each word index to a dense vector of size
## dim_embedding; mask_zero=True makes the LSTM skip the 0-valued padding
ftxt = layers.Embedding(vocab_size,dim_embedding,mask_zero=True)(input_txt)
ftxt = layers.LSTM(256,name="CaptionFeature")(ftxt)

## combined model for the decoder: element-wise sum of both 256-unit branches
decoder = layers.add([ftxt,fimage])
decoder = layers.Dense(256,activation='relu')(decoder)
# softmax over the vocabulary: probability of each word being the next word
output = layers.Dense(vocab_size,activation='softmax')(decoder)
model = tf.keras.Model(inputs=[input_image,input_txt],outputs=output)

model.compile(loss='categorical_crossentropy', optimizer='adam')

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
model.summary()

**Here we give the model one image and the start word, and we want it to predict the next word.
The next word is the training target; for the first training sample its vocabulary index is 75.**

In [None]:
print(Ximage_train[0])
print(Xtext_train[0])
print(ytext_train[0][75])

In [None]:
# Train the caption model and report the wall-clock training time in minutes.
start = time.time()
hist = model.fit(
    x=[Ximage_train, Xtext_train],
    y=ytext_train,
    batch_size=64,
    epochs=5,
    verbose=2,
    validation_split=0,
    validation_data=([Ximage_val, Xtext_val], ytext_val),
)

end = time.time()
print("Time Took {:3.2f}MIN".format((end - start)/60))

In [None]:
print(Ximage_train.shape,Xtext_train.shape,ytext_train.shape)

In [None]:
# Plot the training and validation loss recorded in `hist`, one value per epoch.
for label in ["loss","val_loss"]:
    plt.plot(hist.history[label],label=label)
plt.legend()
plt.xlabel("epochs")
plt.ylabel("loss")
plt.show()

In [None]:
index_word = dict([(index,word) for word, index in tokenizer.word_index.items()])
def predict_caption(image):
    '''
    Greedily generate a caption for one image feature vector.

    image : array of shape (1, n_features) holding the feature of a single
            image (n_features is 4096 for the VGG16 features in this notebook)

    Returns the generated text. It starts with 'startseq' and ends with
    'endseq' when the model emits that token within maxlen steps.
    '''

    # Must be the exact start token that was added to the training captions;
    # the former 'startsep' is not in the vocabulary, so decoding started
    # from an empty sequence.
    in_text = 'startseq'
    for iword in range(maxlen):
        # Encode the text generated so far and left-pad it to maxlen.
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence],maxlen)
        yhat = model.predict([image,sequence],verbose=0)
        # Greedy choice: the index with the highest predicted probability.
        yhat = np.argmax(yhat)
        newword = index_word.get(yhat)
        # Index 0 is the padding slot and has no word; stop instead of
        # raising KeyError.
        if newword is None:
            break
        in_text += " " + newword
        if newword == "endseq":
            break
    return(in_text)

# Show the first `npic` test pictures next to the caption the model predicts.
npic = 5
npix = 224
target_size = (npix,npix,3)

# `count` is the 1-based subplot slot in an npic x 2 grid:
# odd slots hold a picture, even slots hold its predicted caption.
count = 1
fig = plt.figure(figsize=(10,20))
for jpgfnm, image_feature in zip(fnm_test[:npic],di_test[:npic]):
    ## images
    filename = dir_Flickr_jpg + '/' + jpgfnm
    image_load = load_img(filename, target_size=target_size)
    ax = fig.add_subplot(npic,2,count,xticks=[],yticks=[])
    ax.imshow(image_load)
    count += 1

    ## caption
    # The feature vector is reshaped to a single-row batch before prediction.
    caption = predict_caption(image_feature.reshape(1,len(image_feature)))
    ax = fig.add_subplot(npic,2,count)
    plt.axis('off')
    ax.plot()
    ax.set_xlim(0,1)
    ax.set_ylim(0,1)
    ax.text(0,0.5,caption,fontsize=28)
    count += 1

plt.show()

In [87]:
images['1000268201_693b08cb0e.jpg'].shape

(4096,)

**BLEU SCORE**

In [None]:
start = time.time()
from nltk.translate.bleu_score import sentence_bleu
# Reverse lookup: word index -> word.
index_word = {index: word for word, index in tokenizer.word_index.items()}

# Keep at most `nkeep` examples of good and of bad predictions for display.
nkeep = 5
pred_good, pred_bad, bleus = [], [], []
# Progress counter. It was previously initialised under the misspelled name
# `couunt`, so `count` silently continued from an earlier cell's value.
count = 0
for jpgfnm, image_feature, tokenized_text in zip(fnm_test,di_test,dt_test):
    count += 1
    if count % 200 == 0:
        print(" {:4.2f} % is done....".format(100*count/float(len(fnm_test))))
    caption_true = [ index_word[i] for i in tokenized_text ]
    caption_true = caption_true[1:-1] ## remove startseq and endseq
    ## captions
    caption = predict_caption(image_feature.reshape(1,len(image_feature)))
    caption = caption.split()
    caption = caption[1:] ## remove the start token
    # Drop the end token only when it was generated; a caption that ran to
    # maxlen without it would otherwise lose its last real word.
    if caption and caption[-1] == "endseq":
        caption = caption[:-1]

    bleu = sentence_bleu([caption_true],caption)
    bleus.append(bleu)
    if bleu > 0.6 and len(pred_good) < nkeep:
        pred_good.append((bleu,jpgfnm,caption_true,caption))
    elif bleu < 0.3 and len(pred_bad) < nkeep:
        pred_bad.append((bleu,jpgfnm,caption_true,caption))

end = time.time()
# Elapsed minutes (the operands were reversed, giving a negative duration).
print((end - start)/60)

In [None]:
pred_good

In [None]:
pred_bad[:2]

In [None]:
print("Mean BLEU {:4.3f}".format(np.mean(bleus)))

In [None]:
def plot_images(pred_bad):
    """Plot each picture next to its true caption, predicted caption and BLEU score.

    Args:
        pred_bad: Sequence of ``(bleu, jpgfnm, caption_true, caption)`` tuples,
            where both captions are lists of words.
    """
    def create_str(caption_true):
        # Words joined with a leading space before each one.
        return "".join(" " + s for s in caption_true)
    npix = 224
    # Was misspelled `taret_size`, so load_img below silently fell back to
    # the global `target_size`.
    target_size = (npix,npix,3)
    # 1-based subplot slot: odd slots hold pictures, even slots hold text.
    count = 1
    fig = plt.figure(figsize=(10,20))
    npic = len(pred_bad)
    # Unpack directly; the former loop variable `pd` shadowed pandas here.
    for bleu, jpgfnm, caption_true, caption in pred_bad:
        ## image
        filename = dir_Flickr_jpg + '/' + jpgfnm
        image_load = load_img(filename, target_size=target_size)
        ax = fig.add_subplot(npic,2,count,xticks=[],yticks=[])
        ax.imshow(image_load)
        count += 1

        caption_true = create_str(caption_true)
        caption = create_str(caption)

        ## text panel
        ax = fig.add_subplot(npic,2,count)
        plt.axis('off')
        ax.plot()
        ax.set_xlim(0,1)
        ax.set_ylim(0,1)
        ax.text(0,0.7,"true:" + caption_true,fontsize=20)
        ax.text(0,0.4,"pred:" + caption, fontsize=20)
        ax.text(0,0.1,"BLEU: {}".format(bleu),fontsize=20)
        count += 1
    plt.show()
    
print("Bad Caption")
plot_images(pred_bad)
print("Good Caption")
plot_images(pred_good)

**GUI**

In [None]:
#import libraries

import tkinter as tk
from PIL import Image, ImageTk
from tkinter import filedialog

In [4]:
import os

os.chdir('C:/Users/Omshree/Desktop/NLP')
os.getcwd()

'C:\\Users\\Omshree\\Desktop\\NLP'

In [5]:
def upload_img():
    """Let the user pick an image file and show it, with its name, in ``frame``.

    Sets the module-level ``image_data`` (path of the chosen file) and ``img``
    (the Tk photo shown in the frame). Nothing changes if the dialog is
    cancelled.
    """
    # `image_data` was misspelled `image_date`, which kept the chosen path
    # local to this function and invisible to caption().
    global img, image_data

    # The option is `initialdir`; the misspelled `initaldir` is rejected by Tk.
    selected = filedialog.askopenfilename(initialdir="/", title="Choose an image", filetypes=(("all files","*.*"),("png files","*.png")))
    if not selected:
        # Dialog cancelled: keep whatever is currently displayed.
        return
    image_data = selected

    # Clear the widgets left over from a previous selection.
    for img_display in frame.winfo_children():
        img_display.destroy()

    # Scale the picture to a fixed width, preserving its aspect ratio.
    basewidth = 300
    img = Image.open(image_data)
    wpercent = basewidth / float(img.size[0])
    hsize = int(float(img.size[1]) * wpercent)
    # Image.LANCZOS is the same filter as the removed Image.ANTIALIAS alias.
    img = img.resize((basewidth, hsize), Image.LANCZOS)
    # Stored in the global `img` (declared above), not in a local variable.
    img = ImageTk.PhotoImage(img)
    file_name = image_data.split('/')
    tk.Label(frame, text=str(file_name[-1]).upper()).pack()
    tk.Label(frame, image=img).pack()
    
def caption():
    """Generate a caption for the image chosen via upload_img() and show it in ``frame``."""
    try:
        path = image_data
    except NameError:
        # No image has been chosen yet in this session.
        path = None
    if not path:
        tk.Label(frame, text="Please choose an image first.", font=("Helvetica",12)).pack()
        return

    # Force 3 channels: grayscale or RGBA files would otherwise produce an
    # array that does not match the network's (224, 224, 3) input.
    original = Image.open(path).convert("RGB")
    # Image.LANCZOS is the same filter as the removed Image.ANTIALIAS alias.
    original = original.resize((224,224), Image.LANCZOS)
    numpy_image = img_to_array(original)
    nimage = preprocess_input(numpy_image)

    # Add a leading batch axis of size 1 and extract the image feature.
    feature = modelvgg.predict(nimage.reshape((1,) + nimage.shape[:3]))
    # Strip the start token, and the end token only if it was generated;
    # fixed-width slicing cut real words when 'endseq' was missing.
    words = predict_caption(feature).split()[1:]
    if words and words[-1] == "endseq":
        words = words[:-1]
    tk.Label(frame, text="Caption: " + " ".join(words),font=("Helvetica",12)).pack()

In [88]:
# Build the main window. Creating tk.Tk() requires a graphical display.
root = tk.Tk()
root.title('IMAGE CAPTION NLP')
try:
    root.iconbitmap('class.ico')
except tk.TclError:
    # A missing or unsupported icon is cosmetic; keep the default icon.
    pass
root.resizable(False, False)
tit = tk.Label(root, text="IMAGE CAPTION GENERATOR", padx=25, pady=6, font=("",12))
tit.pack()
canvas = tk.Canvas(root, height=500, width=600, bg='#D1EDf2')
canvas.pack()
# White frame placed over the window; it hosts the picture and its caption.
frame = tk.Frame(root, bg='white')
frame.place(relwidth=0.8, relheight=0.8, relx=0.1, rely=0.1)
chose_image = tk.Button(root, text='Choose Image',padx=35,pady=10,fg="black",bg="pink",command=upload_img, activebackground="#add8e6")
chose_image.pack(side=tk.LEFT)

caption_image = tk.Button(root, text='Classify Image',padx=35, pady=10,fg="black", bg="pink", command=caption, activebackground="#add8e6")
# tkinter's constant is upper-case RIGHT; `tk.Right` raises AttributeError.
caption_image.pack(side=tk.RIGHT)
root.mainloop()

TclError: no display name and no $DISPLAY environment variable