In [None]:
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.resnet50 import ResNet50,preprocess_input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import *
import matplotlib.pyplot as plt
import numpy as np
import pickle
import os
import shutil
import json
import re
from collections import Counter

### Downloading the Flickr8k dataset from Kaggle using an API key

In [None]:
!mkdir /root/.kaggle

In [None]:
# Kaggle credentials are taken from the environment so that no secret is stored
# in the notebook itself; the empty-string fallback matches the original
# placeholder values.
token = {
    "username": os.environ.get("KAGGLE_USERNAME", ""),
    "key": os.environ.get("KAGGLE_KEY", ""),
}
# Make sure the target directory exists so this cell can be re-run on its own.
os.makedirs('/root/.kaggle', exist_ok=True)
with open('/root/.kaggle/kaggle.json', 'w') as file:
    json.dump(token, file)

In [None]:
!cp /.kaggle/kaggle.json ~/.kaggle/kaggle.json

In [None]:
!kaggle config set -n path -v{/content}

- path is now set to: {/content}


In [None]:
!chmod 600 /root/.kaggle/kaggle.json

In [None]:
!kaggle datasets download -d alessiocorrado99/animals10

Downloading flickr8k.zip to {/content}/datasets/shadabhussain/flickr8k
100% 2.12G/2.13G [00:48<00:00, 48.7MB/s]
100% 2.13G/2.13G [00:48<00:00, 47.0MB/s]


In [None]:
!unzip /content/flickr8k.zip

In [None]:
img_path='/content/flickr_data/Flickr_Data/Images'

In [None]:
os.listdir(img_path)[5]

'540604040_bec822c144.jpg'

### Preprocess Captions

In [None]:
# Group every caption under the image it belongs to. Each record is split on
# a tab into an image reference and its caption, and anything after '#' in
# the reference is dropped so all captions of one image share a key.
all_caption={}
with open('flickr_data/Flickr_Data/Flickr_TextData/Flickr8k.token.txt') as f:
    data=f.read().split('\n')[:-1]
    for record in data:
        image_ref,text=record.split('\t')
        image_name=image_ref.split('#')[0]
        all_caption.setdefault(image_name,[]).append(text)

In [None]:
print(len(all_caption))

8092


### cleaning data

In [None]:
def clean_data(data):
    """Normalise every caption in ``data`` in place and return ``data``.

    ``data`` maps a key to a list of caption strings. Each caption is
    lower-cased, every run of characters outside ``a-z`` is replaced by a
    single space, words of length one are dropped, and the result is wrapped
    in the ``startseq`` / ``endseq`` markers.

    The caption lists are modified in place, so calling this twice on the same
    dict wraps the captions in the markers a second time.
    """
    def normalise(text):
        letters_only=re.sub('[^a-z]+',' ',text.lower())
        kept=[token for token in letters_only.split() if len(token)>1]
        return 'startseq '+' '.join(kept)+' endseq'

    for captions in data.values():
        for pos,raw in enumerate(captions):
            captions[pos]=normalise(raw)
    return data

In [None]:
all_caption=clean_data(all_caption)

In [None]:
all_caption['1084104085_3b06223afe.jpg']

['startseq woman in yellow shirt and green shorts is climbing an artificial red wall whilst wearing safety harness endseq',
 'startseq woman in blue shorts and white shirt is indoor rock climbing endseq',
 'startseq the girl is climbing rock wall endseq',
 'startseq the girl is in harness climbing rocks endseq',
 'startseq woman climbing an artificial rock wall endseq']

### creating vocab

In [None]:
# Count how often each whitespace-separated token occurs across all captions.
word_counts=Counter()
for captions in all_caption.values():
    for sentence in captions:
        word_counts.update(sentence.split())
words=dict(word_counts)
print(len(words))

8426


In [None]:
words=sorted(words.items(),key=lambda x:x[1],reverse=True)

In [None]:
# Keep only the tokens that occur more than 10 times; `words` holds
# (token, count) pairs at this point.
word_list=[token for token,count in words if count>10]
print(len(word_list))

1847


### Mapping between words and indexes

In [None]:
# Indexing starts at 1 because index 0 is reserved for padding the sentences.
word_to_index={token:idx for idx,token in enumerate(word_list,start=1)}
index_to_word={idx:token for idx,token in enumerate(word_list,start=1)}

In [None]:
word_to_index['the']

4

In [None]:
index_to_word[4]

'the'

### Creating Train and validation data

In [None]:
def _read_image_names(path):
    """Return the newline-separated entries of ``path``, dropping the piece after the last newline."""
    with open(path) as f:
        return f.read().split('\n')[:-1]

train=_read_image_names('flickr_data/Flickr_Data/Flickr_TextData/Flickr_8k.trainImages.txt')
val=_read_image_names('flickr_data/Flickr_Data/Flickr_TextData/Flickr_8k.devImages.txt')

In [None]:
def caption(data, captions=None):
    """Select the captions belonging to the image names in ``data``.

    Args:
        data: iterable of image names.
        captions: mapping from image name to its list of captions. Defaults to
            the notebook-level ``all_caption`` so existing calls keep working.

    Returns:
        A dict with one entry per name in ``data`` that is present in
        ``captions``, in the order the names appear in ``data``. Names that
        are missing from ``captions`` are skipped.
    """
    if captions is None:
        # Fall back to the global mapping built earlier in the notebook.
        captions = all_caption
    return {name: captions[name] for name in data if name in captions}

In [None]:
train_description=caption(train)
val_description=caption(val)

In [None]:
print(len(train_description),len(val_description))

6000 1000


In [None]:
train_description['1007129816_e794419615.jpg']

['startseq man in an orange hat starring at something endseq',
 'startseq man wears an orange hat and glasses endseq',
 'startseq man with gauges and glasses is wearing blitz hat endseq',
 'startseq man with glasses is wearing beer can crocheted hat endseq',
 'startseq the man with pierced ears is wearing glasses and an orange hat endseq']

### Saving all the preprocessed text data for future use

In [None]:
# Write each preprocessed object to its own pickle file in the working directory.
for filename,obj in (
    ('train_description.pkl',train_description),
    ('val_description.pkl',val_description),
    ('word_to_index.pkl',word_to_index),
    ('index_to_word.pkl',index_to_word),
):
    with open(filename,'wb') as f:
        pickle.dump(obj,f)

### loading saved files

In [None]:
def _read_pickle(path):
    """Unpickle and return the single object stored in ``path``."""
    with open(path,'rb') as f:
        return pickle.load(f)

train_description=_read_pickle('train_description.pkl')
val_description=_read_pickle('val_description.pkl')
word_to_index=_read_pickle('word_to_index.pkl')
index_to_word=_read_pickle('index_to_word.pkl')

### Creating image vector using ResNet-50 model

In [None]:
# Instantiate ResNet-50 with the 'imagenet' weights and print its layer summary.
# `preprocess` below taps this model at its 'avg_pool' layer.
base_model=ResNet50(weights='imagenet')
base_model.summary()

In [None]:
def preprocess(img_list,name):
    """Encode every image in ``img_list`` and pickle the resulting vectors.

    Each file name is resolved against the notebook-level ``img_path``, loaded
    at 224x224, run through ``preprocess_input`` and then through
    ``base_model`` truncated at its ``avg_pool`` layer. The flattened outputs
    are collected in a dict keyed by file name and written to
    ``encoding_<name>.pkl`` in the working directory.

    Args:
        img_list: sequence of image file names relative to ``img_path``.
        name: suffix used to build the output file name.

    An empty ``img_list`` produces a pickle containing an empty dict.
    """
    model = Model(inputs=base_model.input, outputs=base_model.get_layer('avg_pool').output)
    di={}
    total=len(img_list)
    features=None
    for count,img_name in enumerate(img_list,start=1):
        print(count,'/',total)
        img = image.load_img(os.path.join(img_path,img_name),target_size=(224,224))
        x = image.img_to_array(img,dtype='uint8')
        # The model expects a batch axis, so wrap the single image as a batch of one.
        x = np.expand_dims(x, axis=0)
        x = preprocess_input(x)
        preds = model.predict(x)
        features=preds.flatten()
        di[img_name]=features
    # Only report the vector shape when at least one image was encoded; the
    # previous unconditional print failed with NameError on an empty list.
    if features is not None:
        print(features.shape)
    with open('encoding_'+name+'.pkl','wb') as f:
        pickle.dump(di,f)

In [None]:
# The image names to encode are the keys of the description dicts.
train=list(train_description)
val=list(val_description)

In [None]:
preprocess(train,'train')

In [None]:
preprocess(val,'val')

### creating embedding matrix

In [None]:
# Parse the GloVe text file into a word -> vector lookup. On every line the
# first whitespace-separated field is the word and the remaining fields are
# converted to a float32 array.
with open('glove.6B.50d.txt','r') as glove_file:
    all_embeddings=glove_file.read()

all_em_dict={}
for line in all_embeddings.split('\n')[:-1]:
    fields=line.split()
    all_em_dict[fields[0]]=np.asarray(fields[1:],dtype='float32')

In [None]:
len(all_em_dict)

400000

In [None]:
def create_embedding(word_to_index, embeddings=None, dim=50):
    """Build the embedding matrix for the vocabulary in ``word_to_index``.

    Args:
        word_to_index: mapping from word to its row index in the matrix.
        embeddings: mapping from word to its embedding vector. Defaults to the
            notebook-level ``all_em_dict``.
        dim: length of each embedding vector (50 by default).

    Returns:
        A ``(len(word_to_index) + 1, dim)`` array. Row ``word_to_index[w]``
        holds the vector of ``w``; rows for words without a vector, and row 0,
        stay all zeros.
    """
    if embeddings is None:
        # Fall back to the global lookup built earlier in the notebook.
        embeddings = all_em_dict
    embedding_idx = np.zeros((len(word_to_index) + 1, dim))
    for word, idx in word_to_index.items():
        vector = embeddings.get(word)
        if vector is not None:
            embedding_idx[idx] = vector
    return embedding_idx

In [None]:
embedding_idx=create_embedding(word_to_index)

In [None]:
len(embedding_idx)

1848

In [None]:
np.save('embedding_idx.npy',embedding_idx,allow_pickle=True)