Importing libraries

In [38]:
import json as js
from collections import defaultdict
import string
from keras import layers #type : ignore
import nltk
from tqdm import tqdm

Loading file paths from the JSON config

In [39]:
# Config_RNN.json holds the file paths used by the cells below
# (looked up by key, e.g. paths["Corpus"]).
with open('Config_RNN.json', 'r') as config_fp:
    paths = js.load(config_fp)

Loading initial train data

In [40]:
# Parse the training annotation JSON into memory; later cells read its
# "images" and "annotations" entries.
with open(paths["Caption_Train"], 'r') as train_fp:
    train_data = js.load(train_fp)

Extracting the important data (image records and caption annotations)

In [41]:
# Keep only the two sections of the annotation file that are used downstream:
# the image records and the caption annotations.
newdict = {
    "img_data": train_data["images"],
    "annotations_data": train_data["annotations"],
}

Creating Corpus

In [46]:
img_data = newdict["img_data"]
annotations_data = newdict["annotations_data"]

# Group every caption under the id of the image it annotates.
captions_dict = defaultdict(list)
for annotation in annotations_data:
    captions_dict[annotation["image_id"]].append(annotation["caption"])

# Re-key the grouped captions by image file name; an image that has no
# annotations maps to an empty list (defaultdict lookup).
image_caption_data = {
    image["file_name"]: captions_dict[image["id"]] for image in img_data
}

# Persist the corpus as {file_name: [caption, ...]}.
with open(paths["Corpus"], "w") as corpus_fp:
    js.dump(image_caption_data, corpus_fp, indent=4)


Loading the corpus

In [4]:
# Read back the corpus JSON stored at paths["Corpus"].
with open(paths["Corpus"], "r") as corpus_fp:
    Corpus_data = js.load(corpus_fp)

Downloading the NLTK resources (tokenizer and POS tagger)

In [10]:
# Tokenizer tables used by nltk.word_tokenize.
nltk.download('punkt_tab')
# Tagger model used by nltk.pos_tag. NLTK releases that ship `punkt_tab` look the
# English tagger up under the `_eng` name, so fetch that one as well; the legacy
# package is kept for older NLTK releases.
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/utkarsh/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/utkarsh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

Text tokenization, POS tagging and noun extraction

In [16]:
# Part-of-speech tags treated as nouns when filtering the tagged tokens.
NOUN_TAGS = ('NN', 'NNS', 'NNP', 'NNPS')

pos_tagged_captions = {}
noun_captions = {}

# The corpus file is a dict {file_name: [caption, ...]} (see the "Creating Corpus"
# cell), so iterate over its items; iterating the dict directly yields only the
# file-name strings and indexing those with 'captions' raises a TypeError.
for file_name, captions in tqdm(Corpus_data.items()):
    # One list of (token, tag) pairs per caption, in caption order.
    pos_tagged_captions[file_name] = [
        nltk.pos_tag(nltk.word_tokenize(caption)) for caption in captions
    ]

# Keep only the noun tokens of each tagged caption, preserving caption order so the
# i-th noun list still corresponds to the i-th caption of the image.
for image_id, tagged_captions in tqdm(pos_tagged_captions.items()):
    noun_captions[image_id] = [
        [word for word, tag in tagged if tag in NOUN_TAGS]
        for tagged in tagged_captions
    ]

100%|██████████| 118287/118287 [02:28<00:00, 794.38it/s]
100%|██████████| 118287/118287 [00:02<00:00, 47934.03it/s]


Storing the noun tags

In [33]:
# Write the per-image noun lists to the path configured under 'Nouned_Corpus'.
with open(paths['Nouned_Corpus'], 'w') as nouns_fp:
    js.dump(noun_captions, nouns_fp)

In [51]:
# Reload both artifacts from disk: the captions and the noun lists extracted from them.
with open(paths["Corpus"], 'r') as corpus_fp:
    captions_data = js.load(corpus_fp)

with open(paths['Nouned_Corpus'], 'r') as nouns_fp:
    features_data = js.load(nouns_fp)

# Build one {"input", "output"} record per caption: the input is the caption's nouns
# joined by spaces, the output is the caption itself. Noun lists and captions are
# paired by position.
dataset = []
for image_id, noun_lists in tqdm(features_data.items()):
    dataset.extend(
        {"input": " ".join(caption_nouns), "output": caption_text}
        for caption_nouns, caption_text in zip(noun_lists, captions_data[image_id])
    )

with open(paths["Preprocessed_data"], "w") as dataset_fp:
    js.dump(dataset, dataset_fp, indent=2)

100%|██████████| 118287/118287 [00:00<00:00, 379465.09it/s]


Text Vectorization

In [9]:
# Load the corpus JSON again as the input of the vectorization step.
with open(paths["Corpus"], 'r') as corpus_fp:
    data = js.load(corpus_fp)

# Translation table that deletes every character in string.punctuation; built once
# here instead of on every clean_caption call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

def clean_caption(caption):
    """Normalize a caption: lowercase it, drop punctuation, trim outer whitespace.

    Args:
        caption: Raw caption string.

    Returns:
        The cleaned caption. Only characters in ``string.punctuation`` are removed,
        and whitespace inside the caption is left untouched, so deleting a
        free-standing punctuation mark can leave two consecutive spaces.
    """
    return caption.lower().translate(_PUNCTUATION_TABLE).strip()

# Vectorizer settings: cap on the vocabulary size and the fixed length every
# vectorized caption is padded/truncated to.
MAX_VOCAB_SIZE = 10000
MAX_CAPTION_LENGTH = 30

# The corpus file is a dict {file_name: [caption, ...]} (see the "Creating Corpus"
# cell), so iterate over its items; iterating the dict directly yields only the
# file-name strings and indexing those with 'captions' raises a TypeError.
caption_dict = {
    file_name: [clean_caption(raw_caption) for raw_caption in captions]
    for file_name, captions in data.items()
}

# Wrap every cleaned caption in explicit start/end marker tokens.
all_captions = [
    f'startseq {caption} endseq'
    for captions in caption_dict.values()
    for caption in captions
]

vectorizer = layers.TextVectorization(
    max_tokens=MAX_VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=MAX_CAPTION_LENGTH
)

# Learn the vocabulary from the wrapped captions.
vectorizer.adapt(all_captions)
vocab = vectorizer.get_vocabulary()
vocab_size = len(vocab)

# Persist the learned vocabulary as a JSON list.
with open(paths["Preprocessed_Corpus"], "w") as f:
    js.dump(vocab, f)

print("Vocabulary Size:", vocab_size)

Vocabulary Size: 10000
