In [2]:
#importing libraries   
import pandas as pd
import numpy as np
import re
import cv2 
import os
import glob
import json
import tensorflow as tf
import matplotlib.pyplot as plt
from datasets import Dataset, concatenate_datasets

  from .autonotebook import tqdm as notebook_tqdm


### The Below Cell was used to downlaod data from hugging face ( required for first time only )

In [2]:
#importing Images from huggingface sbu_Captions dataset
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import io
import urllib

import PIL.Image

from datasets import load_dataset
from datasets.utils.file_utils import get_datasets_user_agent


USER_AGENT = get_datasets_user_agent()


def fetch_single_image(image_url, timeout=None, retries=0):
    for _ in range(retries + 1):
        try:
            request = urllib.request.Request(
                image_url,
                data=None,
                headers={"user-agent": USER_AGENT},
            )
            with urllib.request.urlopen(request, timeout=timeout) as req:
                image = PIL.Image.open(io.BytesIO(req.read()))
            break
        except Exception:
            image = None
    return image


def fetch_images(batch, num_threads, timeout=None, retries=0):
    fetch_single_image_with_args = partial(fetch_single_image, timeout=timeout, retries=retries)
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        batch["image"] = list(executor.map(fetch_single_image_with_args, batch["image_url"]))
    return batch


num_threads = 12

# this load_dataset loads only 10% of the dataset
dset = load_dataset("sbu_captions", split='train[:5%]')

# the data has only image url and not image so we'll download it and stgore it 
dset = dset.map(fetch_images, batched=True, batch_size=100, fn_kwargs={"num_threads": num_threads})
dset.save_to_disk("data/")

  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 50000/50000 [29:40<00:00, 28.08 examples/s]


### Loading the downloaded dataset

In [2]:
from datasets import Dataset, concatenate_datasets

  from .autonotebook import tqdm as notebook_tqdm


#### Combining the data and splitting it for train and test

In [91]:
file_list = glob.glob('data\*.arrow')
final_dataset = Dataset.from_file(file_list[0])

print(len(final_dataset))

for i in range(1, len(file_list)):
    temp = Dataset.from_file(file_list[i])
    temp2 = concatenate_datasets([final_dataset, temp])
    final_dataset = temp2

    print('INSIDE FOR LOOP', len(final_dataset))

    
final_dataset = final_dataset.remove_columns(['user_id', 'image_url'])

## Splitting the dataset into train and test and saving it 

# final_dataset = final_dataset.train_test_split(test_size=0.2)
# final_dataset['train'].save_to_disk('final_data/train', num_shards=1)
# final_dataset['test'].save_to_disk('final_data/test', num_shards=1)

16667
INSIDE FOR LOOP 33334
INSIDE FOR LOOP 50000


### creating train & test folder with images and their caption stored in a text file

In [3]:
temp = Dataset.from_file('finaldata\\train\\data-00000-of-00001.arrow')
print(temp.features)


from PIL import Image

c = 0

def save(row):
    print(row)
    global c
    if row['image'] is not None:
        row['image'].save(f'artifacts\/train\img\img_{c}.jpeg')
        with open('artifacts\/train\caption.txt', 'a') as f:
            f.write(f"img_{c}.jpg, {row['caption']}")
            f.write('\n')

        c += 1

temp.map(save)


{'caption': Value(dtype='string', id=None),
 'image': Image(decode=True, id=None)}

## Loading Train Data and processing the captions

In [4]:

# Preprocess the caption, splitting the string and adding <start> and <end> tokens
def get_preprocessed_caption(caption): 
    res = []   
    captions = re.sub(r'\s+', ' ', caption['caption'])
    captions = captions.strip()
    captions = "<start> " + captions + " <end>"
    caption['caption'] = captions
    return caption


updated_dataset = temp.map(get_preprocessed_caption).with_format('torch')


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map: 100%|██████████| 40000/40000 [00:03<00:00, 10184.04 examples/s]


In [5]:
top_k = 5000 # Take maximum of words out of 7600
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k,
                                                  oov_token="<unk>",
                                                  filters='!"#$%&()*+.,-/:;=?@[\]^_`{|}~ ')

# Generate vocabulary from train captions
tokenizer.fit_on_texts(updated_dataset['caption'])

# Introduce padding to make the captions of the same size for the LSTM model
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'

# Create the tokenized vectors
y_train = tokenizer.texts_to_sequences(updated_dataset['caption'])

# Add padding to each vector to the max_length of the captions (automatically done)
y_train = tf.keras.preprocessing.sequence.pad_sequences(y_train, padding='post')

In [32]:
len(y_train[0])

74

In [15]:
import pickle

# with open('artifacts\\tokenizer.pickle', 'wb') as f:
#     pickle.dump(tokenizer, f)


with open('artifacts\\tokenizer.pickle', 'rb') as f: 
    t = pickle.load(f)


In [19]:
updated_dataset[10]['caption']

'<start> pelicans flying over a rock <end>'

In [33]:
x = t.texts_to_sequences(updated_dataset[10]['caption'])
x = tf.keras.preprocessing.sequence.pad_sequences(x, padding='post', maxlen=74)
len(x)

41

In [11]:
caption_dir = 'data\/test\caption.txt'
x = 0
with open(caption_dir, 'r', encoding='utf-8') as file: 
    for i, line in enumerate(file):
        img_name = line.split(',')[0]
        cap = line.split(',')[1].strip()
        x += 1
        print(img_name, '-------', cap)
        if x == 5:
            break
        

img_1.jpg ------- We'd empty out of the house every morning around 8
img_2.jpg ------- desk in upstairs bigger bedroom
img_3.jpg ------- Somewhere on the floor of a bar in &amp;quot;rue l'Olive&amp;quot;
img_4.jpg ------- Obviously the blue roof does not work in the composition.
img_5.jpg ------- Looking at a bridge across Cutler Lake. The reflection of the sky in the lake caught my eye.


In [29]:
import os
import glob
import pandas as pd
import torch
from torchvision.io import read_image
from torch.utils.data import DataLoader
from datasets import concatenate_datasets
from torchvision import transforms

def preprocessing_transforms():
    return transforms.Compose([
        transforms.Resize((256,256)),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

def denormalize(image: torch.Tensor):
    inv_normalize = transforms.Normalize(
        mean=[-0.485 / 0.229, -0.456 / 0.224, -0.406 / 0.225],
        std=[1 / 0.229, 1 / 0.224, 1 / 0.225]
    )
    return (inv_normalize(image) * 255.).type(torch.uint8).permute(1, 2, 0).numpy()
    



class CustomImageDataset():
    def __init__(self, image_dir, caption_dir, transform=preprocessing_transforms()):

        self.transform = transform

        with open(caption_dir, 'r', encoding='utf-8') as file: 
            for i, line in enumerate(file):
                
        

    def __len__(self):
        return len(self.final_dataset)

    def __getitem__(self, idx):
        sample = self.final_dataset[idx]
        # return sample['image'], self.y_train[idx]
        return sample['image']
    
    def get_preprocessed_caption(self, caption):   
        captions = re.sub(r'\s+', ' ', caption['caption'])
        captions = captions.strip()
        captions = "<start> " + captions + " <end>"
        caption['caption'] = captions
        return caption

In [30]:
from torch.utils.data import DataLoader

training_data = CustomImageDataset('finaldata\\test\\data-00000-of-00001.arrow')


# train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)

In [32]:
training_data

<__main__.CustomImageDataset at 0x19aaff00160>

In [33]:
train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x19aaff5baf0>

In [34]:
train_features, train_labels = next(iter(train_dataloader))

# for i, batch in enumerate(train_dataloader):
#     print(batch)

TypeError: expected Tensor as element 5 in argument 0, but got NoneType

In [12]:
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions


model = tf.keras.applications.ResNet50(
    include_top=False,
    weights="imagenet",
    input_shape=(224, 224, 3)
)

x = tf.keras.layers.Resizing(224,224)(dataset[0]['image'])
x = tf.expand_dims(x, axis=0)
x = preprocess_input(x)

model(x)


In [25]:
# tf.reshape(dataset[0]['image'], (224,224,3))



<tf.Tensor: shape=(1, 7, 7, 2048), dtype=float32, numpy=
array([[[[0.        , 0.        , 0.        , ..., 0.        ,
          0.        , 0.        ],
         [0.        , 0.        , 0.        , ..., 0.        ,
          0.        , 0.        ],
         [0.        , 0.        , 0.        , ..., 0.        ,
          0.        , 0.        ],
         ...,
         [0.        , 0.        , 0.        , ..., 0.        ,
          0.        , 0.        ],
         [0.        , 0.        , 0.        , ..., 0.        ,
          0.        , 0.        ],
         [0.        , 0.        , 0.        , ..., 0.        ,
          0.        , 0.        ]],

        [[0.        , 0.        , 0.        , ..., 0.        ,
          0.        , 0.        ],
         [2.5716877 , 0.        , 0.        , ..., 0.        ,
          0.        , 0.        ],
         [0.        , 0.        , 0.        , ..., 0.        ,
          0.        , 0.        ],
         ...,
         [0.        , 0.       