# Data Preprocessing 

1. Download the data.
2. Process / Prepare the data.

    *   Image Data Preprocessing 
    *   Text Data Preprocessing 

3. Save the processed dataset 

  *   Create training and testing datasets 
  *   Save training and testing data as npz (compressed numpy files)






## Step 1: Downloading the data

In [None]:
# Imports 

## For uploading dataset 

import os 
from google.colab import drive
from google.colab import files

## for CNN_Encoder (Feature extractor model)

from pickle import dump
from os import listdir
import string
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.models import Model


from numpy import array
from pickle import load
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

## 

In [None]:
###################### Mounting the drive ##############################

# Mount Google Drive into the Colab filesystem at /content/gdrive
drive.mount('/content/gdrive')

####################### Uploading the kaggle API key ###################

files.upload() # prompts you to upload kaggle.json (the Kaggle API key file copied below)

# Install the Kaggle CLI and put the uploaded key where the commands below expect it
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json  # set permission

####################### Downloading the dataset ########################

!kaggle datasets download -d shadabhussain/flickr8k

os.chdir('/content')  # change the working directory to /content
!mkdir Flicker8k_Dataset  #create a directory 
!unzip -q flickr8k.zip -d Flicker8k_Dataset

# List what the archive extracted to, as a quick check of the directory layout
os.listdir('/content/Flicker8k_Dataset/')


Mounted at /content/gdrive


Saving kaggle.json to kaggle.json
kaggle.json
Downloading flickr8k.zip to /content
 99% 2.10G/2.13G [00:21<00:00, 84.3MB/s]
100% 2.13G/2.13G [00:21<00:00, 107MB/s] 


['model_weights.h5', 'train_encoded_images.p', 'flickr_data', 'Flickr_Data']

## Step 2: Preparing and Processing the data

### Image Preprocessing 

In [None]:

def process_img(filename):
    """
    Load one image file and prepare it as input for the VGG16 model.

    The image is resized to (224, 224) while loading, converted to a numpy
    array, given a leading batch axis of size 1, and finally passed through
    preprocess_input from tensorflow.keras.applications.vgg16.

    INPUT : Image file path
    Output : Preprocessed image array with a leading batch axis of 1
    """
    pixels = img_to_array(load_img(filename, target_size=(224, 224)))

    # The model consumes batches, so wrap the single image in a batch of one.
    batch = pixels.reshape((1,) + pixels.shape[:3])

    return preprocess_input(batch)



In [None]:

def extract_features(directory):
    """
    Extract image features for every file in a directory with pretrained VGG16.

    The classifier head of VGG16 is dropped by wiring a new Model from the
    original inputs to the output of the second-to-last layer, so each image
    is described by that layer's activations instead of class probabilities.

    Input : Path to image directory
    Output : A dict = { key -->  Image_ID (file name up to the first '.')
                        value --> Array returned by model.predict for a
                                  batch containing that single image
                        }
    """
    # Prepare the model.
    # `model.layers.pop()` does not change the graph on tf.keras (the layers
    # list is rebuilt on access), so the second-to-last layer is selected
    # explicitly instead of popping and then taking layers[-1].
    base_model = VGG16()
    model = Model(inputs=base_model.inputs,
                  outputs=base_model.layers[-2].output)
    model.summary()  # summary() prints by itself; wrapping it in print() adds "None"

    # Extract features from each photo in the directory.
    features = dict()
    for name in listdir(directory):
        image_id = name.split('.')[0]
        filename = directory + '/' + name
        image = process_img(filename)
        features[image_id] = model.predict(image, verbose=0)

    return features


In [None]:
########    extract features from all images

directory = '/content/Flicker8k_Dataset/Flickr_Data/Flickr_Data/Images'
features = extract_features(directory)
#print('Extracted Features: %d' % len(features))

# Save to file. The context manager guarantees the pickle is flushed and the
# handle is closed even if dump() raises part-way through.
with open('features.pkl', 'wb') as features_file:
    dump(features, features_file)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     14758

### Text Preprocessing

####  extract descriptions for images

In [None]:


def load_and_clean_descriptions(filename):
    """
    Load the raw caption file and return cleaned descriptions per image.

    Each line is split on whitespace: the first token is the image file name
    (e.g. 1000268201_693b08cb0e.jpg#0) and the rest is the caption. Lines
    with fewer than three tokens (blank lines, or an ID followed by a single
    word) are skipped.

    Cleaning applied to every caption word: lower-casing, removal of all
    characters in string.punctuation, and dropping words of length <= 1.

    Input : Path to the raw Flickr8k.token.txt file

    Output : A dictionary = {
                            key -->  Image_ID (file name up to the first '.')
                            value --> A list containing the cleaned
                                      descriptions of the image
                            }
    """
    # The context manager closes the file even if read() fails.
    with open(filename, 'r') as file:
        doc = file.read()

    # Translation table that deletes punctuation from each token.
    table = str.maketrans('', '', string.punctuation)

    mapping = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        if len(tokens) <= 2:
            continue

        # sample image token :: 1000268201_693b08cb0e.jpg#0
        image_id = tokens[0].split('.')[0]

        cleaned = (word.lower().translate(table) for word in tokens[1:])
        image_desc = ' '.join(word for word in cleaned if len(word) > 1)

        mapping.setdefault(image_id, []).append(image_desc)

    return mapping


In [None]:

def to_vocabulary(descriptions):
    """
    Build the vocabulary of words used across all descriptions.

    Input : A dictionary = {
                            key -->  Image_ID
                            value --> A list containing descriptions of image
                            }

    Output : A set containing every unique whitespace-separated word found
             in the descriptions of all images
    """
    vocabulary = set()
    # Plain loops: a comprehension here would only build a throwaway list.
    for desc_list in descriptions.values():
        for desc in desc_list:
            vocabulary.update(desc.split())

    return vocabulary
 


# 
def save_descriptions(descriptions, filename):
    """
    Save descriptions to a text file, one "<Image_ID> <description>" per line.

    Lines are joined with '\n' and no trailing newline is written.

    Input : 1. Dictionary = {
                            key -->  Image_ID
                            value --> A list containing descriptions of image
                            }
            2. Path of the file to be saved

    Output : None
    """
    lines = [
        key + ' ' + desc
        for key, desc_list in descriptions.items()
        for desc in desc_list
    ]

    # The context manager closes (and flushes) the file even if write() fails.
    with open(filename, 'w') as file:
        file.write('\n'.join(lines))
 

In [None]:

# Raw caption file shipped inside the extracted dataset
filename = '/content/Flicker8k_Dataset/Flickr_Data/Flickr_Data/Flickr_TextData/Flickr8k.token.txt'
 
# Parse and clean the descriptions; the printed count is the number of image IDs
descriptions = load_and_clean_descriptions(filename)
print('Loaded: %d ' % len(descriptions))


# Summarize the vocabulary (unique words across all cleaned descriptions)
vocabulary = to_vocabulary(descriptions)
print('Vocabulary Size: %d' % len(vocabulary))


# Save the cleaned descriptions to descriptions.txt in the current working directory
save_descriptions(descriptions, 'descriptions.txt')




Loaded: 8092 
Vocabulary Size: 8808


In [None]:
# Copy the cleaned descriptions file into the Imagecap folder on the mounted Drive.
# NOTE(review): features.pkl is written to the working directory earlier and later
# read from the Drive Imagecap folder, but no copy step for it is visible here — confirm.
!cp "/content/descriptions.txt" "/content/gdrive/MyDrive/Imagecap"

## Step 3 : Saving the processed dataset 


### A. Create training and testing datasets

In [None]:

def load_set(filename):
    """
    Read a list of image file names and return the set of image IDs.

    The image ID is the part of each line before the first '.'. Blank and
    whitespace-only lines are skipped so they cannot become bogus IDs.

    Input : Path to text file containing one image file name per line

    Output : A set of all image IDs given in the file
    """
    # The context manager closes the file even if read() fails.
    with open(filename, 'r') as file:
        doc = file.read()

    dataset = set()
    for line in doc.split('\n'):
        line = line.strip()
        if not line:  # skip empty lines
            continue
        dataset.add(line.split('.')[0])  # get image ID

    return dataset


 
 
def load_clean_descriptions(filename, dataset):
    """
    Load preprocessed descriptions for a set of images and prepare them
    for the RNN by wrapping each one in 'startseq' / 'endseq' tags.

    Each non-blank line is expected to be "<Image_ID> <description words>".
    Blank lines (for example a trailing newline at the end of the file) are
    skipped instead of raising IndexError.

    Inputs : 1. Path to file storing the preprocessed descriptions
             2. A set of image IDs whose descriptions are to be loaded

    Output : A dictionary = {
                            key -->  Image_ID
                            value --> A list containing the tagged
                                      descriptions of the image
                            }
    """
    # The context manager closes the file even if read() fails.
    with open(filename, 'r') as file:
        doc = file.read()

    descriptions = dict()
    for line in doc.split('\n'):
        tokens = line.split()  # split line by white space
        if not tokens:
            continue  # nothing to parse on a blank line

        image_id, image_desc = tokens[0], tokens[1:]
        if image_id not in dataset:
            continue  # skip images that are not part of the requested set

        # Wrap the description in start / end tokens.
        desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
        descriptions.setdefault(image_id, []).append(desc)

    return descriptions



In [None]:


def load_img_features(filename, dataset):
    """
    Load the image features of the images in the given set of image IDs.

    Inputs : 1. Path to a pickle file holding a dict of Image_ID --> features
             2. An iterable of image IDs to keep

    Output : A dict with one entry per ID in `dataset`

    Raises : KeyError if an ID in `dataset` has no entry in the pickle file
    """
    # NOTE: pickle can execute arbitrary code while loading; only open
    # feature files that this pipeline produced itself.
    with open(filename, 'rb') as features_file:
        all_features = load(features_file)  # load all features

    # Keep only the requested images.
    return {k: all_features[k] for k in dataset}
 

def to_lines(descriptions):
    """
    Convert a dictionary of clean descriptions to a flat list of descriptions.

    Descriptions are returned in dictionary order, and within each image in
    the order of that image's list.

    Input : Dictionary of Image_ID --> list of descriptions
    Output : A list containing every description
    """
    return [desc for desc_list in descriptions.values() for desc in desc_list]
 
# 
def create_tokenizer(descriptions):
    """
    Create a Tokenizer and fit it on every caption in `descriptions`.

    Input : Dictionary of Image_ID --> list of descriptions
    Output : The fitted Tokenizer
    """
    caption_tokenizer = Tokenizer()
    caption_tokenizer.fit_on_texts(to_lines(descriptions))

    return caption_tokenizer
 

def max_length(descriptions):
    """
    Return the word count of the longest description.

    Words are counted by splitting each description on whitespace.

    Input : Dictionary of Image_ID --> list of descriptions
    Output : Number of words in the longest description
    """
    word_counts = [len(caption.split()) for caption in to_lines(descriptions)]

    return max(word_counts)
 

In [None]:

def create_sequences(tokenizer, max_length, descriptions, img_features, vocab_size=None):
    """
    Create the (image, input sequence) --> output word training samples.

    Every description is encoded with the tokenizer and expanded into one
    sample per prefix: the first i words (padded to max_length) are the
    input and word i is the one-hot encoded target.

    Input :  1. tokenizer --> To convert sentences into lists of word indices
             2. max_length --> To pad the input sequence till max_length
             3. descriptions --> To create input sequence to output word pairs
             4. img_features --> To create image input (Image_ID --> features;
                                 element [0] of each entry is used)
             5. vocab_size --> Number of classes for the one-hot targets.
                               Optional; defaults to
                               len(tokenizer.word_index) + 1 so the function
                               no longer relies on a global variable.

    Output : Three numpy arrays: image inputs, padded input sequences,
             one-hot encoded output words
    """
    if vocab_size is None:
        vocab_size = len(tokenizer.word_index) + 1

    X1, X2, y = list(), list(), list()

    for key, desc_list in descriptions.items():  # walk through each image identifier
        for desc in desc_list:  # walk through each description for the image
            seq = tokenizer.texts_to_sequences([desc])[0]  # encode the sequence

            for i in range(1, len(seq)):  # split one sequence into multiple X,y pairs
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]  # pad input sequence
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]  # encode output word

                X1.append(img_features[key][0])
                X2.append(in_seq)
                y.append(out_seq)

    return array(X1), array(X2), array(y)

#### Training Data

In [None]:
# Locations on the mounted Drive of the cleaned descriptions file and the pickled image features
descriptions_path = '/content/gdrive/My Drive/Imagecap/descriptions.txt'
featurs_path = '/content/gdrive/My Drive/Imagecap/features.pkl'

In [None]:
# train dataset

Train_img_IDs = '/content/Flicker8k_Dataset/Flickr_Data/Flickr_Data/Flickr_TextData/Flickr_8k.trainImages.txt'

# Create the set of training image IDs
train = load_set(Train_img_IDs) 
print('Dataset: %d' % len(train))


# Create Train descriptions
train_descriptions = load_clean_descriptions(descriptions_path, train) 
print('Descriptions: train=%d' % len(train_descriptions))
 

# Create Train features
train_features = load_img_features(featurs_path, train)
print('Photos: train=%d' % len(train_features))


# Prepare the tokenizer and persist it. The context manager guarantees the
# pickle is flushed and the handle closed even if dump() fails.
tokenizer = create_tokenizer(train_descriptions)
with open('/content/gdrive/My Drive/Imagecap/tokenizer.pkl', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)


# Determine the maximum sequence length.
# NOTE: this rebinds the global name `max_length` from the function to an int,
# so re-running this cell fails until the function definition is re-run. The
# name is kept because the dev/test cell reads `max_length` as the int.
max_length = max_length(train_descriptions)
print('Description Length: %d' % max_length)


# prepare sequences
X1train, X2train, ytrain = create_sequences(tokenizer, max_length, train_descriptions, train_features)


Dataset: 6000
Descriptions: train=6000
Photos: train=6000
Vocabulary Size: 7614
Description Length: 34


#### Testing Data 

In [None]:
 
# dev dataset: the "test" split in this notebook is read from Flickr_8k.devImages.txt
 
Test_img_IDs = '/content/Flicker8k_Dataset/Flickr_Data/Flickr_Data/Flickr_TextData/Flickr_8k.devImages.txt'


# Create the set of testing image IDs
test = load_set(Test_img_IDs)
print('Dataset: %d' % len(test))


# Create test descriptions
test_descriptions = load_clean_descriptions( descriptions_path , test)
print('Descriptions: test=%d' % len(test_descriptions))


# Create test features
test_features = load_img_features( featurs_path , test)
print('Photos: test=%d' % len(test_features))


# Prepare sequences; `tokenizer` and `max_length` are not defined in this cell,
# so the values left behind by the training cell are reused here
X1test, X2test, ytest = create_sequences(tokenizer, max_length, test_descriptions, test_features)
 

Dataset: 1000
Descriptions: test=1000
Photos: test=1000


### B. Save training and testing data as npz (compressed numpy files)

In [None]:
# Sanity check: the image-input array returned by create_sequences should be a numpy array
type(X1train)

numpy.ndarray

In [None]:
# Number of training samples (first axis of the image-input array)
X1train.shape[0]

306455

In [None]:
# Both training inputs (image features and padded sequences) must have the same sample count
X1train.shape[0] == X2train.shape[0]

True

In [None]:
# Number of training targets (first axis of the one-hot output array)
ytrain.shape[0]

306455

In [None]:
# All three training arrays must agree on the number of samples
X1train.shape[0] == X2train.shape[0] == ytrain.shape[0]

True

In [None]:
# Same consistency check for the three test arrays
X1test.shape[0] == X2test.shape[0] == ytest.shape[0]

True

In [None]:
# Drive folder for the saved arrays; the trailing slash matters because the
# output paths are built by plain string concatenation (base_dir + file name)
base_dir = "/content/gdrive/MyDrive/Imagecap/"

In [None]:
from numpy import savez_compressed 

# Write each training array to its own compressed .npz file in base_dir.
# NOTE(review): the arrays are passed positionally, so per the numpy docs they
# should be stored under the default key 'arr_0' — confirm when loading.
savez_compressed(base_dir + 'X1train.npz', X1train)
savez_compressed(base_dir + 'X2train.npz', X2train)
savez_compressed(base_dir + 'ytrain.npz', ytrain)

In [None]:
# Write each test array to its own compressed .npz file in base_dir
# (relies on savez_compressed having been imported in an earlier cell)
savez_compressed(base_dir + 'X1test.npz', X1test)
savez_compressed(base_dir + 'X2test.npz', X2test)
savez_compressed(base_dir + 'ytest.npz', ytest)