### Imports

In [1]:
#tf.keras.applications.ResNet152
import tensorflow
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions
import numpy as np
import time
import xml.etree.cElementTree as ET
import os

### Initialize Classification Model
Using pre-trained _ImageNet_ weights, the model can classify images into 1000 categories.

In [2]:
# Shared ResNet50 instance used by get_caption(); every constructor argument
# is spelled out explicitly so the configuration is visible in one place.
model = ResNet50(
    # Pre-trained weights and classifier settings
    weights="imagenet",
    include_top=True,
    classes=1000,
    classifier_activation="softmax",
    # Input / pooling options left unset
    input_tensor=None,
    input_shape=None,
    pooling=None,
)

### Functions to get ResNet predictions and create Captions

In [3]:
def get_caption(img_path, limit_result=5):
    """Build a caption for an image from the model's top predicted labels.

    The image is loaded with ``target_size=(224, 224)``, converted to an
    array, given a leading batch axis, preprocessed and passed to the
    module-level ``model``. The descriptions of the decoded predictions are
    joined with spaces, and underscores are replaced by spaces.

    Args:
        img_path: Path of the image file to caption.
        limit_result: Number of top predictions requested from
            ``decode_predictions``.

    Returns:
        str: The space-separated label descriptions.
    """
    pil_img = image.load_img(img_path, target_size=(224, 224))
    batch = np.expand_dims(image.img_to_array(pil_img), axis=0)
    batch = preprocess_input(batch)

    scores = model.predict(batch)
    # decode_predictions gives one list per sample in the batch; each entry
    # is a (class, description, probability) tuple. Only one image was sent.
    top_entries = decode_predictions(scores, limit_result)[0]

    words = []
    for entry in top_entries:
        # Keep a description only when its probability is strictly positive.
        if entry[2] > 0:
            words.append(entry[1])

    joined = ' '.join(words)
    return joined.replace('_', ' ')

### Create XML files

In [4]:
def create_XML_file(id, topic, caption, img_dir, doc_folder='./xmldocs'):
    """Write one caption document to ``<doc_folder>/<id>.xml``.

    The file contains a ``root`` element with a single ``doc`` child holding
    four text fields: DOCID, HEADLINE, TEXT and IMGDIR.

    Args:
        id: Document identifier; ``str(id)`` is used both as the DOCID text
            and as the file name stem.
        topic: Text stored in HEADLINE.
        caption: Text stored in TEXT.
        img_dir: Text stored in IMGDIR.
        doc_folder: Directory that receives the XML file. It is created
            (including missing parents) when it does not exist yet.
    """
    # ElementTree.write() does not create missing directories, so a fresh
    # checkout without the output folder would otherwise fail here.
    if doc_folder:
        os.makedirs(doc_folder, exist_ok=True)

    root = ET.Element('root')
    doc = ET.SubElement(root, 'doc')

    fields = (
        ('DOCID', str(id)),      # incremental ID
        ('HEADLINE', topic),     # name of the folder
        ('TEXT', caption),       # generated caption
        ('IMGDIR', img_dir),     # complete image path
    )
    for tag, text in fields:
        ET.SubElement(doc, tag).text = text

    # os.path.join keeps an empty doc_folder relative to the working
    # directory instead of resolving to the filesystem root.
    out_path = os.path.join(doc_folder, str(id) + '.xml')
    ET.ElementTree(root).write(out_path)

### Generate XML docs for the entire image collection

In [5]:
img_folder = './simple_images'

def produce_xml_docs(img_folder=img_folder):
    """Caption every image under ``img_folder`` and write one XML doc each.

    ``img_folder`` is expected to contain one sub-folder per topic, each
    holding image files. Every image is captioned with ``get_caption`` and
    stored through ``create_XML_file`` with an incremental ID starting at 1;
    the sub-folder name is passed as the topic.

    Args:
        img_folder: Root directory of the image collection.
    """
    doc_id = 0
    # os.listdir returns entries in arbitrary order; sorting makes the
    # ID -> image assignment reproducible between runs.
    for folder in sorted(os.listdir(img_folder)):
        folder_path = img_folder + '/' + folder
        # Skip stray files at the top level (listing them would raise).
        if not os.path.isdir(folder_path):
            continue
        for file in sorted(os.listdir(folder_path)):
            # Paths are joined with '/' so the IMGDIR text keeps the same
            # form on every platform.
            img_dir = folder_path + '/' + file
            # Skip nested directories; only files are handed to the model.
            if not os.path.isfile(img_dir):
                continue
            caption = get_caption(img_dir)
            # The ID is advanced only after captioning succeeded.
            doc_id += 1
            create_XML_file(doc_id, folder, caption, img_dir)


In [6]:
# Measure the full run with perf_counter(): unlike time.time(), it is a
# monotonic clock, so the interval cannot be distorted by system clock changes.
start = time.perf_counter()
produce_xml_docs()
end = time.perf_counter()

print('xml caption documents created in ' + str(round(end-start, 4)) + 's')



xml caption documents created in 234.6354s
