In [1]:
import os
from google.colab import drive

drive.mount('/content/gdrive')

os.chdir('/content/gdrive/My Drive/Data Science/tinkoff/project')
text_data_path = 'data | only text'

train_img_path = 'images/train/'
val_img_path = 'images/val/'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Data loading

In [2]:
import pandas as pd
from PIL import Image
%pylab inline


joint_path = os.path.join(text_data_path, 'joint_full.csv')
joint_df = pd.read_csv(joint_path)
joint_df.head()

Populating the interactive namespace from numpy and matplotlib


Unnamed: 0,image_id,question_id,question,is_yes,caption
0,25,25002,Could this photo be from a zoo?,True,A giraffe eating food from the top of the tree...
1,25,25003,Are the animals eating?,True,A giraffe eating food from the top of the tree...
2,25,25005,Is there a zebra?,False,A giraffe eating food from the top of the tree...
3,25,25007,Is the giraffe eating the tree?,True,A giraffe eating food from the top of the tree...
4,25,25008,Are both giraffes standing?,False,A giraffe eating food from the top of the tree...


In [3]:
def get_df(img_path, joint_df):
    img_names = next(os.walk(os.path.join(img_path)))[2]
    df_img_ids = set(joint_df.image_id)
    
    result_img_names = {}
    ids = set()
    for name in img_names:
        id_ = int(name.split('.')[0].split('_')[2])
        
        if id_ in df_img_ids:
            result_img_names[id_] = name
            ids.add(id_)
    
    ids_mask = joint_df.image_id.apply(lambda x: x in ids)
    result_df = joint_df[ids_mask]
    result_df['img_path'] = result_df.image_id.apply(lambda id_: os.path.join(img_path, result_img_names[id_]))
    
    return result_df

train_df = get_df(train_img_path, joint_df)
val_df = get_df(val_img_path, joint_df)

val_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Unnamed: 0,image_id,question_id,question,is_yes,caption,img_path
19,64,64001,Are there numbers on the clock face?,False,a black and silver clock tower at an intersect...,images/val/COCO_train2014_000000000064.jpg
20,64,64002,Is there a big tree behind the clock?,True,a black and silver clock tower at an intersect...,images/val/COCO_train2014_000000000064.jpg
32,86,86000,Is that a bicycle?,True,A man riding a motor bike across a forest.A ma...,images/val/COCO_train2014_000000000086.jpg
35,92,92001,Does this cake look like it chocolate?,True,A white plate with a brownie and white frostin...,images/val/COCO_train2014_000000000092.jpg
89,332,332004,Are those worms on the bottom?,False,Food in a bowl with carrots and drinks around ...,images/val/COCO_train2014_000000000332.jpg


## Data preprocessing

In [0]:
def datagen(batch_size, df, img_resize_shape):
    df_copy = df.copy()[['question', 'caption', 'img_path', 'is_yes']]
    while True:
        if df_copy.shape[0] == 0:
            df_copy = df.copy()[['question', 'caption', 'img_path', 'is_yes']]
      
        img_pathes, images = df_copy['img_path'][:batch_size], []
        for img_path in img_pathes:
            img = Image.open(img_path)
            rsize = img.resize(img_resize_shape)    
            rsizeArr = np.asarray(rsize)     
            
            if rsizeArr.shape == img_resize_shape:  # for black and white pictures
                rsizeArr = np.dstack([rsizeArr]*3)    
            images.append(rsizeArr)
            
        images = np.array(images) / 255
        captions = df_copy['caption'][:batch_size]
        questions = df_copy['question'][:batch_size]
        labels = df_copy['is_yes'][:batch_size]
        
        df_copy = df_copy[batch_size:]
        
        yield [np.array(images), captions, questions], labels

## Model

In [5]:
from tensorflow.keras import layers
from tensorflow.keras.layers import Layer, concatenate
from tensorflow.python.keras.models import Model
from tensorflow.keras import backend as K
import tensorflow as tf
import tensorflow_hub as hub

class ElmoEmbeddingLayer(Layer):
    def __init__(self, **kwargs):
        self.dimensions = 1024
        self.trainable = True
        super(ElmoEmbeddingLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.elmo = hub.Module('https://tfhub.dev/google/elmo/2', 
                               trainable=self.trainable, 
                               name="{}_module".format(self.name))

        self._trainable_weights += tf.trainable_variables(scope="^{}_module/.*".format(self.name))
        super(ElmoEmbeddingLayer, self).build(input_shape)

    def call(self, x, mask=None):
        result = self.elmo(K.squeeze(K.cast(x, tf.string), axis=1),
                      as_dict=True,
                      signature='default',
                      )['default']
        return result

    def compute_mask(self, inputs, mask=None):
        return K.not_equal(inputs, '--PAD--')

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.dimensions)

W0503 18:59:55.350588 139867652085632 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [6]:
from tensorflow.keras.layers import Input, Flatten
from tensorflow.keras.applications import ResNet50


resnet_img_shape = (224,224)
img_shape = (*resnet_img_shape,3)
img_input = Input(img_shape, name='image_input')
res_base = ResNet50(weights='imagenet', input_tensor=img_input, 
                    include_top=False) # 175 layers

for l in res_base.layers:
    l.trainable = True

resn_flatten = Flatten(name='resn_flatten')(res_base.get_layer('activation_48').output)

input_caption = layers.Input(shape=(1,), dtype=tf.string)
caption_embedding = ElmoEmbeddingLayer()(input_caption)

input_question = layers.Input(shape=(1,), dtype=tf.string)
question_embedding = ElmoEmbeddingLayer()(input_question)

concat_embedding = concatenate([resn_flatten, caption_embedding, question_embedding], axis=1)

dense = layers.Dense(256, activation='relu')(concat_embedding)
pred = layers.Dense(1, activation='sigmoid')(dense)

model = Model(inputs=[img_input, input_caption, input_question], outputs=pred)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.


W0503 18:59:56.961009 139867652085632 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/resource_variable_ops.py:435: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0503 19:00:05.265174 139867652085632 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0503 19:00:06.943136 139867652085632 saver.py:1483] Saver not created because there are no variables in the graph to restore


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
image_input (InputLayer)        (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, 230, 230, 3)  0           image_input[0][0]                
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, 112, 112, 64) 9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
bn_conv1 (BatchNormalizationV1) (None, 112, 112, 64) 256         conv1[0][0]                      
__________________________________________________________________________________________________
activation

## Training

In [0]:
epochs = 10
batch_size = 64
steps = train_df.shape[0] // batch_size

data_gen = datagen(batch_size, train_df, resnet_img_shape)

In [0]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
early_stopping_callback = EarlyStopping(monitor='acc', patience=2)

checkpoint_path = 'resnet_elmo_model.h5'
cp_callback = ModelCheckpoint(checkpoint_path, save_weights_only=True, verbose=1)



with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    model.fit_generator(data_gen, steps_per_epoch=steps, epochs=epochs,
        callbacks=[early_stopping_callback, cp_callback])