# Stacking Early Fusion model - EfficientNetV2 + anglE

In [1]:
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import callbacks
from tensorflow.keras import optimizers
from tensorflow.keras import utils
from tensorflow.keras import models
from tensorflow.keras.preprocessing.image import ImageDataGenerator 
from tensorflow.keras import applications
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.regularizers import l2, l1

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix

import bert

import numpy as np 
import pandas as pd
import re
import glob
import os
import cv2
import sys
import pickle

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
     

2023-12-14 08:00:40.541961: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-14 08:00:40.676571: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-14 08:00:41.318399: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-12-14 08:00:41.318454: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

In [2]:
# Import the training and test .csv files
colnames=['image_path', 'text', 'food']
train = pd.read_csv('train_titles.csv', names=colnames, header=None, sep = ',', index_col=['image_path'])
test = pd.read_csv('test_titles.csv', names=colnames, header=None, sep = ',', index_col=['image_path'])

In [3]:
# Sort values by 'image_path'
test = test.sort_values('image_path')
train = train.sort_values('image_path')

In [4]:
print(test.shape)
test.head()

(22716, 2)


Unnamed: 0_level_0,text,food
image_path,Unnamed: 1_level_1,Unnamed: 2_level_1
apple_pie_1.jpg,Apple Pie and Cake Recipes - Health.com,apple_pie
apple_pie_10.jpg,Favorite Dutch Apple Pie Recipe | Taste of Home,apple_pie
apple_pie_103.jpg,No Raisins On My Parade: Caramel Apple Pie,apple_pie
apple_pie_105.jpg,Easy Apple Recipes - Apple Dessert Recipes'],apple_pie
apple_pie_107.jpg,Casa de Luna Creations: Apple Pie from Scratch,apple_pie


In [5]:
# Cleaning text function

def preprocess_text(sen):
    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    sentence = sentence.lower()

    return sentence

def remove_tags(text):
    return TAG_RE.sub('', text)

TAG_RE = re.compile(r'<[^>]+>')
vec_preprocess_text = np.vectorize(preprocess_text)

In [6]:
# Check number of classes
nClasses = train.food.nunique()

In [7]:
encoder = LabelEncoder()
processed_train = vec_preprocess_text(train.text.values)
processed_test = vec_preprocess_text(test.text.values)


encoded_labels_train = encoder.fit_transform(train.food.values)
labels_train = utils.to_categorical(encoded_labels_train, nClasses)

encoded_labels_test = encoder.fit_transform(test.food.values)
labels_test = utils.to_categorical(encoded_labels_test, nClasses)

print("Processed text sample:", processed_train[0])
print("Shape of train labels:", labels_train.shape)

Processed text sample: apple pie wikipedia
Shape of train labels: (67972, 101)


In [None]:
from angle_emb import AnglE

angle = AnglE.from_pretrained('WhereIsAI/UAE-Large-V1', pooling_strategy='cls').cuda()

In [None]:
train_text = angle.encode(processed_train[0], to_numpy=True)
for i in range(1,67988):
    if i%5000==0:
        print(i/5000)
    vec = angle.encode(processed_train[i], to_numpy=True)
    train_text = np.concatentate((train_text,vec),axis=0)
np.load('text_train',train_text)

In [10]:
text_train = np.load('text_train.npy')
text_test = np.load('text_test.npy')

In [11]:
from keras import backend as K 
img_width = 299
img_height = 299

if K.image_data_format() == 'channels_first': 
    input_shape = (3, img_width, img_height) 
else: 
    input_shape = (img_width, img_height, 3) 

In [12]:
train_data_dir = 'images/train'
validation_data_dir = 'images/test'
nb_train_samples = 67988 
nb_validation_samples = 22716
n_classes = 101
epochs = 10
batch_size = 75

In [13]:
train_datagen = ImageDataGenerator(
    width_shift_range=0.2,  # randomly shift images horizontally (fraction of total width)
    height_shift_range=0.2,  # randomly shift images vertically (fraction of total height)
    horizontal_flip=True,  # randomly flip images
    zoom_range=[.8, 1],
    channel_shift_range=30,
    fill_mode='reflect')

test_datagen = ImageDataGenerator()

train_generator = train_datagen.flow_from_directory(
    train_data_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    seed = 11,
    class_mode='categorical')

validation_generator = test_datagen.flow_from_directory(
    validation_data_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    seed = 11,
    class_mode='categorical')

Found 67988 images belonging to 101 classes.
Found 22716 images belonging to 101 classes.


In [14]:
def train_datas_generator(text_data):
    while True:
        idx0=0
        cnt=0
        for batch in train_generator:
            cnt+=1
            idx1 = idx0 + batch[0].shape[0]
            yield [batch[0], text_data[idx0:idx1]], batch[1]
            idx0 = idx1
            if cnt == 906:
                break
            if idx1 >= 67988:
                idx0=0
                break

In [15]:
def test_datas_generator(text_data):
    while True:
        idx0=0
        cnt=0
        for batch in validation_generator:
            cnt+=1
            idx1 = idx0 + batch[0].shape[0]
            yield [batch[0], text_data[idx0:idx1]], batch[1]
            idx0 = idx1
            if cnt==302:
                break
            if idx1 >= 22716:
                idx0=0
                break

**Classification Model**

In [16]:
from keras.applications.efficientnet_v2 import EfficientNetV2S

In [17]:

model_cnn = models.Sequential()
model_cnn.add(EfficientNetV2S(weights='imagenet', include_top=False, input_tensor=layers.Input(shape=(299, 299, 3))))
model_cnn.add(layers.AveragePooling2D(pool_size=(8, 8), name='AVG_Pooling'))
model_cnn.add(layers.Dropout(.4, name='Dropout_0.4'))
model_cnn.add(layers.Flatten(name='Flatten'))
model_cnn.add(layers.Dense(128, name='Dense_128',kernel_regularizer=l2(0.005),
                    activity_regularizer=l1(0.005) ))

2023-12-14 08:01:11.320136: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 34979 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:c2:00.0, compute capability: 8.6


In [18]:
for layer in model_cnn.layers:
    layer.trainable = True

In [18]:
model_cnn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 efficientnetv2-s (Functiona  (None, 10, 10, 1280)     20331360  
 l)                                                              
                                                                 
 AVG_Pooling (AveragePooling  (None, 1, 1, 1280)       0         
 2D)                                                             
                                                                 
 Dropout_0.4 (Dropout)       (None, 1, 1, 1280)        0         
                                                                 
 Flatten (Flatten)           (None, 1280)              0         
                                                                 
 Dense_128 (Dense)           (None, 128)               163968    
                                                                 
Total params: 20,495,328
Trainable params: 20,341,456
No

In [14]:
#------------------------------------------------------------------------------------------------------------#

In [19]:
# text model
input_text = layers.Input(shape=(1024,), dtype=tf.int32, name="input_text")
X = layers.Dense(512, activation="relu")(input_text)
X = layers.Dropout(0.5)(X)
X = layers.Dense(256, activation="relu")(X)
X = layers.Dropout(0.5)(X)
out = layers.Dense(128, activation="relu")(X)
model_text = models.Model(input_text, out)

In [21]:

for layer in model_text.layers:
    layer.trainable = True

In [22]:
#------------------------------------------------------------------------------------------------------------#

In [23]:
# Stacking


image_input = layers.Input(shape = input_shape, dtype=tf.float32,
                           name = "image")
text_input = layers.Input(shape=(1024,), dtype=tf.float32,
                                       name="text")

image_side = model_cnn(image_input)
text_side = model_text(text_input)

merged = layers.Concatenate()([image_side, text_side])
#merged = layers.Dense(512, activation = 'relu')(merged)
#merged = layers.Dropout(0.4)(merged)
output = layers.Dense(101, activation='softmax', name = "class")(merged)

In [24]:
model = models.Model([image_input, text_input], output)

In [25]:
#-----------------------SUMMARY MULTIMODAL MODEL--------------------------------------------------------------#

In [26]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 image (InputLayer)             [(None, 299, 299, 3  0           []                               
                                )]                                                                
                                                                                                  
 text (InputLayer)              [(None, 1024)]       0           []                               
                                                                                                  
 sequential (Sequential)        (None, 128)          20495328    ['image[0][0]']                  
                                                                                                  
 model_1 (Functional)           (None, 128)          689024      ['text[0][0]']             

In [51]:
from tensorflow.keras.utils import plot_model
plot_model(model, to_file='model.png')

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [50]:
!pip install pydot

Defaulting to user installation because normal site-packages is not writeable
[33mDEPRECATION: distro-info 1.1build1 has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of distro-info or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m[33mDEPRECATION: python-debian 0.1.43ubuntu1 has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of python-debian or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [105]:
#------------------------------------------------------------------------------------------------------------#

In [25]:

sgd = optimizers.SGD(learning_rate=0.01, momentum=0.9, nesterov=False)


model.compile(loss='categorical_crossentropy', 
              optimizer=sgd, 
              metrics=['accuracy'])

In [26]:

csv_logger = callbacks.CSVLogger('stacking/stacking.log')
es = callbacks.EarlyStopping(patience = 3, restore_best_weights=True)

In [27]:

reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_accuracy', factor=0.1, patience=1, min_lr=0.00001)

In [28]:
# Load model weights
#model.load_weights('stacking_early_fusion/weights-improvement-16-0.92.hdf5')

In [33]:
# 128 128 바로 softmax
with tf.device('/device:GPU:4'): 
    history = model.fit(train_datas_generator(text_train),
                        epochs=15,
                        steps_per_epoch = train.shape[0]//batch_size,
                        validation_data = test_datas_generator(text_test),
                        validation_steps = test.shape[0]//batch_size,
                        callbacks=[csv_logger, reduce_lr])

Epoch 1/15
123/906 [===>..........................] - ETA: 16:26 - loss: 0.5736 - accuracy: 0.9239

KeyboardInterrupt: 