In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Hyperparameters
BATCH_SIZE = 32
BUFFER_SIZE = 1000
MAX_SEQ_LEN = 100
IMAGE_SIZE = (224, 224)
VOCAB_SIZE = 10000

# Load CSV
data = pd.read_csv('height_with_ocr.csv')

# Handle missing values
data = data.dropna(subset=['ocr_text', 'numeric_value', 'unit'])

# Calculate min and max values for normalization
numeric_min = data['numeric_value'].min()
numeric_max = data['numeric_value'].max()

def normalize_numeric_value(value, min_value, max_value):
    return (value - min_value) / (max_value - min_value)

# Build tokenizer and preprocess text
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(data['ocr_text'])

# Build StringLookup layer for entity classes
unit_classes = data['unit'].unique()
string_lookup = tf.keras.layers.StringLookup(vocabulary=unit_classes)

def preprocess_text(ocr_text):
    ocr_text = ocr_text.numpy().decode('utf-8')  # Ensure the input is a string
    tokenized_text = tokenizer.texts_to_sequences([ocr_text])
    padded_text = pad_sequences(tokenized_text, maxlen=MAX_SEQ_LEN, padding='post')
    return np.array(padded_text[0], dtype=np.int32) 

def preprocess_image(image_path):
    image = tf.io.read_file(image_path)
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, IMAGE_SIZE)
    image = image / 255.0
    return image

def preprocess_data(image_path, ocr_text, numeric_value, unit_class):
    image = preprocess_image(image_path)
    ocr_text = tf.py_function(preprocess_text, [ocr_text], tf.int32)  # Ensure output is int32
    ocr_text.set_shape([MAX_SEQ_LEN])  # Set the shape explicitly

    numeric_value_normalized = normalize_numeric_value(numeric_value, numeric_min, numeric_max)
    numeric_value_normalized = tf.expand_dims(numeric_value_normalized, axis=-1)  # Ensure scalar for numeric_value

    unit_class = string_lookup(unit_class)
    return (image, ocr_text), (numeric_value_normalized, unit_class)

def load_dataset(csv_file):
    data = pd.read_csv(csv_file)
    data = data.dropna(subset=['ocr_text', 'numeric_value', 'unit'])  # Drop rows with NaN values
    image_paths = data['id'].apply(lambda x: f'datasets/{x}.jpg').values
    ocr_texts = data['ocr_text'].values
    numeric_values = data['numeric_value'].values.astype(np.float32)
    unit_classes = data['unit'].values
    return image_paths, ocr_texts, numeric_values, unit_classes

def create_tf_dataset(image_paths, ocr_texts, numeric_values, unit_classes):
    dataset = tf.data.Dataset.from_tensor_slices((image_paths, ocr_texts, numeric_values, unit_classes))
    dataset = dataset.map(lambda image_path, ocr_text, numeric_value, unit_class: (
        (image_path, ocr_text), (numeric_value, unit_class)
    ))
    dataset = dataset.map(lambda inputs, targets: preprocess_data(inputs[0], inputs[1], targets[0], targets[1]), num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.filter(lambda inputs, targets: inputs is not None and targets is not None)
    dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
    return dataset

# Load data
image_paths, ocr_texts, numeric_values, unit_classes = load_dataset('height_with_ocr.csv')

# Create train and validation datasets
train_size = int(0.8 * len(image_paths))
train_image_paths = image_paths[:train_size]
train_ocr_texts = ocr_texts[:train_size]
train_numeric_values = numeric_values[:train_size]
train_unit_classes = unit_classes[:train_size]

val_image_paths = image_paths[train_size:]
val_ocr_texts = ocr_texts[train_size:]
val_numeric_values = numeric_values[train_size:]
val_unit_classes = unit_classes[train_size:]

train_dataset = create_tf_dataset(train_image_paths, train_ocr_texts, train_numeric_values, train_unit_classes)
val_dataset = create_tf_dataset(val_image_paths, val_ocr_texts, val_numeric_values, val_unit_classes)


2024-09-13 17:20:39.253534: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-13 17:20:39.264418: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-13 17:20:39.276571: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-13 17:20:39.280048: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-13 17:20:39.289664: I tensorflow/core/platform/cpu_feature_guar

In [2]:
from tensorflow.keras.layers import Input, Dense, Embedding, GlobalAveragePooling1D, GlobalAveragePooling2D, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError, SparseCategoricalCrossentropy

# Load ResNet model
resnet_model = ResNet50(
    include_top=False,
    weights='imagenet',
    input_shape=IMAGE_SIZE + (3,)
)

# Freeze the ResNet model
resnet_model.trainable = False

# Image input
image_input = Input(shape=IMAGE_SIZE + (3,), name='image_input')
image_features = resnet_model(image_input)
image_features = GlobalAveragePooling2D()(image_features)

# Text input
text_input = Input(shape=(MAX_SEQ_LEN,), name='text_input')
text_embedding = Embedding(input_dim=VOCAB_SIZE, output_dim=128)(text_input)
text_features = GlobalAveragePooling1D()(text_embedding)

# Concatenate image and text features
combined_features = Concatenate()([image_features, text_features])

# Output layers
numeric_output = Dense(1, name='numeric_output')(combined_features)
unit_output = Dense(len(np.unique(unit_classes)), activation='softmax', name='unit_output')(combined_features)

# Define custom loss function
def custom_loss(y_true, y_pred):
    
    numeric_loss = MeanSquaredError()(y_true[0], y_pred[0])
    unit_loss = SparseCategoricalCrossentropy()(y_true[1], y_pred[1])
    
    return numeric_loss + unit_loss

# Define the model
model = Model(inputs=[image_input, text_input], outputs=[numeric_output, unit_output])

# Compile the model with custom loss function
model.compile(optimizer=Adam(learning_rate=1e-4), 
              loss=custom_loss,
              metrics={'numeric_output': 'mae', 'unit_output': 'accuracy'})

In [3]:
print(train_numeric_values)

[1.18 6.5  9.2  ... 2.8  9.   8.66]


In [4]:
# Train the model
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10
)

Epoch 1/10


I0000 00:00:1726228245.962749  137229 service.cc:146] XLA service 0x76d02004d960 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1726228245.962773  137229 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce RTX 4070 Laptop GPU, Compute Capability 8.9
2024-09-13 17:20:46.079156: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-09-13 17:20:46.712451: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907


      3/Unknown [1m9s[0m 53ms/step - loss: 2.9477 - numeric_output_loss: 1.4409 - numeric_output_mae: 1.2087 - unit_output_accuracy: 0.0000e+00 - unit_output_loss: 1.5068

I0000 00:00:1726228250.669899  137229 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


    392/Unknown [1m31s[0m 59ms/step - loss: nan - numeric_output_loss: nan - numeric_output_mae: nan - unit_output_accuracy: 0.8069 - unit_output_loss: nan

2024-09-13 17:21:13.738065: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
2024-09-13 17:21:13.738303: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
	 [[IteratorGetNext/_8]]
  self.gen.throw(typ, value, traceback)


[1m392/392[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 90ms/step - loss: nan - numeric_output_loss: nan - numeric_output_mae: nan - unit_output_accuracy: 0.8072 - unit_output_loss: nan - val_loss: nan - val_numeric_output_loss: nan - val_numeric_output_mae: nan - val_unit_output_accuracy: 0.9879 - val_unit_output_loss: nan
Epoch 2/10


2024-09-13 17:21:26.030436: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
	 [[IteratorGetNext/_2]]
2024-09-13 17:21:26.030485: I tensorflow/core/framework/local_rendezvous.cc:423] Local rendezvous recv item cancelled. Key hash: 10595772756483887675
2024-09-13 17:21:26.030493: I tensorflow/core/framework/local_rendezvous.cc:423] Local rendezvous recv item cancelled. Key hash: 12580291881751655384
2024-09-13 17:21:26.030503: I tensorflow/core/framework/local_rendezvous.cc:423] Local rendezvous recv item cancelled. Key hash: 548844677129314260


[1m117/392[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m13s[0m 51ms/step - loss: nan - numeric_output_loss: nan - numeric_output_mae: nan - unit_output_accuracy: 0.9860 - unit_output_loss: nan

In [None]:
history2 = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10
)

In [14]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Hyperparameters
IMAGE_SIZE = (224, 224)
MAX_SEQ_LEN = 100
VOCAB_SIZE = 10000

# Assuming tokenizer and string_lookup are already defined
# tokenizer = ...
# string_lookup = ...

def preprocess_image(image_path):
    image = tf.io.read_file(image_path)
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, IMAGE_SIZE)
    image = image / 255.0
    return image

def preprocess_text(ocr_text):
    tokenized_text = tokenizer.texts_to_sequences([ocr_text])
    padded_text = pad_sequences(tokenized_text, maxlen=MAX_SEQ_LEN, padding='post')
    return padded_text[0]

In [15]:
def predict(image_path, ocr_text):
    # Preprocess the inputs
    image = preprocess_image(image_path)
    text = preprocess_text(ocr_text)
    
    # Add batch dimension
    image = tf.expand_dims(image, axis=0)
    text = tf.expand_dims(text, axis=0)
    
    # Make predictions
    numeric_prediction, unit_prediction = model.predict([image, text])
    
    # Post-process the outputs
    numeric_value = numeric_prediction[0][0]  # Since it's a scalar
    unit_class = string_lookup.get_vocabulary()[np.argmax(unit_prediction[0])]
    
    return numeric_value, unit_class

In [None]:
# Example usage
image_path = 'datasets/7195.jpg'
ocr_text = 'Product Details 1 Green Aventurine 0.94"" Net Weight:21g'

numeric_value, unit_class = predict(image_path, ocr_text)
print(f'Predicted Numeric Value: {numeric_value}')
print(f'Predicted Unit Class: {unit_class}')