In [11]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import TFAutoModelForSeq2SeqLM, TFAutoModel, TFBlipForConditionalGeneration
import tensorflow as tf
import pandas as pd
import numpy as np

In [None]:
# Load processor and model from Huggingface
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
# model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Convert model to TensorFlow model (optional, but necessary for full TensorFlow-based modifications)

tf_model = TFBlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base", from_pt=True)

In [33]:
data = pd.read_csv('test.csv')
img_id = [str(x) for x in data['id'].values.tolist()]
units = [str(x) for x in data['unit'].values.tolist()]
entity_name = [str(x) for x in data['entity_name'].values.tolist()]
numeric_values = [str(x) for x in data['numeric_value'].values.tolist()]
numeric_values = processor.tokenizer(text = numeric_values, padding=True, truncation=True, max_length=8, return_tensors="tf")
units = processor.tokenizer(text = units, padding=True, truncation=True, max_length=20, return_tensors="tf")

In [39]:
import tensorflow as tf

class CustomDataset(tf.data.Dataset):
    def __new__(cls, img_ids, entity_names, units, numeric_values):
        # Create the dataset from the provided lists or arrays
        dataset = tf.data.Dataset.from_tensor_slices((img_ids, entity_names, units, numeric_values))
        
        # Apply the mapping function using the load_dataset
        dataset = dataset.map(
            lambda img_id, entity_name, units, numeric_value: cls.map_fn(img_id, entity_name, units, numeric_value),
            num_parallel_calls=tf.data.experimental.AUTOTUNE
        )
        
        return dataset

    @staticmethod
    def load_dataset(img_id, entity_name, units, numeric_value):
        img_dir = 'datasets/test_images/'
        path = img_dir + img_id + '.jpg'
        img = tf.io.read_file(path)
        img = tf.image.decode_jpeg(img, channels=3)
        img = tf.image.resize(img, (224, 224))
        img = img / 255.0  # Normalize

        return img, entity_name, units, numeric_value

    @staticmethod
    def process_text(entity_name, img):
        # The processor function, executed dynamically
        def processor_fn(entity_name_str, img_tensor):
            # Assuming 'processor' is a pre-defined object
            encoding = processor(image=img_tensor, text=entity_name_str.decode('utf-8'), return_tensors="tf", padding=True, truncation=True)
            return encoding['pixel_values'][0], encoding['input_ids'][0], encoding['attention_mask'][0]
        
        # Using tf.py_function to wrap Python's processor function
        img, input_ids, attention_mask = tf.py_function(
            func=processor_fn,
            inp=[entity_name, img],
            Tout=[tf.float32, tf.int32, tf.int32]
        )
        return img, input_ids, attention_mask

    @staticmethod
    def map_fn(img_id, entity_name, units, numeric_value):
        img, entity_name, units, numeric_value = CustomDataset.load_dataset(img_id, entity_name, units, numeric_value)
        img, input_ids, attention_mask = CustomDataset.process_text(entity_name, img)
        return {
            "pixel_values": img,
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels_units": units,
            "labels_numeric_values": numeric_value
        } 



custom_dataset = CustomDataset(img_id, entity_name, units, numeric_values)

# Prepare the dataset for batching and optimization
custom_dataset = custom_dataset.batch(16).prefetch(tf.data.experimental.AUTOTUNE)


In [10]:
print(tf_model.summary())

Model: "tf_blip_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 vision_model (TFBlipVision  multiple                  86090496  
 Model)                                                          
                                                                 
 text_decoder (TFBlipTextLM  multiple                  161323580 
 HeadModel)                                                      
                                                                 
Total params: 247414076 (943.81 MB)
Trainable params: 247414076 (943.81 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [7]:
NUMERIC_VOCAB_SIZE = 11

# Modify the numeric output head to generate a sequence of 7 characters
class CustomBlipModel(tf.keras.Model):
    def __init__(self, base_model, max_numeric_length=7):
        super(CustomBlipModel, self).__init__()
        self.base_model = base_model
        self.max_numeric_length = max_numeric_length
        
        # Define a separate head for the numeric output (predicting a sequence of length 7)
        self.numeric_output = tf.keras.layers.TimeDistributed(
            tf.keras.layers.Dense(units=NUMERIC_VOCAB_SIZE, activation='softmax'),
            name='numeric_output'
        )
        
        # Word output head
        self.word_output = tf.keras.layers.Dense(units=base_model., activation='softmax', name='word_output')
    
    def call(self, inputs):
        # Get the base model output
        base_output = self.base_model(inputs)[0]  # Shape: (batch_size, seq_len, hidden_size)
        
        # Split into numeric and word outputs
        numeric_output = self.numeric_output(base_output[:, :self.max_numeric_length, :])  # Sequence for numeric string
        word_output = self.word_output(base_output[:, self.max_numeric_length:, :])  # For word tokens
        
        return numeric_output, word_output

# Create the custom model
custom_model = CustomBlipModel(tf_model)


SyntaxError: invalid syntax (561471154.py, line 17)

In [None]:
import tensorflow as tf
from transformers import TFAutoModelForSeq2SeqLM

# Define the numeric vocabulary size (10 digits + 1 decimal point)
NUMERIC_VOCAB_SIZE = 11

# Modify the numeric output head to generate a sequence of 7 characters
class CustomBlipModel(tf.keras.Model):
    def __init__(self, base_model, max_numeric_length=7):
        super(CustomBlipModel, self).__init__()
        self.base_model = base_model
        self.max_numeric_length = max_numeric_length
        
        # Define a separate head for the numeric output (predicting a sequence of length 7)
        self.numeric_output = tf.keras.layers.TimeDistributed(
            tf.keras.layers.Dense(units=NUMERIC_VOCAB_SIZE, activation='softmax'),
            name='numeric_output'
        )
        
        # Word output head
        self.word_output = tf.keras.layers.Dense(units=base_model.config.vocab_size, activation='softmax', name='word_output')
    
    def call(self, inputs):
        # Get the base model output
        base_output = self.base_model(inputs)[0]  # Shape: (batch_size, seq_len, hidden_size)
        
        # Split into numeric and word outputs
        numeric_output = self.numeric_output(base_output[:, :self.max_numeric_length, :])  # Sequence for numeric string
        word_output = self.word_output(base_output[:, self.max_numeric_length:, :])  # For word tokens
        
        return numeric_output, word_output

# Create the custom model
custom_model = CustomBlipModel(tf_model)
