<p style="text-align:center;font-weight: 900; font-size:40px;"> Multimodal Sentiment Analysis Higher Accuracy </p>

**More robust vgg19 and xlm-roberta******

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
from transformers import TFXLMRobertaModel, XLMRobertaTokenizer
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import os


# Load and preprocess data
def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df


def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]


def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.vgg19.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))


def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)


def augment_images(images):
    """Apply data augmentation"""
    datagen = ImageDataGenerator(
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest'
    )
    return datagen.flow(images, batch_size=16, shuffle=True)


# Multimodal Model Definition
class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        # Use XLM-Roberta for multilingual capabilities
        self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
        self.bert_model = TFXLMRobertaModel.from_pretrained('xlm-roberta-base')

    def build_model(self):
        # Image branch (VGG19 with fine-tuning)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)

        for layer in vgg19.layers[:-8]:  # Fine-tune deeper layers
            layer.trainable = False

        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.4)(x)
        image_features = LayerNormalization()(x)

        # Text branch (XLM-Roberta)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        bert_outputs = self.bert_model([input_ids, attention_mask])
        pooled_output = bert_outputs[1]  # Use the pooled output
        x = Dense(256, activation='relu')(pooled_output)
        x = Dropout(0.4)(x)
        text_features = LayerNormalization()(x)

        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        optimizer = Adam(learning_rate=1e-4)  # Reduce learning rate
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

    def prepare_text(self, texts):
        """Tokenize texts using XLM-Roberta tokenizer"""
        encodings = self.tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']


# Model Training
def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()

    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)

    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=5,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=3,
            min_lr=1e-6
        )
    ]

    # Data augmentation for images
    train_images_augmented = augment_images(train_images)

    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=16,
        callbacks=callbacks
    )

    return model, history


# Main Function
def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())

    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)

    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])

    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )

    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )

    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])

    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)

    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]

    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    return model, history


if __name__ == "__main__":
    main()


Processing images:  67%|██████▋   | 2325/3495 [00:51<00:30, 38.45it/s]

Error processing image /kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes/nurani-memes (149).jpg: image file is truncated (3 bytes not processed)


Processing images: 100%|██████████| 3495/3495 [01:16<00:00, 45.73it/s]
Processing images: 100%|██████████| 873/873 [00:19<00:00, 45.40it/s]
All model checkpoint layers were used when initializing TFXLMRobertaModel.

All the layers of TFXLMRobertaModel were initialized from the model checkpoint at xlm-roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

In [3]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
from transformers import TFXLMRobertaModel, XLMRobertaTokenizer
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
import os
import math


# Load and preprocess data
def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df


def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]


def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.vgg19.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))


def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)


def augment_images(images):
    """Apply stronger data augmentation"""
    datagen = ImageDataGenerator(
        rotation_range=30,
        width_shift_range=0.3,
        height_shift_range=0.3,
        shear_range=0.3,
        zoom_range=0.3,
        horizontal_flip=True,
        fill_mode='nearest',
        brightness_range=[0.8, 1.2]
    )
    return datagen.flow(images, batch_size=16, shuffle=True)


# Multimodal Model Definition
class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        # Use XLM-Roberta for multilingual capabilities
        self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
        self.bert_model = TFXLMRobertaModel.from_pretrained('xlm-roberta-base')

    def build_model(self):
        # Image branch (VGG19 with fine-tuning)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)

        for layer in vgg19.layers[:-8]:  # Fine-tune deeper layers
            layer.trainable = False

        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(512, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(x)
        x = Dropout(0.4)(x)
        image_features = BatchNormalization()(x)

        # Text branch (XLM-Roberta)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        bert_outputs = self.bert_model([input_ids, attention_mask])
        pooled_output = bert_outputs[1]  # Use the pooled output
        x = Dense(512, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(pooled_output)
        x = Dropout(0.4)(x)
        text_features = BatchNormalization()(x)

        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(512, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(combined)
        x = Dropout(0.3)(x)
        x = BatchNormalization()(x)
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        optimizer = Adam(learning_rate=1e-4)  # Reduce learning rate
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

    def prepare_text(self, texts):
        """Tokenize texts using XLM-Roberta tokenizer"""
        encodings = self.tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']


# Cosine Annealing Learning Rate Scheduler
def cosine_decay(epoch):
    initial_lr = 1e-4
    return initial_lr * (1 + math.cos(epoch * math.pi / 20)) / 2


# Model Training
def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()

    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)

    # Compute class weights for imbalanced datasets
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_labels),
        y=train_labels
    )
    class_weights = dict(enumerate(class_weights))

    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=2,  # Stop earlier
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        ),
        tf.keras.callbacks.LearningRateScheduler(cosine_decay)
    ]

    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        class_weight=class_weights,
        epochs=epochs,
        batch_size=16,
        callbacks=callbacks
    )

    return model, history


# Main Function
def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())

    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)

    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])

    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )

    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )

    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])

    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)

    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]

    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    return model, history


if __name__ == "__main__":
    main()


Processing images:  67%|██████▋   | 2325/3495 [00:51<00:29, 40.26it/s]

Error processing image /kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes/nurani-memes (149).jpg: image file is truncated (3 bytes not processed)


Processing images: 100%|██████████| 3495/3495 [01:17<00:00, 45.14it/s]
Processing images: 100%|██████████| 873/873 [00:19<00:00, 43.70it/s]
All model checkpoint layers were used when initializing TFXLMRobertaModel.

All the layers of TFXLMRobertaModel were initialized from the model checkpoint at xlm-roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


All model checkpoint layers were used when initializing TFXLMRobertaModel.

All the layers of TFXLMRobertaModel were initialized from the model checkpoint at xlm-roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.


Predictions saved to submission.csv


This vgg19 and bert model

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.vgg19.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))

def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)

class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
        
    def build_model(self):
        # Image branch (VGG19)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)
        
        # Fine-tune only the top layers
        for layer in vgg19.layers[:-4]:
            layer.trainable = False
            
        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        image_features = LayerNormalization()(x)
        
        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')
        
        bert_outputs = self.bert_model([input_ids, attention_mask])[0]
        pooled_output = tf.reduce_mean(bert_outputs, axis=1)
        x = Dense(256, activation='relu')(pooled_output)
        x = Dropout(0.3)(x)
        text_features = LayerNormalization()(x)
        
        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)
        
        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )
        
        optimizer = Adam(learning_rate=0.001)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model
    
    def prepare_text(self, texts):
        """Tokenize texts using BERT tokenizer"""
        encodings = self.bert_tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()
    
    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)
    
    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]
    
    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=16,
        
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )
    
    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])
    
    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()


Processing images:  67%|██████▋   | 2325/3495 [01:08<00:39, 29.34it/s]

Error processing image /kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes/nurani-memes (149).jpg: image file is truncated (3 bytes not processed)


Processing images: 100%|██████████| 3495/3495 [01:41<00:00, 34.48it/s]
Processing images: 100%|██████████| 873/873 [00:24<00:00, 35.01it/s]
Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
 28/186 [===>..........................] - ETA: 1:47 - loss: 0.8356 - accuracy: 0.6272

In [3]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.vgg19.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))

def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)

class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
        
    def build_model(self):
        # Image branch (VGG19)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)
        
        # Fine-tune only the top layers
        for layer in vgg19.layers[:-4]:
            layer.trainable = False
            
        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        image_features = LayerNormalization()(x)
        
        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')
        
        bert_outputs = self.bert_model([input_ids, attention_mask])[0]
        pooled_output = tf.reduce_mean(bert_outputs, axis=1)
        x = Dense(256, activation='relu')(pooled_output)
        x = Dropout(0.3)(x)
        text_features = LayerNormalization()(x)
        
        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)
        
        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )
        
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model
    
    def prepare_text(self, texts):
        """Tokenize texts using BERT tokenizer"""
        encodings = self.bert_tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=30):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()
    
    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)
    
    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]
    
    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=16,
        
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )
    
    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])
    
    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()


Processing images:  67%|██████▋   | 2326/3495 [00:51<00:29, 39.76it/s]

Error processing image /kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes/nurani-memes (149).jpg: image file is truncated (3 bytes not processed)


Processing images: 100%|██████████| 3495/3495 [01:17<00:00, 45.02it/s]
Processing images: 100%|██████████| 873/873 [00:19<00:00, 44.10it/s]
Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Predictions saved to submission.csv


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.vgg19.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))

def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)

class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
        
    def build_model(self):
        # Image branch (VGG19)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)
        
        # Fine-tune only the top layers
        for layer in vgg19.layers[:-4]:
            layer.trainable = False
            
        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(512, activation='relu')(x)
        x = Dropout(0.3)(x)
        image_features = LayerNormalization()(x)
        
        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')
        
        bert_outputs = self.bert_model([input_ids, attention_mask])[0]
        pooled_output = tf.reduce_mean(bert_outputs, axis=1)
        x = Dense(512, activation='relu')(pooled_output)
        x = Dropout(0.3)(x)
        text_features = LayerNormalization()(x)
        
        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(512, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)
        
        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )
        
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model
    
    def prepare_text(self, texts):
        """Tokenize texts using BERT tokenizer"""
        encodings = self.bert_tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=10):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()
    
    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)
    
    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]
    
    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=8,
        callbacks=callbacks
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )
    
    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])
    
    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()


In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.vgg19.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))

def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)

class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
        
    def build_model(self):
        # Image branch (VGG19)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)
        
        # Fine-tune only the top layers
        for layer in vgg19.layers[:-4]:
            layer.trainable = False
            
        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(512, activation='relu')(x)
        x = Dropout(0.3)(x)
        image_features = LayerNormalization()(x)
        
        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')
        
        bert_outputs = self.bert_model([input_ids, attention_mask])[0]
        pooled_output = tf.reduce_mean(bert_outputs, axis=1)
        x = Dense(512, activation='relu')(pooled_output)
        x = Dropout(0.3)(x)
        text_features = LayerNormalization()(x)
        
        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(512, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)
        
        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )
        
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model
    
    def prepare_text(self, texts):
        """Tokenize texts using BERT tokenizer"""
        encodings = self.bert_tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=30):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()
    
    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)
    
    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]
    
    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=8,
        callbacks=callbacks
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )
    
    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])
    
    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()


Processing images:  67%|██████▋   | 2327/3495 [00:52<00:29, 39.91it/s]

Error processing image /kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes/nurani-memes (149).jpg: image file is truncated (3 bytes not processed)


Processing images: 100%|██████████| 3495/3495 [01:18<00:00, 44.76it/s]
Processing images: 100%|██████████| 873/873 [00:19<00:00, 44.84it/s]
Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30


Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Predictions saved to submission.csv


In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.vgg19.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))

def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)

class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
        
    def build_model(self):
        # Image branch (VGG19)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)
        
        # Fine-tune only the top layers
        for layer in vgg19.layers[:-4]:
            layer.trainable = False
            
        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(512, activation='relu')(x)
        x = Dropout(0.3)(x)
        image_features = LayerNormalization()(x)
        
        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')
        
        bert_outputs = self.bert_model([input_ids, attention_mask])[0]
        pooled_output = tf.reduce_mean(bert_outputs, axis=1)
        x = Dense(512, activation='relu')(pooled_output)
        x = Dropout(0.3)(x)
        text_features = LayerNormalization()(x)
        
        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(512, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)
        
        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )
        
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model
    
    def prepare_text(self, texts):
        """Tokenize texts using BERT tokenizer"""
        encodings = self.bert_tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()
    
    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)
    
    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]
    
    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=8,
        
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )
    
    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])
    
    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()


Processing images:  67%|██████▋   | 2325/3495 [00:51<00:28, 41.07it/s]

Error processing image /kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes/nurani-memes (149).jpg: image file is truncated (3 bytes not processed)


Processing images: 100%|██████████| 3495/3495 [01:16<00:00, 45.62it/s]
Processing images: 100%|██████████| 873/873 [00:19<00:00, 44.66it/s]
Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Predictions saved to submission.csv


Less layer

In [3]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.vgg19.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))

def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)

class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
        
    def build_model(self):
        # Image branch (VGG19)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)
        
        # Fine-tune only the top layers
        for layer in vgg19.layers[:-4]:
            layer.trainable = False
            
        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        image_features = LayerNormalization()(x)
        
        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')
        
        bert_outputs = self.bert_model([input_ids, attention_mask])[0]
        pooled_output = tf.reduce_mean(bert_outputs, axis=1)
        x = Dense(256, activation='relu')(pooled_output)
        x = Dropout(0.3)(x)
        text_features = LayerNormalization()(x)
        
        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)
        
        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )
        
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model
    
    def prepare_text(self, texts):
        """Tokenize texts using BERT tokenizer"""
        encodings = self.bert_tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()
    
    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)
    
    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]
    
    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=8,
        
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )
    
    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])
    
    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()


Processing images:  67%|██████▋   | 2327/3495 [00:50<00:27, 42.25it/s]

Error processing image /kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes/nurani-memes (149).jpg: image file is truncated (3 bytes not processed)


Processing images: 100%|██████████| 3495/3495 [01:15<00:00, 46.24it/s]
Processing images: 100%|██████████| 873/873 [00:19<00:00, 44.60it/s]
Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Predictions saved to submission.csv


In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.vgg19.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))

def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)

class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
        
    def build_model(self):
        # Image branch (VGG19)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)
        
        # Fine-tune only the top layers
        for layer in vgg19.layers[:-4]:
            layer.trainable = False
            
        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        image_features = LayerNormalization()(x)
        
        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')
        
        bert_outputs = self.bert_model([input_ids, attention_mask])[0]
        pooled_output = tf.reduce_mean(bert_outputs, axis=1)
        x = Dense(256, activation='relu')(pooled_output)
        x = Dropout(0.3)(x)
        text_features = LayerNormalization()(x)
        
        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)
        
        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )
        
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model
    
    def prepare_text(self, texts):
        """Tokenize texts using BERT tokenizer"""
        encodings = self.bert_tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()
    
    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)
    
    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]
    
    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=16,
        
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )
    
    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])
    
    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()


Processing images:  66%|██████▋   | 2324/3495 [01:13<00:39, 29.63it/s]

Error processing image /kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes/nurani-memes (149).jpg: image file is truncated (3 bytes not processed)


Processing images: 100%|██████████| 3495/3495 [01:49<00:00, 31.83it/s]
Processing images: 100%|██████████| 873/873 [00:27<00:00, 31.61it/s]


Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.01G [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg19/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Predictions saved to submission.csv


Edited

In [4]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from transformers import TFBertModel, BertTokenizer
import os
from tqdm import tqdm


def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df


def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]


def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.vgg19.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))


def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)


class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')

    def build_model(self):
        # Image branch (VGG19)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)

        # Fine-tune only the top layers
        for layer in vgg19.layers[:-4]:
            layer.trainable = False

        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        image_features = LayerNormalization()(x)

        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        bert_outputs = self.bert_model([input_ids, attention_mask])[0]
        pooled_output = tf.reduce_mean(bert_outputs, axis=1)
        x = Dense(256, activation='relu')(pooled_output)
        x = Dropout(0.3)(x)
        text_features = LayerNormalization()(x)

        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

    def prepare_text(self, texts):
        """Tokenize texts using BERT tokenizer"""
        encodings = self.bert_tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']


def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, class_weights, epochs=20):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()

    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)

    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]

    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        class_weight=class_weights,
        epochs=epochs,
        batch_size=16,
        callbacks=callbacks
    )

    return model, history


def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())

    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)

    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])

    # Compute class weights
    class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
    class_weights_dict = {i: weight for i, weight in enumerate(class_weights)}

    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )

    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs,
        class_weights_dict
    )

    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])

    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)

    # Evaluate model
    print(classification_report(test_df['Label_Sentiment'].map(label_map), predicted_labels))
    print(confusion_matrix(test_df['Label_Sentiment'].map(label_map), predicted_labels))

    # Save predictions
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")


if __name__ == "__main__":
    main()


Processing images:  67%|██████▋   | 2327/3495 [00:51<00:29, 40.17it/s]

Error processing image /kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes/nurani-memes (149).jpg: image file is truncated (3 bytes not processed)


Processing images: 100%|██████████| 3495/3495 [01:17<00:00, 45.10it/s]
Processing images: 100%|██████████| 873/873 [00:19<00:00, 44.73it/s]
Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.




KeyError: 'Label_Sentiment'

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
from transformers import TFXLMRobertaModel, XLMRobertaTokenizer
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import os
import math
import tensorflow_addons as tfa

# Load and preprocess data
def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df


def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]


def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.vgg19.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))


def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)


def augment_images(images):
    """Apply stronger data augmentation"""
    datagen = ImageDataGenerator(
        rotation_range=30,
        width_shift_range=0.3,
        height_shift_range=0.3,
        shear_range=0.3,
        zoom_range=0.3,
        horizontal_flip=True,
        fill_mode='nearest',
        brightness_range=[0.8, 1.2]
    )
    return datagen.flow(images, batch_size=16, shuffle=True)


# Multimodal Model Definition
class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        # Use XLM-Roberta for multilingual capabilities
        self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
        self.bert_model = TFXLMRobertaModel.from_pretrained('xlm-roberta-base')

    def build_model(self):
        # Image branch (VGG19 with fine-tuning)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)

        for layer in vgg19.layers[:-8]:  # Fine-tune deeper layers
            layer.trainable = False

        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(x)
        x = Dropout(0.4)(x)
        image_features = BatchNormalization()(x)

        # Text branch (XLM-Roberta)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        bert_outputs = self.bert_model([input_ids, attention_mask])
        pooled_output = bert_outputs[1]  # Use the pooled output
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(pooled_output)
        x = Dropout(0.4)(x)
        text_features = BatchNormalization()(x)

        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(combined)
        x = Dropout(0.3)(x)
        x = BatchNormalization()(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        return model

    def prepare_text(self, texts):
        """Tokenize texts using XLM-Roberta tokenizer"""
        encodings = self.tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']


# Cosine Annealing Learning Rate Scheduler
def lr_schedule(epoch):
    initial_lr = 1e-4
    if epoch < 5:  # Warm-up phase
        return initial_lr * (epoch + 1) / 5
    else:
        return initial_lr * tf.math.exp(-0.1 * (epoch - 5))


# Model Training
def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()

    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)

    # Compute class weights for imbalanced datasets
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_labels),
        y=train_labels
    )
    class_weights = dict(enumerate(class_weights))

    # Focal loss for better handling of class imbalance
    loss = tfa.losses.SigmoidFocalCrossEntropy()

    # Compile the model
    model.compile(
        optimizer=Adam(learning_rate=1e-4),
        loss=loss,
        metrics=['accuracy']
    )

    # Callbacks
    lr_callback = tf.keras.callbacks.LearningRateScheduler(lr_schedule)
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        lr_callback
    ]

    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        class_weight=class_weights,
        epochs=epochs,
        batch_size=32,
        callbacks=callbacks
    )

    return model, history


# Main Function
def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())

    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)

    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])

    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )

    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )

    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])

    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)

    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]

    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    return model, history


if __name__ == "__main__":
    main()


Processing images:  67%|██████▋   | 2325/3495 [01:13<00:41, 28.19it/s]

Error processing image /kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes/nurani-memes (149).jpg: image file is truncated (3 bytes not processed)


Processing images: 100%|██████████| 3495/3495 [01:49<00:00, 31.90it/s]
Processing images: 100%|██████████| 873/873 [00:27<00:00, 31.90it/s]


Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFXLMRobertaModel.

All the layers of TFXLMRobertaModel were initialized from the model checkpoint at xlm-roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg19/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5
Epoch 1/20


ValueError: in user code:

    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1051, in train_function  *
        return step_function(self, iterator)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1040, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1030, in run_step  **
        outputs = model.train_step(data)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 890, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 949, in compute_loss
        y, y_pred, sample_weight, regularization_losses=self.losses)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/compile_utils.py", line 238, in __call__
        total_loss_metric_value = tf.add_n(loss_metric_values)

    ValueError: Shapes must be equal rank, but are 1 and 0
    	From merging shape 0 with other shapes. for '{{node AddN_1}} = AddN[N=2, T=DT_FLOAT](sigmoid_focal_crossentropy/weighted_loss/Mul, AddN)' with input shapes: [?], [].


Final

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
from transformers import TFXLMRobertaModel, XLMRobertaTokenizer
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import os
import math

# Load and preprocess data
def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df


def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]


def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.vgg19.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))


def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)


def augment_images(images):
    """Apply stronger data augmentation"""
    datagen = ImageDataGenerator(
        rotation_range=30,
        width_shift_range=0.3,
        height_shift_range=0.3,
        shear_range=0.3,
        zoom_range=0.3,
        horizontal_flip=True,
        fill_mode='nearest',
        brightness_range=[0.8, 1.2]
    )
    return datagen.flow(images, batch_size=16, shuffle=True)


# Multimodal Model Definition
class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        # Use XLM-Roberta for multilingual capabilities
        self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
        self.bert_model = TFXLMRobertaModel.from_pretrained('xlm-roberta-base')

    def build_model(self):
        # Image branch (VGG19 with fine-tuning)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)

        for layer in vgg19.layers[:-8]:  # Fine-tune deeper layers
            layer.trainable = False

        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(x)
        x = Dropout(0.4)(x)
        image_features = BatchNormalization()(x)

        # Text branch (XLM-Roberta)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        bert_outputs = self.bert_model([input_ids, attention_mask])
        pooled_output = bert_outputs[1]  # Use the pooled output
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(pooled_output)
        x = Dropout(0.4)(x)
        text_features = BatchNormalization()(x)

        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(combined)
        x = Dropout(0.3)(x)
        x = BatchNormalization()(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        return model

    def prepare_text(self, texts):
        """Tokenize texts using XLM-Roberta tokenizer"""
        encodings = self.tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']


# Cosine Annealing Learning Rate Scheduler
def lr_schedule(epoch):
    initial_lr = 1e-4
    if epoch < 5:  # Warm-up phase
        return initial_lr * (epoch + 1) / 5
    else:
        return initial_lr * tf.math.exp(-0.1 * (epoch - 5))


# Model Training
def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()

    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)

    # Convert labels to one-hot encoding
    train_labels_onehot = tf.keras.utils.to_categorical(train_labels, num_classes=3)
    val_labels_onehot = tf.keras.utils.to_categorical(val_labels, num_classes=3)

    # Compute class weights for imbalanced datasets
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_labels),
        y=train_labels
    )
    class_weights = dict(enumerate(class_weights))

    # Compile the model
    model.compile(
        optimizer=Adam(learning_rate=1e-4),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    # Callbacks
    lr_callback = tf.keras.callbacks.LearningRateScheduler(lr_schedule)
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        lr_callback
    ]

    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels_onehot,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels_onehot
        ),
        class_weight=class_weights,
        epochs=epochs,
        batch_size=32,
        callbacks=callbacks
    )

    return model, history


# Main Function
def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())

    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)

    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])

    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )

    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )

    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])

    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)

    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]

    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    return model, history


if __name__ == "__main__":
    main()


Processing images:  67%|██████▋   | 2326/3495 [01:11<00:39, 29.51it/s]

Error processing image /kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes/nurani-memes (149).jpg: image file is truncated (3 bytes not processed)


Processing images: 100%|██████████| 3495/3495 [01:46<00:00, 32.67it/s]
Processing images: 100%|██████████| 873/873 [00:26<00:00, 32.99it/s]


Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFXLMRobertaModel.

All the layers of TFXLMRobertaModel were initialized from the model checkpoint at xlm-roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg19/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


All model checkpoint layers were used when initializing TFXLMRobertaModel.

All the layers of TFXLMRobertaModel were initialized from the model checkpoint at xlm-roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.


Predictions saved to submission.csv


In [2]:
pip install tensorflow numpy pandas transformers vit-keras tqdm scikit-learn pillow


/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Collecting vit-keras
  Downloading vit_keras-0.1.2-py3-none-any.whl (24 kB)
Collecting protobuf<3.20,>=3.9.2
  Downloading protobuf-3.19.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting validators
  Downloading validators-0.20.0.tar.gz (30 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: validators
  Building wheel for validators (setup.py) ... [?25ldone
[?25h  Created wheel for validators: filename=validators-0.20.0-py3-none-any.whl size=19582 sha256=0e80d0aa3d43caace847c9389f963294aad00996ec91addd3a2974edc0e1ff6d
  Stored in directory: /root/.cache/pip/wheels/5f/55/ab/36a76989f7f88d9ca7b1f68da6d94252bb6a8d6ad4f18e04e9
Successfully built validators
Installing collected packages: v

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Dense, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
from vit_keras import vit
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.imagenet_utils.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))

def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)

class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
        
    def build_model(self):
        # Image branch (Vision Transformer)
        vit_model = vit.vit_b16(
            image_size=224,
            pretrained=True,
            include_top=False,
            pretrained_top=False
        )
        image_input = Input(shape=(224, 224, 3), name='image_input')
        x = vit_model(image_input)  # Output: (None, 768)
        x = Dense(256, activation='relu')(x)  # Add Dense layer instead of pooling
        x = Dropout(0.3)(x)
        image_features = LayerNormalization()(x)

        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        bert_outputs = self.bert_model([input_ids, attention_mask])[0]
        pooled_output = tf.reduce_mean(bert_outputs, axis=1)
        x = Dense(256, activation='relu')(pooled_output)
        x = Dropout(0.3)(x)
        text_features = LayerNormalization()(x)

        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model
    
    def prepare_text(self, texts):
        """Tokenize texts using BERT tokenizer"""
        encodings = self.bert_tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()
    
    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)
    
    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]
    
    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=16,
        callbacks=callbacks
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )
    
    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])
    
    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()


Processing images:  67%|██████▋   | 2325/3495 [01:01<00:34, 33.75it/s]

Error processing image /kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes/nurani-memes (149).jpg: image file is truncated (3 bytes not processed)


Processing images: 100%|██████████| 3495/3495 [01:30<00:00, 38.45it/s]
Processing images: 100%|██████████| 873/873 [00:22<00:00, 38.73it/s]
Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Predictions saved to submission.csv


In [2]:
pip install tensorflow numpy pandas transformers vit-keras tqdm scikit-learn pillow


/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Collecting vit-keras
  Downloading vit_keras-0.1.2-py3-none-any.whl (24 kB)
Collecting protobuf<3.20,>=3.9.2
  Downloading protobuf-3.19.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting validators
  Downloading validators-0.20.0.tar.gz (30 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: validators
  Building wheel for validators (setup.py) ... [?25ldone
[?25h  Created wheel for validators: filename=validators-0.20.0-py3-none-any.whl size=19582 sha256=d15fae45053b4072f048f1a3b59eb610e22a5b2924fa8c862dcb5b0321fdffb3
  Stored in directory: /root/.cache/pip/wheels/5f/55/ab/36a76989f7f88d9ca7b1f68da6d94252bb6a8d6ad4f18e04e9
Successfully built validators
Installing collected packages: v

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Dense, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
from vit_keras import vit
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.imagenet_utils.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))

def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)

class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
        
    def build_model(self):
        # Image branch (Vision Transformer)
        vit_model = vit.vit_b16(
            image_size=224,
            pretrained=True,
            include_top=False,
            pretrained_top=False
        )
        image_input = Input(shape=(224, 224, 3), name='image_input')
        x = vit_model(image_input)  # Output: (None, 768)
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)  # Add Dense layer with L2 regularization
        x = Dropout(0.4)(x)
        image_features = LayerNormalization()(x)

        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        bert_outputs = self.bert_model([input_ids, attention_mask])[0]
        pooled_output = tf.reduce_mean(bert_outputs, axis=1)
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(pooled_output)
        x = Dropout(0.4)(x)
        text_features = LayerNormalization()(x)

        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(combined)
        x = Dropout(0.4)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)
        x = Dropout(0.3)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model
    
    def prepare_text(self, texts):
        """Tokenize texts using BERT tokenizer"""
        encodings = self.bert_tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()
    
    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)
    
    # Compute class weights
    class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
    class_weight_dict = dict(enumerate(class_weights))

    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',  # Monitor validation loss
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        ),
        tf.keras.callbacks.LearningRateScheduler(lambda epoch: 1e-5 if epoch > 10 else 2e-5)
    ]
    
    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=8,  # Decreased batch size for better generalization
        class_weight=class_weight_dict,  # Add class weights
        callbacks=callbacks
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )
    
    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])
    
    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()



Processing images:  66%|██████▋   | 2324/3495 [01:15<00:39, 29.84it/s]

Error processing image /kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes/nurani-memes (149).jpg: image file is truncated (3 bytes not processed)


Processing images: 100%|██████████| 3495/3495 [01:56<00:00, 30.07it/s]
Processing images: 100%|██████████| 873/873 [00:30<00:00, 28.60it/s]


Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.01G [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Downloading data from https://github.com/faustomorales/vit-keras/releases/download/dl/ViT-B_16_imagenet21k+imagenet2012.npz




Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
  9/372 [..............................] - ETA: 4:06 - loss: 10.5443 - accuracy: 0.3333

In [2]:
pip install tensorflow pandas numpy tqdm scikit-learn transformers vit-keras


/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Collecting vit-keras
  Downloading vit_keras-0.1.2-py3-none-any.whl (24 kB)
Collecting protobuf<3.20,>=3.9.2
  Downloading protobuf-3.19.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting validators
  Downloading validators-0.20.0.tar.gz (30 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: validators
  Building wheel for validators (setup.py) ... [?25ldone
[?25h  Created wheel for validators: filename=validators-0.20.0-py3-none-any.whl size=19582 sha256=c30035cff4317efce7adba940830654d609316bd1be3d897d53bee2033a0eb6b
  Stored in directory: /root/.cache/pip/wheels/5f/55/ab/36a76989f7f88d9ca7b1f68da6d94252bb6a8d6ad4f18e04e9
Successfully built validators
Installing collected packages: v

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Dense, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
from vit_keras import vit
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.imagenet_utils.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))

def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)

# Image augmentation
def augment_images(images):
    datagen = ImageDataGenerator(
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest'
    )
    return datagen.flow(images, batch_size=len(images), shuffle=False).next()

class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
        
    def build_model(self):
        # Image branch (Vision Transformer)
        vit_model = vit.vit_b16(
            image_size=224,
            pretrained=True,
            include_top=False,
            pretrained_top=False
        )
        image_input = Input(shape=(224, 224, 3), name='image_input')
        x = vit_model(image_input)  # Output: (None, 768)
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)  # Add Dense layer with L2 regularization
        x = Dropout(0.4)(x)
        image_features = LayerNormalization()(x)

        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        bert_outputs = self.bert_model([input_ids, attention_mask])[0]
        pooled_output = tf.reduce_mean(bert_outputs, axis=1)
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(pooled_output)
        x = Dropout(0.4)(x)
        text_features = LayerNormalization()(x)

        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(combined)
        x = Dropout(0.4)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)
        x = Dropout(0.3)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model
    
    def prepare_text(self, texts):
        """Tokenize texts using BERT tokenizer"""
        encodings = self.bert_tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()
    
    # Augment training images
    train_images = augment_images(train_images)

    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)
    
    # Compute class weights
    class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
    class_weight_dict = dict(enumerate(class_weights))

    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        ),
        tf.keras.callbacks.LearningRateScheduler(lambda epoch: 1e-5 if epoch > 10 else 2e-5)
    ]
    
    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=8,
        class_weight=class_weight_dict,
        callbacks=callbacks
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )
    
    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])
    
    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()


Processing images:  66%|██████▋   | 2324/3495 [01:07<00:35, 32.64it/s]

Error processing image /kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes/nurani-memes (149).jpg: image file is truncated (3 bytes not processed)


Processing images: 100%|██████████| 3495/3495 [01:41<00:00, 34.46it/s]
Processing images: 100%|██████████| 873/873 [00:25<00:00, 34.03it/s]


Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.01G [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Downloading data from https://github.com/faustomorales/vit-keras/releases/download/dl/ViT-B_16_imagenet21k+imagenet2012.npz




Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
  8/372 [..............................] - ETA: 4:12 - loss: 12.8576 - accuracy: 0.2656

In [2]:
pip install tensorflow pandas numpy tqdm scikit-learn transformers vit-keras langdetect googletrans==4.0.0-rc1


/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Collecting vit-keras
  Downloading vit_keras-0.1.2-py3-none-any.whl (24 kB)
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting httpx==0.13.3
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpcore==0.9.*
  Downloading httpcore-0.9.1-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting c

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Dense, Input, Dropout, Concatenate, LayerNormalization, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
from tensorflow.keras.applications import VGG19
from tensorflow.keras.applications.vgg19 import preprocess_input as vgg_preprocess
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder
from langdetect import detect
from googletrans import Translator

translator = Translator()

# Bengali stopwords list (manually curated or from libraries like bnltk)
BENGALI_STOPWORDS = set(["এবং", "কিন্তু", "যদি", "তবে", "অতএব", "অথচ", "যেমন", "তেমন", "কেন", "কখন", "যা", "তাহলে"])

# Function to detect language and transliterate Banglish to Bengali
def preprocess_text(text):
    try:
        lang = detect(text)
        if lang == "bn":  # Bengali
            text = text
        elif lang == "en":  # English
            text = text.lower()  # Convert to lowercase
        else:  # Banglish or other languages
            text = translator.translate(text, src="en", dest="bn").text

        # Remove special characters and punctuation
        text = ''.join(e for e in text if e.isalnum() or e.isspace())
        # Remove stopwords
        text = ' '.join(word for word in text.split() if word not in BENGALI_STOPWORDS)
    except Exception as e:
        print(f"Error processing text: {text}, {e}")
        return ""
    return text

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Preprocess captions
    train_df['Captions'] = train_df['Captions'].apply(preprocess_text)
    test_df['Captions'] = test_df['Captions'].apply(preprocess_text)

    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = vgg_preprocess(img)  # VGG19 preprocessing
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))

def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)

class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
        
    def build_model(self):
        # Image branch (VGG19)
        vgg19_base = VGG19(weights="imagenet", include_top=False, input_shape=(224, 224, 3))
        for layer in vgg19_base.layers:
            layer.trainable = False  # Freeze VGG19 layers

        image_input = Input(shape=(224, 224, 3), name='image_input')
        x = vgg19_base(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)
        x = Dropout(0.4)(x)
        image_features = LayerNormalization()(x)

        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        bert_outputs = self.bert_model([input_ids, attention_mask])[0]
        pooled_output = tf.reduce_mean(bert_outputs, axis=1)
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(pooled_output)
        x = Dropout(0.4)(x)
        text_features = LayerNormalization()(x)

        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(combined)
        x = Dropout(0.4)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)
        x = Dropout(0.3)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model
    
    def prepare_text(self, texts):
        """Tokenize texts using BERT tokenizer"""
        encodings = self.bert_tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()
    
    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)
    
    # Compute class weights
    class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
    class_weight_dict = dict(enumerate(class_weights))

    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        ),
        tf.keras.callbacks.LearningRateScheduler(lambda epoch: 1e-5 if epoch > 10 else 2e-5)
    ]
    
    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=8,
        class_weight=class_weight_dict,
        
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )
    
    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])
    
    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()


Processing images:  67%|██████▋   | 2325/3495 [01:05<00:37, 31.06it/s]

Error processing image /kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes/nurani-memes (149).jpg: image file is truncated (3 bytes not processed)


Processing images: 100%|██████████| 3495/3495 [01:38<00:00, 35.53it/s]
Processing images: 100%|██████████| 873/873 [00:25<00:00, 34.67it/s]


Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.01G [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg19/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


SCHOLAR

In [1]:
import torch
import torch.nn as nn
from torchvision import models, transforms
from transformers import BertModel, BertTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Image Preprocessing and Augmentation
image_transforms = {
    "train": transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(15),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])  # Standard normalization for ImageNet
    ]),
    "val": transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

# Define Dataset
class MultimodalDataset(Dataset):
    def __init__(self, image_paths, text_data, labels, transform=None, tokenizer=None):
        self.image_paths = image_paths
        self.text_data = text_data
        self.labels = labels
        self.transform = transform
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Load and preprocess image
        image = Image.open(self.image_paths[idx]).convert("RGB")
        if self.transform:
            image = self.transform(image)

        # Tokenize text
        text = self.text_data[idx]
        encoded_text = self.tokenizer(
            text, padding="max_length", truncation=True, max_length=128, return_tensors="pt"
        )

        # Extract inputs and attention mask
        input_ids = encoded_text["input_ids"].squeeze(0)
        attention_mask = encoded_text["attention_mask"].squeeze(0)

        label = torch.tensor(self.labels[idx])
        return image, input_ids, attention_mask, label

# Define Image Model (ResNet)
class ImageModel(nn.Module):
    def __init__(self, output_dim):
        super(ImageModel, self).__init__()
        resnet = models.resnet18(pretrained=True)
        self.feature_extractor = nn.Sequential(*list(resnet.children())[:-1])
        self.fc = nn.Linear(resnet.fc.in_features, output_dim)

    def forward(self, x):
        features = self.feature_extractor(x)
        features = features.view(features.size(0), -1)
        return self.fc(features)

# Define Text Model (BERT)
class TextModel(nn.Module):
    def __init__(self, output_dim):
        super(TextModel, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.fc = nn.Linear(self.bert.config.hidden_size, output_dim)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_token_output = outputs.last_hidden_state[:, 0, :]
        return self.fc(cls_token_output)

# Define Combined Model
class MultimodalModel(nn.Module):
    def __init__(self, img_output_dim, text_output_dim, final_output_dim):
        super(MultimodalModel, self).__init__()
        self.image_model = ImageModel(img_output_dim)
        self.text_model = TextModel(text_output_dim)
        self.fc = nn.Linear(img_output_dim + text_output_dim, final_output_dim)

    def forward(self, image, input_ids, attention_mask):
        img_features = self.image_model(image)
        text_features = self.text_model(input_ids, attention_mask)
        combined_features = torch.cat((img_features, text_features), dim=1)
        return self.fc(combined_features)

# Training Function
def train(model, dataloaders, criterion, optimizer, scheduler, device, epochs=10):
    best_acc = 0.0
    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")
        for phase in ["train", "val"]:
            if phase == "train":
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            preds, targets = [], []

            for images, input_ids, attention_mask, labels in dataloaders[phase]:
                images = images.to(device)
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()

                # Forward pass
                with torch.set_grad_enabled(phase == "train"):
                    outputs = model(images, input_ids, attention_mask)
                    loss = criterion(outputs, labels)
                    _, predictions = torch.max(outputs, 1)

                    if phase == "train":
                        loss.backward()
                        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
                        optimizer.step()

                # Track metrics
                running_loss += loss.item() * images.size(0)
                preds.extend(predictions.cpu().numpy())
                targets.extend(labels.cpu().numpy())

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = accuracy_score(targets, preds)
            print(f"{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}")

            # Track the best model
            if phase == "val" and epoch_acc > best_acc:
                best_acc = epoch_acc
                torch.save(model.state_dict(), "best_model.pth")
        scheduler.step()

    print(f"Best Validation Accuracy: {best_acc:.4f}")

# Evaluation Function
def evaluate(model, dataloader, device):
    model.eval()
    preds, targets = [], []

    with torch.no_grad():
        for images, input_ids, attention_mask, labels in dataloader:
            images = images.to(device)
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(images, input_ids, attention_mask)
            _, predictions = torch.max(outputs, 1)
            preds.extend(predictions.cpu().numpy())
            targets.extend(labels.cpu().numpy())

    # Compute Metrics
    acc = accuracy_score(targets, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(targets, preds, average="weighted")
    print(f"Accuracy: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

# Main Function
if __name__ == "__main__":
    # Load dataset (replace with your dataset)
    from PIL import Image
    import os

    # Example: Replace with actual data
    image_paths = ["path_to_images/image1.jpg", "path_to_images/image2.jpg"]  # Add your image paths
    text_data = ["Sample text 1", "Sample text 2"]  # Add your text data
    labels = [0, 1]  # Add your labels

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    dataset = {
        "train": MultimodalDataset(image_paths, text_data, labels, image_transforms["train"], tokenizer),
        "val": MultimodalDataset(image_paths, text_data, labels, image_transforms["val"], tokenizer),
    }
    dataloaders = {
        phase: DataLoader(dataset[phase], batch_size=4, shuffle=(phase == "train")) for phase in ["train", "val"]
    }

    # Initialize model, loss, optimizer, and scheduler
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = MultimodalModel(img_output_dim=128, text_output_dim=128, final_output_dim=2).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

    # Train and Evaluate
    train(model, dataloaders, criterion, optimizer, scheduler, device, epochs=10)
    evaluate(model, dataloaders["val"], device)


  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'transformers'