In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install required libraries
!pip install transformers torch torchvision tensorflow pandas pillow nltk

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-

In [3]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.layers import Dense, Concatenate, Dropout
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# Set paths
base_path = '/content/drive/MyDrive/dataset'
image_folder = os.path.join(base_path, 'images')
train_csv = os.path.join(base_path, 'data_train.csv')
eval_csv = os.path.join(base_path, 'data_eval.csv')

In [5]:
# Load and preprocess data
train_df = pd.read_csv(train_csv)
eval_df = pd.read_csv(eval_csv)

In [22]:
# Take a smaller portion of the dataset
train_df = train_df.sample(n=200, random_state=42)
eval_df = eval_df.sample(n=50, random_state=42)

In [23]:
# Load pre-trained models
vgg_model = VGG16(weights='imagenet', include_top=False)
bert_model = TFBertModel.from_pretrained('bert-base-uncased')
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [24]:
# Image preprocessing function
def preprocess_image(image_path):
    try:
        if not image_path.lower().endswith('.png'):
            image_path += '.png'
        img = load_img(image_path, target_size=(224, 224))
        img_array = img_to_array(img)
        img_array = np.expand_dims(img_array, axis=0)
        img_array = preprocess_input(img_array)
        return img_array
    except FileNotFoundError:
        print(f"Image not found: {image_path}")
        return None
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return None

In [25]:
# Extract image features
def extract_image_features(image_path):
    img_array = preprocess_image(image_path)
    if img_array is None:
        return np.zeros((7 * 7 * 512,))  # Return zero vector if image is not found
    features = vgg_model.predict(img_array)
    return features.flatten()

In [26]:
# Question preprocessing function
def preprocess_question(question):
    inputs = bert_tokenizer(question, return_tensors='tf', padding=True, truncation=True, max_length=64)
    outputs = bert_model(inputs)
    return tf.squeeze(outputs.last_hidden_state[:, 0, :])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 599ms/step


In [27]:
# Load answer space
def load_answer_space(file_path):
    with open(file_path, 'r') as f:
        answer_space = [line.strip() for line in f]
    if '<unk>' not in answer_space:
        answer_space.append('<unk>')
    return answer_space

answer_space = load_answer_space(os.path.join(base_path, 'answer_space.txt'))
num_answers = len(answer_space)
answer_to_idx = {ans: idx for idx, ans in enumerate(answer_space)}

In [28]:
# Create VQA model
def create_vqa_model(image_features_shape, question_features_shape, num_answers):
    image_input = tf.keras.Input(shape=image_features_shape)
    question_input = tf.keras.Input(shape=question_features_shape)

    combined = Concatenate()([image_input, question_input])

    x = Dense(512, activation='relu')(combined)
    x = Dropout(0.5)(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.5)(x)

    output = Dense(num_answers, activation='softmax')(x)

    model = Model(inputs=[image_input, question_input], outputs=output)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [29]:

# Check images function
def check_images(df):
    total_images = len(df)
    missing_images = 0
    for _, row in df.iterrows():
        img_path = os.path.join(image_folder, row['image_id'])
        if not img_path.lower().endswith('.png'):
            img_path += '.png'
        if not os.path.exists(img_path):
            missing_images += 1
            print(f"Missing image: {img_path}")

    print(f"Total images: {total_images}")
    print(f"Missing images: {missing_images}")
    print(f"Percentage of missing images: {missing_images/total_images*100:.2f}%")

In [30]:
# Check images before training
print("Checking training images:")
check_images(train_df)
print("\nChecking validation images:")
check_images(eval_df)

Checking training images:
Total images: 200
Missing images: 0
Percentage of missing images: 0.00%

Checking validation images:
Total images: 50
Missing images: 0
Percentage of missing images: 0.00%
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 630ms/step


In [32]:
# Create dataset
def create_dataset(df, batch_size=32):
    def generator():
        for _, row in df.iterrows():
            img_path = os.path.join(image_folder, row['image_id'])
            if not img_path.lower().endswith('.png'):
                img_path += '.png'

            img_features = extract_image_features(img_path)
            if img_features is not None:
                question_features = preprocess_question(row['question'])

                answer = row['answer']
                answer_idx = answer_to_idx.get(answer, answer_to_idx['<unk>'])
                answer_vector = tf.keras.utils.to_categorical(answer_idx, num_classes=num_answers)

                yield (img_features, question_features), answer_vector

    return tf.data.Dataset.from_generator(
        generator,
        output_signature=(
            (tf.TensorSpec(shape=(7*7*512,), dtype=tf.float32),
             tf.TensorSpec(shape=(768,), dtype=tf.float32)),
            tf.TensorSpec(shape=(num_answers,), dtype=tf.float32)
        )
    ).batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [33]:
# Create and train the model
image_features_shape = (7 * 7 * 512,)  # VGG16 output shape
question_features_shape = (768,)  # BERT output shape

vqa_model = create_vqa_model(image_features_shape, question_features_shape, num_answers)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 555ms/step


In [34]:
# Split train data into train and validation
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

In [37]:
# Create datasets
batch_size = 16
train_dataset = create_dataset(train_df, batch_size)
val_dataset = create_dataset(val_df, batch_size)

In [38]:
# Train the model
steps_per_epoch = len(train_df) // batch_size
validation_steps = len(val_df) // batch_size

history = vqa_model.fit(
    train_dataset,
    epochs=3,
    validation_data=val_dataset,
    steps_per_epoch=steps_per_epoch,
    validation_steps=validation_steps
)

# Save the model
vqa_model.save('/content/drive/MyDrive/vqa_model.h5')

Epoch 1/3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 965ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 982ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 921ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 989ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 931ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 944ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 895ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 578ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 562ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 544ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m



In [39]:
# Inference function
def answer_question(model, image_path, question, true_answer=None):
    if not image_path.lower().endswith('.png'):
        image_path += '.png'
    image_features = extract_image_features(image_path)
    question_features = preprocess_question(question)

    prediction = model.predict([np.expand_dims(image_features, axis=0), np.expand_dims(question_features, axis=0)])
    predicted_idx = np.argmax(prediction[0])
    predicted_answer = answer_space[predicted_idx]

    bleu_score = None
    if true_answer is not None:
        reference = nltk.word_tokenize(true_answer.lower())
        candidate = nltk.word_tokenize(predicted_answer.lower())
        bleu_score = sentence_bleu([reference], candidate)

    return predicted_answer, bleu_score

In [40]:

# Evaluation function
def evaluate_model(model, eval_df):
    eval_dataset = create_dataset(eval_df, batch_size=16)
    steps = len(eval_df) // 16

    results = model.evaluate(eval_dataset, steps=steps)
    print(f"Evaluation Loss: {results[0]}, Accuracy: {results[1]}")

    # Calculate BLEU score
    bleu_scores = []
    for _, row in eval_df.iterrows():
        image_path = os.path.join(image_folder, row['image_id'])
        question = row['question']
        true_answer = row['answer']

        predicted_answer, bleu_score = answer_question(model, image_path, question, true_answer)
        if bleu_score is not None:
            bleu_scores.append(bleu_score)

    avg_bleu_score = sum(bleu_scores) / len(bleu_scores)
    print(f"Average BLEU Score: {avg_bleu_score:.4f}")


In [41]:
# Evaluate the model
print("Evaluating the model:")
evaluate_model(vqa_model, eval_df)

Evaluating the model:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 941ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 907ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 872ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 697ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 544ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 551ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 557ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 554ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 565ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 567ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 553ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━