In [1]:
# Core dependencies: Hugging Face `datasets` plus the transformers classes used
# below to build the vision-encoder / text-decoder captioning model.
import os
import datasets
from transformers import VisionEncoderDecoderModel, AutoFeatureExtractor,AutoTokenizer
# Switch off Weights & Biases logging for the Trainer.
# NOTE(review): transformers reports this variable as deprecated (see the
# warning printed when the training arguments are created) and points to
# `report_to` as the replacement.
os.environ["WANDB_DISABLED"] = "true"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import nltk
# Make sure the Punkt sentence-tokenizer data is available: look it up locally
# first and only download (quietly) when the lookup fails.
try:
    nltk.data.find("tokenizers/punkt")
except (LookupError, OSError):
    nltk.download("punkt", quiet=True)

In [3]:
from transformers import VisionEncoderDecoderModel, AutoTokenizer, AutoFeatureExtractor

# Pretrained checkpoints that are stitched together: one for the image encoder
# side and one for the text decoder side.
image_encoder_model = "google/vit-base-patch16-224-in21k"
text_decode_model = "gpt2"

# Build the combined encoder-decoder model from the two checkpoints. The load
# message printed below lists the decoder's cross-attention weights as newly
# initialized, i.e. they are not part of the GPT-2 checkpoint.
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_pretrained_model_name_or_path=image_encoder_model,
    decoder_pretrained_model_name_or_path=text_decode_model,
)

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.crossattention.c_attn.bias', 'h.0.crossattention.c_attn.weight', 'h.0.crossattention.c_proj.bias', 'h.0.crossattention.c_proj.weight', 'h.0.crossattention.q_attn.bias', 'h.0.crossattention.q_attn.weight', 'h.0.ln_cross_attn.bias', 'h.0.ln_cross_attn.weight', 'h.1.crossattention.c_attn.bias', 'h.1.crossattention.c_attn.weight', 'h.1.crossattention.c_proj.bias', 'h.1.crossattention.c_proj.weight', 'h.1.crossattention.q_attn.bias', 'h.1.crossattention.q_attn.weight', 'h.1.ln_cross_attn.bias', 'h.1.ln_cross_attn.weight', 'h.10.crossattention.c_attn.bias', 'h.10.crossattention.c_attn.weight', 'h.10.crossattention.c_proj.bias', 'h.10.crossattention.c_proj.weight', 'h.10.crossattention.q_attn.bias', 'h.10.crossattention.q_attn.weight', 'h.10.ln_cross_attn.bias', 'h.10.ln_cross_attn.weight', 'h.11.crossattention.c_attn.bias', 'h.11.crossattention.c_attn.weight', 'h.11.crossat

In [4]:
# Text side: tokenizer that belongs to the decoder checkpoint.
tokenizer = AutoTokenizer.from_pretrained(text_decode_model)
# Image side: feature extractor that belongs to the encoder checkpoint.
feature_extractor = AutoFeatureExtractor.from_pretrained(image_encoder_model)

2024-04-01 09:03:43.559035: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
# The GPT-2 tokenizer defines bos/eos tokens but no pad or decoder-start token,
# so end-of-sequence doubles as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Copy the tokenizer's special-token ids onto the model config, one field at a
# time (eos, decoder start, pad).
for config_field, token_id in (
    ("eos_token_id", tokenizer.eos_token_id),
    ("decoder_start_token_id", tokenizer.bos_token_id),
    ("pad_token_id", tokenizer.pad_token_id),
):
    setattr(model.config, config_field, token_id)

In [6]:
# Write the freshly assembled model, the image feature extractor and the
# tokenizer into one directory.
output_dir = "vit-gpt-model"
for component in (model, feature_extractor):
    component.save_pretrained(output_dir)
# Saved last so the cell still displays the tuple of tokenizer files written.
tokenizer.save_pretrained(output_dir)



('vit-gpt-model/tokenizer_config.json',
 'vit-gpt-model/special_tokens_map.json',
 'vit-gpt-model/vocab.json',
 'vit-gpt-model/merges.txt',
 'vit-gpt-model/added_tokens.json',
 'vit-gpt-model/tokenizer.json')

In [7]:
import os

# Root folder that holds the three Flickr8k split files and the image folder.
# The default is the location used when this notebook was written; set the
# IMAGE_TEXT_DATA_DIR environment variable to run on another machine without
# editing this cell.
data_dir = os.environ.get('IMAGE_TEXT_DATA_DIR',
                          '/home/rinzler/Github/Image-Text-Matching/data')

# Split files and image directory, all derived from the single root above.
train_file_path = os.path.join(data_dir, 'flickr8k.TrainImages.txt')
test_file_path = os.path.join(data_dir, 'flickr8k.TestImages.txt')
validation_file_path = os.path.join(data_dir, 'flickr8k.DevImages.txt')
images_directory = os.path.join(data_dir, 'images')

In [8]:
# Parse the three split files into lists of example records.
# Every usable line has exactly three tab-separated fields:
#   <image file name> \t <caption> \t <"match" or anything else>
# Lines with a different number of fields are skipped.
import os

data_train = []
data_val = []
data_test = []
file_paths = [train_file_path, validation_file_path, test_file_path]
store_list = [data_train, data_val, data_test]

# Walk files and destination lists in lockstep instead of keeping a manual
# index counter.
for split_path, split_records in zip(file_paths, store_list):
    # Explicit encoding so the result does not depend on the machine's locale.
    with open(split_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) != 3:
                continue
            image_name, caption, match_flag = parts
            split_records.append({
                'image_path': os.path.join(images_directory, image_name),
                'caption': caption,
                # 1 for a matching image/caption pair, 0 otherwise.
                'label': 1 if match_flag == 'match' else 0
            })
    

In [9]:
import pandas as pd

# One DataFrame per split, built from the record lists in train / validation /
# test order.
df_train, df_val, df_test = (
    pd.DataFrame(split_records)
    for split_records in (data_train, data_val, data_test)
)

In [10]:
len(df_train), len(df_val), len(df_test)

(19386, 1164, 1161)

In [11]:
df_train

Unnamed: 0,image_path,caption,label
0,/home/rinzler/Github/Image-Text-Matching/data/...,A black dog and a spotted dog are fighting,1
1,/home/rinzler/Github/Image-Text-Matching/data/...,A black dog and a tri-colored dog playing with...,1
2,/home/rinzler/Github/Image-Text-Matching/data/...,Two dogs on pavement moving toward each other .,1
3,/home/rinzler/Github/Image-Text-Matching/data/...,A tan and black dog opens its mouth for a red ...,0
4,/home/rinzler/Github/Image-Text-Matching/data/...,The little girl is being swung around by her a...,0
...,...,...,...
19381,/home/rinzler/Github/Image-Text-Matching/data/...,A man is doing a wheelie on a mountain bike .,1
19382,/home/rinzler/Github/Image-Text-Matching/data/...,Man on a bicycle riding on only one wheel .,1
19383,/home/rinzler/Github/Image-Text-Matching/data/...,A woman paints a picture on a girl s face .,0
19384,/home/rinzler/Github/Image-Text-Matching/data/...,A cat standing on carpet is interested in a pi...,0


In [12]:
# Write each split to its CSV file (train, then validation, then test).
# to_csv is called without extra options, so the frame index is written too;
# it shows up as the 'Unnamed: 0' column once the files are loaded again.
# The cell value is the tuple of the three to_csv return values.
tuple(
    frame.to_csv(csv_name)
    for frame, csv_name in (
        (df_train, 'train.csv'),
        (df_val, 'val.csv'),
        (df_test, 'test.csv'),
    )
)

(None, None, None)

In [13]:
from datasets import load_dataset

# Split name -> CSV file written for that split.
csv_splits = {
    'train': 'train.csv',
    'validation': 'val.csv',
    'test': 'test.csv',
}

# Load all three CSV files as one DatasetDict keyed by split name.
ds = load_dataset('csv', data_files=csv_splits)

Generating train split: 19386 examples [00:00, 498606.65 examples/s]
Generating validation split: 1164 examples [00:00, 196980.83 examples/s]
Generating test split: 1161 examples [00:00, 179193.63 examples/s]


In [14]:
ds

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'image_path', 'caption', 'label'],
        num_rows: 19386
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'image_path', 'caption', 'label'],
        num_rows: 1164
    })
    test: Dataset({
        features: ['Unnamed: 0', 'image_path', 'caption', 'label'],
        num_rows: 1161
    })
})

In [15]:
# Rename the column 'Unnamed: 0' to 'index' in each dataset.
# The membership check makes the cell safe to run more than once: after the
# first run the old column name is gone, and renaming a column that does not
# exist raises an error.
for split in ds.keys():
    if 'Unnamed: 0' in ds[split].column_names:
        ds[split] = ds[split].rename_column('Unnamed: 0', 'index')

In [16]:
ds

DatasetDict({
    train: Dataset({
        features: ['index', 'image_path', 'caption', 'label'],
        num_rows: 19386
    })
    validation: Dataset({
        features: ['index', 'image_path', 'caption', 'label'],
        num_rows: 1164
    })
    test: Dataset({
        features: ['index', 'image_path', 'caption', 'label'],
        num_rows: 1161
    })
})

In [17]:
# print single example
ds['train'][0]

{'index': 0,
 'image_path': '/home/rinzler/Github/Image-Text-Matching/data/images/1001773457_577c3a7d70.jpg',
 'caption': 'A black dog and a spotted dog are fighting',
 'label': 1}

In [18]:
from PIL import Image

# text preprocessing step
def tokenization_fn(captions, max_target_length):
    """Tokenize captions into fixed-length label sequences.

    Uses the module-level ``tokenizer``. Captions shorter than
    ``max_target_length`` are padded up to it and longer ones are truncated,
    so every returned row has exactly ``max_target_length`` ids. Without the
    truncation a single long caption would produce an over-length row.

    Args:
        captions: list of caption strings.
        max_target_length: length of every returned id sequence.

    Returns:
        The tokenizer's ``input_ids``: one list of token ids per caption.
    """
    encoded = tokenizer(captions,
                        padding="max_length",
                        truncation=True,
                        max_length=max_target_length)

    return encoded.input_ids

# image preprocessing step
def feature_extraction_fn(image_paths, check_image=True):
    """
    Load images from disk and run them through the module-level
    ``feature_extractor``.

    Every file is opened inside a ``with`` block and converted to RGB. The
    conversion decodes the pixel data straight away (``Image.open`` on its own
    only reads the header), so corrupt files fail here rather than later, and
    the file handle is closed as soon as the converted copy exists. It also
    gives grayscale / RGBA / palette images the three channels that RGB images
    have.

    If `check_image` is `True`, files that cannot be opened or decoded are
    skipped with a warning. The result then holds one row per image that
    loaded, in input order, so it can be shorter than `image_paths`; a caller
    that pairs the rows with other per-example data has to drop the same
    examples itself.
    Otherwise, the exception is propagated.

    Args:
        image_paths: iterable of image file paths.
        check_image: skip unreadable images instead of raising.

    Returns:
        The ``pixel_values`` produced by the feature extractor (requested as
        NumPy via ``return_tensors="np"``).
    """
    import warnings

    def _load_rgb(image_file):
        # Decode while the file is open; the RGB copy outlives the handle.
        with Image.open(image_file) as img:
            return img.convert("RGB")

    if check_image:
        images = []
        for image_file in image_paths:
            try:
                images.append(_load_rgb(image_file))
            except Exception as exc:
                # Best-effort mode: report the file and carry on.
                warnings.warn(f"Skipping unreadable image {image_file!r}: {exc}")
    else:
        images = [_load_rgb(image_file) for image_file in image_paths]

    encoder_inputs = feature_extractor(images=images, return_tensors="np")

    return encoder_inputs.pixel_values

def preprocess_fn(examples, max_target_length, check_image = True):
    """Run tokenization + image feature extraction on one batch.

    Captions and images are kept aligned: when `check_image` is true, every
    image is first opened (header only, then closed again) and examples whose
    file cannot be opened are dropped together with their caption. Feature
    extraction is then run in strict mode on the surviving paths, so any later
    failure raises instead of silently producing fewer images than captions.

    Args:
        examples: batch dict with 'image_path' and 'caption' lists.
        max_target_length: forwarded to `tokenization_fn`.
        check_image: drop examples whose image file cannot be opened; when
            false, an unreadable image raises.

    Returns:
        Dict with 'labels' and 'pixel_values', one entry per kept example.
    """
    image_paths = examples['image_path']
    captions = examples['caption']

    if check_image:
        readable = []
        for image_path, caption in zip(image_paths, captions):
            try:
                # Cheap probe; the handle is released when the block exits.
                with Image.open(image_path):
                    pass
            except Exception:
                continue
            readable.append((image_path, caption))
        image_paths = [path for path, _ in readable]
        captions = [caption for _, caption in readable]

    model_inputs = {}
    model_inputs['labels'] = tokenization_fn(captions, max_target_length)
    # Paths were filtered above (or checking was disabled), so do not let the
    # extractor drop anything on its own.
    model_inputs['pixel_values'] = feature_extraction_fn(image_paths, check_image=False)

    return model_inputs

In [19]:
# Run preprocess_fn over every split in batches. The original columns are
# removed, so only the keys returned by preprocess_fn remain.
preprocess_kwargs = {"max_target_length": 128}
source_columns = ds['train'].column_names

processed_dataset = ds.map(
    preprocess_fn,
    batched=True,
    fn_kwargs=preprocess_kwargs,
    remove_columns=source_columns,
)

Map:   0%|          | 0/19386 [00:00<?, ? examples/s]

Map: 100%|██████████| 19386/19386 [01:20<00:00, 239.99 examples/s]
Map: 100%|██████████| 1164/1164 [00:04<00:00, 242.76 examples/s]
Map: 100%|██████████| 1161/1161 [00:04<00:00, 259.36 examples/s]


In [20]:
processed_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'pixel_values'],
        num_rows: 19386
    })
    validation: Dataset({
        features: ['labels', 'pixel_values'],
        num_rows: 1164
    })
    test: Dataset({
        features: ['labels', 'pixel_values'],
        num_rows: 1161
    })
})

In [21]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Training configuration; anything not listed keeps the library default.
training_args = Seq2SeqTrainingArguments(
    # Where checkpoints and the final model are written.
    output_dir="./image-captioning-output",
    # Same small batch size for training and evaluation.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluate once per epoch, using generation so that the metric function
    # receives generated token ids.
    evaluation_strategy="epoch",
    predict_with_generate=True,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [22]:
# Install / upgrade the training dependencies into the kernel's environment.
# NOTE(review): this cell runs after transformers has already been imported and
# used in earlier cells; as pip's note in the output says, the kernel may need
# a restart before updated packages are picked up.
%pip install transformers[torch]
%pip install accelerate -U

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [23]:
# Install the rouge_score package (presumably needed by the ROUGE metric that
# is loaded further down — confirm).
%pip install rouge_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [25]:
import evaluate
# Load the ROUGE metric; compute_metrics calls metric.compute(...) with the
# decoded predictions and references.
metric = evaluate.load("rouge")

Downloading builder script: 100%|██████████| 6.27k/6.27k [00:00<00:00, 10.6MB/s]


In [27]:
import numpy as np

# When True, compute_metrics replaces -100 entries in the labels with the pad
# token id before decoding them.
ignore_pad_token_for_loss = True


def postprocess_text(preds, labels):
    """Normalise decoded predictions and references for ROUGE scoring.

    Each text is stripped of surrounding whitespace and then re-joined with
    one sentence per line, because rougeLSum expects a newline after each
    sentence.

    Args:
        preds: list of decoded prediction strings.
        labels: list of decoded reference strings.

    Returns:
        Tuple ``(preds, labels)`` of the two processed lists.
    """
    stripped_preds = [text.strip() for text in preds]
    stripped_labels = [text.strip() for text in labels]

    def _one_sentence_per_line(texts):
        lines = []
        for text in texts:
            sentences = nltk.sent_tokenize(text)
            lines.append("\n".join(sentences))
        return lines

    return (_one_sentence_per_line(stripped_preds),
            _one_sentence_per_line(stripped_labels))


def compute_metrics(eval_preds):
    """Compute ROUGE scores and the mean generated length for one evaluation.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays; ``preds`` may
            itself be a tuple, in which case its first element is used.

    Returns:
        Dict with every score returned by the ROUGE metric (multiplied by 100
        and rounded to 4 decimals) plus ``gen_len``, the mean number of
        non-pad tokens per prediction.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # Predictions gathered over several batches can be padded with -100, which
    # the tokenizer cannot decode. Swap those entries for the pad id; this also
    # keeps them out of the gen_len count below.
    preds = np.asarray(preds)
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    if ignore_pad_token_for_loss:
        # Replace -100 in the labels as we can't decode them.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds,
                                                     decoded_labels)

    result = metric.compute(predictions=decoded_preds,
                            references=decoded_labels,
                            use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}

    # Length of each prediction, counted as its number of non-pad tokens.
    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    return result

In [28]:
from transformers import default_data_collator

# instantiate trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    # Preprocessed splits used for optimisation and for evaluation.
    train_dataset=processed_dataset['train'],
    eval_dataset=processed_dataset['validation'],
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
    # NOTE(review): the image feature extractor, not the text tokenizer, is
    # passed in the `tokenizer` slot — presumably so that it is saved together
    # with the model; confirm this is intended.
    tokenizer=feature_extractor,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [29]:
# Start fine-tuning.
# NOTE(review): the recorded run below stopped with cuDNN library load errors
# and "RuntimeError: GET was unable to find an engine to execute this
# computation", so this saved notebook does not show a completed training run.
trainer.train()

  0%|          | 0/14541 [00:00<?, ?it/s]Could not load library libcudnn_cnn_train.so.8. Error: /usr/local/cuda-11.8/lib64/libcudnn_cnn_train.so.8: undefined symbol: _ZN10cask_cudnn19HardwareInformationC1ERKNS_7SmModelEiff, version libcudnn_cnn_infer.so.8
Could not load library libcudnn_cnn_train.so.8. Error: /usr/local/cuda-11.8/lib64/libcudnn_cnn_train.so.8: undefined symbol: _ZN10cask_cudnn19HardwareInformationC1ERKNS_7SmModelEiff, version libcudnn_cnn_infer.so.8
Could not load library libcudnn_cnn_train.so.8. Error: /usr/local/cuda-11.8/lib64/libcudnn_cnn_train.so.8: undefined symbol: _ZN10cask_cudnn19HardwareInformationC1ERKNS_7SmModelEiff, version libcudnn_cnn_infer.so.8
Could not load library libcudnn_cnn_train.so.8. Error: /usr/local/cuda-11.8/lib64/libcudnn_cnn_train.so.8: undefined symbol: _ZN10cask_cudnn19HardwareInformationC1ERKNS_7SmModelEiff, version libcudnn_cnn_infer.so.8
Could not load library libcudnn_cnn_train.so.8. Error: /usr/local/cuda-11.8/lib64/libcudnn_cnn_trai

RuntimeError: GET was unable to find an engine to execute this computation

In [None]:
# Save the trainer's model to the same directory that is used as the training
# output_dir.
trainer.save_model("./image-captioning-output")

In [None]:
# Save the text tokenizer next to the model; the trainer was given the image
# feature extractor in its `tokenizer` slot, so the tokenizer is written here
# explicitly.
tokenizer.save_pretrained("./image-captioning-output")

('./image-captioning-output\\tokenizer_config.json',
 './image-captioning-output\\special_tokens_map.json',
 './image-captioning-output\\vocab.json',
 './image-captioning-output\\merges.txt',
 './image-captioning-output\\added_tokens.json',
 './image-captioning-output\\tokenizer.json')

In [None]:
from transformers import pipeline
# Build an image-to-text pipeline from the directory the model and tokenizer
# were saved to above.
image_captioner = pipeline("image-to-text", model="./image-captioning-output")





