In [1]:
import pandas as pd
from PIL import Image
from evaluate import load
import glob, os, uuid, json, torch
from datasets import load_dataset, Dataset
from transformers import TrainingArguments, Trainer
from transformers import AutoProcessor, AutoModelForCausalLM, pipeline

In [2]:
print("TORCH VERSION:", torch.__version__)
print("CUDA AVAILABLE:", torch.cuda.is_available())

TORCH VERSION: 1.13.1+cpu
CUDA AVAILABLE: False


In [3]:
def create_Dataset(data_dir = 'data/captioning-v2'):
    all_files = glob.glob(f'{data_dir}/*/*.*')
    image_files = [f for f in all_files if not f.endswith('.json')]
    image_files = [file.replace('\\', '/') for file in image_files]
    json_files = [f.split('.')[0] + '.json' for f in image_files]

    for image_file, json_file in zip(image_files, json_files):
        if os.path.exists(image_file) and os.path.exists(json_file):
            if image_file.endswith('.jpg') or image_file.endswith('.png') or image_file.endswith('.jpeg') or image_file.endswith('.JPG') or image_file.endswith('.PNG') or image_file.endswith('.JPEG'):
                file_dir, _ = os.path.split(image_file)
                rand_name = str(uuid.uuid4())
                new_image_file = os.path.join(file_dir, rand_name + '.jpg')
                new_json_file = os.path.join(file_dir, rand_name + '.json')

                os.rename(image_file, new_image_file)
                os.rename(json_file, new_json_file)
                
# create_Dataset()

def create_hf_datase(data_dir = 'data/captioning-v2'):
    all_files = glob.glob(f'{data_dir}/*/*.*')
    image_files = [f for f in all_files if not f.endswith('.json')]
    image_files = [file.replace('\\', '/') for file in image_files]
    json_files = [f.split('.')[0] + '.json' for f in image_files]


    dataset = {}
    dataset['image'] = []
    dataset['text'] = []
    for image_file, json_file in zip(image_files, json_files):
        if os.path.exists(image_file) and os.path.exists(json_file):
            with open(json_file, 'r') as f:
                json_data = json.load(f)

            caption = json_data['shapes'][0]['label']
            image = Image.open(image_file)
            image = image.convert('RGB')

            dataset['image'].append(image)
            dataset['text'].append(caption)

    dataset = Dataset.from_dict(dataset)
    return dataset

In [4]:
dataset = create_hf_datase()
dataset

Dataset({
    features: ['image', 'text'],
    num_rows: 335
})

In [5]:
checkpoint = "microsoft/git-base"
processor = AutoProcessor.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

In [7]:
def transforms(example_batch):
    images = [x for x in example_batch["image"]]
    captions = [x for x in example_batch["text"]]
    inputs = processor(images=images, text=captions, padding="max_length")
    inputs.update({"labels": inputs["input_ids"]})
    return inputs

dataset.set_transform(transforms)

In [8]:
wer = load("wer")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predicted = logits.argmax(-1)
    decoded_labels = processor.batch_decode(labels, skip_special_tokens=True)
    decoded_predictions = processor.batch_decode(predicted, skip_special_tokens=True)
    wer_score = wer.compute(predictions=decoded_predictions, references=decoded_labels)
    return {"wer_score": wer_score}

In [11]:
training_args = TrainingArguments(
                                output_dir="models/sinhala-book-captioning-repo",
                                learning_rate=5e-5,
                                num_train_epochs=15,
                                fp16=True if torch.cuda.is_available() else False,
                                per_device_train_batch_size=1,
                                per_device_eval_batch_size=1,
                                gradient_accumulation_steps=2,
                                save_total_limit=3,
                                evaluation_strategy="steps",
                                eval_steps=50,
                                save_strategy="steps",
                                save_steps=50,
                                logging_steps=10,
                                remove_unused_columns=False,
                                push_to_hub=True,
                                label_names=["labels"],
                                load_best_model_at_end=True,
                                )
trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=dataset,
                eval_dataset=dataset.train_test_split(test_size=0.2)["test"],
                compute_metrics=compute_metrics,
                )

trainer.train()

  0%|          | 0/2505 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [12]:
trainer.push_to_hub("sinhala-book-captioning")

pytorch_model.bin:   0%|          | 0.00/707M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.09k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

'https://huggingface.co/thirosh0520/sinhala-book-captioning-repo/tree/main/'

# Evaluation

In [13]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained("models/sinhala-book-captioning-repo", use_auth_token=True)
processor = AutoProcessor.from_pretrained("microsoft/git-base", use_auth_token=True)
model.to(device)

print("MODEL LOADED")



MODEL LOADED


In [14]:
def preprocess_inference_image(image):
    if isinstance(image, str):
        image = Image.open(image)
        image = image.convert('RGB')
    inputs = processor(images=image, return_tensors="pt").to(device)
    generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=5)
    generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return generated_caption

In [15]:
predictions = [preprocess_inference_image(image) for image in dataset["image"]]
truth = [caption for caption in dataset["text"]]

df_results = pd.DataFrame({"predictions": predictions, "truth": truth})
df_results.to_csv("results.csv", index=False)

KeyError: 'text'

# Inference Org

In [16]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained("models/sinhala-book-captioning-repo", use_auth_token=True)
processor = AutoProcessor.from_pretrained("microsoft/git-base", use_auth_token=True)
model.to(device)

image_path = 'data/captioning-v2/herons and fish/2d99c093-3383-49d1-980e-bac67301e367.jpg'
image = Image.open(image_path)
image = image.convert('RGB')

inputs = processor(images=image, return_tensors="pt").to(device)
generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=5)
generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

# visualize the image using PIL
display(image)
# print the caption generated by the model
print("Generated caption:", generated_caption)

FileNotFoundError: [Errno 2] No such file or directory: 'data/captioning-v2/herons and fish/2d99c093-3383-49d1-980e-bac67301e367.jpg'