In [1]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json

def preprocess_coco_captions(tokenizer, captions_json_path):
    with open(captions_json_path, "r") as f:
        data = json.load(f)

    input_captions = []
    target_captions = []

    for annotation in data["annotations"]:
        image_id = annotation["image_id"]
        caption = annotation["caption"]

        # Prepend the special token [CLS] for input captions
        input_caption = f"[CLS] {caption}"
        target_caption = f"{caption} [SEP]"

        input_captions.append({"image_id": image_id, "caption": input_caption})
        target_captions.append({"image_id": image_id, "caption": target_caption})

    # Tokenize the captions
    input_captions = tokenizer(
        [item["caption"] for item in input_captions],
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=64,
    )

    target_captions = tokenizer(
        [item["caption"] for item in target_captions],
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=64,
    )

    return input_captions, target_captions


In [3]:
import json
from transformers import AutoTokenizer

def process_json(json_data):
    # Parse the JSON data
    annotations = json.loads(json_data)["annotations"]
    
    # Extract input and target captions
    processed_annotations = [
        {
            "id": annotation["id"],
            "image_id": annotation["image_id"],
            "caption": annotation["caption"]
        }
        for annotation in annotations
    ]

    return processed_annotations

def preprocess_coco_captions(tokenizer, json_path):
    # Read the JSON file
    with open(json_path, 'r') as file:
        json_data = json.load(file)

    # Extract captions from the JSON data
    annotations = process_json(json.dumps(json_data))

    # Extract input and target captions
    input_captions = [f"{annotation['id']} {annotation['image_id']}" for annotation in annotations]
    target_captions = [annotation['caption'] for annotation in annotations]

    # Add a new pad token to the tokenizer
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    # Tokenize input and target captions
    input_tokenized = tokenizer(input_captions, return_tensors="pt", padding=True, truncation=True, max_length=64)
    target_tokenized = tokenizer(target_captions, return_tensors="pt", padding=True, truncation=True, max_length=64)

    return input_tokenized, target_tokenized

# Example usage:
# Assuming you have a tokenizer initialized, replace 'your_tokenizer_name' with the actual tokenizer name.
# tokenizer = AutoTokenizer.from_pretrained('your_tokenizer_name')
# input_captions, target_captions = 


In [4]:

input_captions, target_captions = preprocess_coco_captions(tokenizer, "C:/Users/IDAC PC/Desktop/UtkuThesis/ImageCaptioning/datasets/coco2017/annotations/captions_train2017.json")

In [18]:
target_captions

{'input_ids': tensor([[   32, 17026, 30069,  ..., 50257, 50257, 50257],
        [   32,  2119,   351,  ..., 50257, 50257, 50257],
        [   32,  1097,   326,  ..., 50257, 50257, 50257],
        ...,
        [ 7571,  1466,  1650,  ..., 50257, 50257, 50257],
        [12256, 23648,   351,  ..., 50257, 50257, 50257],
        [   32,  8073,  7480,  ..., 50257, 50257, 50257]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [5]:
from transformers import GPT2LMHeadModel, GPT2Config, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

config = GPT2Config.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2", config=config)
train_dataset = TextDataset(input_captions, target_captions, tokenizer)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="output",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

trainer.train()



TypeError: stat: path should be string, bytes, os.PathLike or integer, not BatchEncoding

In [6]:
import torch
from PIL import Image
from yolov5.utils.general import non_max_suppression

def detect_objects(image_path, model, device="cuda"):
    model.eval()
    img = Image.open(image_path)
    img_tensor = torch.from_numpy(img).unsqueeze(0).to(device)

    with torch.no_grad():
        pred = model(img_tensor)
        pred = non_max_suppression(pred)[0]

    detected_objects = [{"class": int(obj[5]), "conf": obj[4].item()} for obj in pred]
    return detected_objects

ImportError: cannot import name 'TryExcept' from 'utils' (unknown location)

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

def generate_caption(detected_objects, model, tokenizer, device="cuda"):
    input_text = " ".join([f"[{obj['class']}]({obj['conf']:.2f})" for obj in detected_objects])
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

    with torch.no_grad():
        output = model.generate(input_ids, max_length=50, num_return_sequences=1)

    caption = tokenizer.decode(output[0], skip_special_tokens=True)
    return caption

In [None]:
model1.load_weights