In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import json
import os
from transformers import AutoImageProcessor, AutoTokenizer, VisionEncoderDecoderModel
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
import matplotlib.pyplot as plt

json_file_path = '/content/drive/My Drive/RAI2/capgen_v1.0_train.json'

with open(json_file_path, 'r', encoding='utf-8') as f:
    annotations = json.load(f)

file_path_travel = '/content/drive/My Drive/RAI2/train/train/travel/'
file_path_food = '/content/drive/My Drive/RAI2/train/train/food/'

model_path = 'Natthaphon/thaicapgen-swin-phayathai'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
feature_extractor = AutoImageProcessor.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = VisionEncoderDecoderModel.from_pretrained(model_path).to(device)

image_paths = []
caption_dict = {}

for image_path, captions_list in annotations.items():
    image_path_corrected = image_path.replace('ipu24/', '')
    print(f"Original path: {image_path}, Corrected path: {image_path_corrected}")

    if image_path_corrected.startswith('train/travel'):
        image_path_corrected = image_path_corrected.replace('train/travel/', '')
        full_path = os.path.join(file_path_travel, image_path_corrected)
        if os.path.exists(full_path):
            image_paths.append(full_path)
            caption_dict[full_path] = captions_list
        else:
            print(f"Warning: File {full_path} not found.")
    elif image_path_corrected.startswith('train/food'):
        image_path_corrected = image_path_corrected.replace('train/food/', '')
        full_path = os.path.join(file_path_food, image_path_corrected)
        if os.path.exists(full_path):
            image_paths.append(full_path)
            caption_dict[full_path] = captions_list
        else:
            print(f"Warning: File {full_path} not found.")

print(f"Number of image paths: {len(image_paths)}")
print(f"Number of captions in dict: {len(caption_dict)}")

if len(image_paths) == 0 or len(caption_dict) == 0:
    raise ValueError("No valid image paths or captions found.")

class ImageTextRecognitionDataset(Dataset):
    def __init__(self, image_paths, caption_dict, feature_extractor, tokenizer, image_size=(256, 256)):
        self.image_paths = image_paths
        self.caption_dict = caption_dict
        self.feature_extractor = feature_extractor
        self.tokenizer = tokenizer
        self.image_size = image_size


        self.image_caption_pairs = []
        for image_path in self.image_paths:
            captions_list = self.caption_dict.get(image_path, [])
            print(f"Caption_list : {image_path}")
            print(captions_list[0])
            if captions_list:
                self.image_caption_pairs.append((image_path, captions_list[0]))

    def __len__(self):
        return len(self.image_caption_pairs)

    def __getitem__(self, idx):
        image_path, caption = self.image_caption_pairs[idx]

        image = Image.open(image_path).convert("RGB")

        inputs = self.feature_extractor(images=image, return_tensors="pt")
        pixel_values = inputs['pixel_values'].squeeze(0)

        labels = self.tokenizer(caption, padding="max_length", truncation=True, max_length=50, return_tensors="pt").input_ids.squeeze(0)

        return {"pixel_values": pixel_values, "labels": labels}



dataset = ImageTextRecognitionDataset(image_paths, caption_dict, feature_extractor, tokenizer)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True, num_workers=4)

for batch in dataloader:
    print(batch['pixel_values'].shape)
    print(batch['labels'].shape)
    break


You are using a model of type clip-encoder-decoder to instantiate a model of type vision-encoder-decoder. This is not supported for all configurations of models and can yield errors.
Config of the encoder: <class 'transformers.models.swinv2.modeling_swinv2.Swinv2Model'> is overwritten by shared encoder config: Swinv2Config {
  "_name_or_path": "microsoft/swinv2-large-patch4-window12to16-192to256-22kto1k-ft",
  "architectures": [
    "Swinv2ForImageClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "depths": [
    2,
    2,
    18,
    2
  ],
  "drop_path_rate": 0.1,
  "embed_dim": 192,
  "encoder_stride": 32,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1536,
  "id2label": {
    "0": "tench, Tinca tinca",
    "1": "goldfish, Carassius auratus",
    "2": "great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias",
    "3": "tiger shark, Galeocerdo cuvieri",
    "4": "hammerhead, hammerhead shark",
    "5": "electric ray, cra

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Caption_list : /content/drive/My Drive/RAI2/train/train/food/25505.jpg
แคปหมูทอดกรอบหลายชิ้นวางเรียงซ้อนกันอยู่ในจานสีขาว
Caption_list : /content/drive/My Drive/RAI2/train/train/food/25506.jpg
ถั่วหลายชนิดจำนวนมากบรรจุอยู่ในถุงมีสติกเกอร์เขียนว่า กระยาสารท
Caption_list : /content/drive/My Drive/RAI2/train/train/food/25507.jpg
ขนม มันสำปะหลังกับแป้งมีสองชิ้นมีมะพร้าวอยู่ข้างๆใส่อยู่ในกล่อง
Caption_list : /content/drive/My Drive/RAI2/train/train/food/25508.jpg
เนื้อหั่นเป็นชิ้นๆสีน้ำตาลโรยด้วยพริกไทยดำใส่อยู่ในจาน
Caption_list : /content/drive/My Drive/RAI2/train/train/food/25509.jpg
ถาดสีเทามีไข่ผัดอยู่ด้านบนและมีหน่อไม้สีขาวตั้งอยู่
Caption_list : /content/drive/My Drive/RAI2/train/train/food/25510.jpg
ขนมข้าวแต๋นตั้งอยู่ในถุงพลาสติกมีป้ายสีน้ำตาลและสีเหลือง
Caption_list : /content/drive/My Drive/RAI2/train/train/food/25511.jpg
เนื้อหอยผัดเครื่องแกงมีสีเหลืองผัดกับใบมะกรูดใส่อยู่ในจาน
Caption_list : /content/drive/My Driv

In [None]:
for batch in dataloader:
    pixel_values = batch['pixel_values']
    labels = batch['labels']

    first_label_ids = labels[0]
    first_label_caption = tokenizer.decode(first_label_ids, skip_special_tokens=True)
    print("First label caption:", first_label_caption)
    print("First label ids:", first_label_ids)
    break


First label caption: บริเวณกําแพงแห่งหนึ่ง มีผนังเป็นสีเหลือง มีลวดลายวาดอยู่บนผนัง
First label ids: tensor([   5,   10,  385, 3246, 2583,  471, 5577,   17, 4780,  471, 7529, 3005,
        2472, 5577,    6,    6,    6,    6,    6,    6,    6,    6,    6,    6,
           6,    6,    6,    6,    6,    6,    6,    6,    6,    6,    6,    6,
           6,    6,    6,    6,    6,    6,    6,    6,    6,    6,    6,    6,
           6,    6])


In [None]:
model.save_pretrained('/content/drive/MyDrive/RAI2/thaicapgen_model_hf')
feature_extractor.save_pretrained('/content/drive/MyDrive/RAI2/thaicapgen_feature_extractor')
tokenizer.save_pretrained('/content/drive/MyDrive/RAI2/thaicapgen_tokenizer')


('/content/drive/MyDrive/RAI2/thaicapgen_tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/RAI2/thaicapgen_tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/RAI2/thaicapgen_tokenizer/sentencepiece.bpe.model',
 '/content/drive/MyDrive/RAI2/thaicapgen_tokenizer/added_tokens.json',
 '/content/drive/MyDrive/RAI2/thaicapgen_tokenizer/tokenizer.json')

In [None]:
import torch
from torch.optim import AdamW
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from transformers import get_scheduler
from torch import nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for param in model.parameters():
    param.requires_grad = False


for param in model.decoder.parameters():
    param.requires_grad = True

model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 1
num_training_steps = len(dataloader)
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

def train():
    model.train()
    total_loss = 0

    for batch_idx, batch in enumerate(tqdm(dataloader)):
        pixel_values = batch["pixel_values"].to(device)
        labels = batch["labels"].to(device)


        outputs = model(pixel_values=pixel_values, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()


        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

    avg_train_loss = total_loss / len(dataloader)
    print(f"Average Training Loss for epoch {epoch}: {avg_train_loss}")


for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train()


Epoch 1/2


100%|██████████| 1751/1751 [1:35:56<00:00,  3.29s/it]


Average Training Loss for epoch 0: 0.6483955625838242
Epoch 2/2


100%|██████████| 1751/1751 [38:43<00:00,  1.33s/it]

Average Training Loss for epoch 1: 0.5585546250992812





In [None]:
for param in model.parameters():
    param.requires_grad = False

for param in model.decoder.parameters():
    param.requires_grad = False

for param in model.encoder.parameters():
    param.requires_grad = True



def train():
    model.train()
    total_loss = 0

    for batch_idx, batch in enumerate(tqdm(dataloader)):
        pixel_values = batch["pixel_values"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(pixel_values=pixel_values, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

    avg_train_loss = total_loss / len(dataloader)
    print(f"Average Training Loss for epoch {epoch}: {avg_train_loss}")


for epoch in range(1):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train()

Epoch 1/2


  0%|          | 0/1751 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 46.12 MiB is free. Process 13432 has 14.69 GiB memory in use. Of the allocated memory 14.38 GiB is allocated by PyTorch, and 188.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
for batch in dataloader:
    print("Pixel values shape:", batch['pixel_values'].shape)

    print("Labels shape:", batch['labels'].shape)

    first_label = batch['labels'][0]
    print("First label tensor:", first_label)

    caption = tokenizer.decode(first_label, skip_special_tokens=True)
    print("First label caption:", caption)

    break


Pixel values shape: torch.Size([16, 3, 256, 256])
Labels shape: torch.Size([16, 50])
First label tensor: tensor([    5,    10,   495,   547, 18038,  5524,  2472, 16875,  7235, 14480,
         3487,    67,    23,  4499,    17, 13530,     6,     6,     6,     6,
            6,     6,     6,     6,     6,     6,     6,     6,     6,     6,
            6,     6,     6,     6,     6,     6,     6,     6,     6,     6,
            6,     6,     6,     6,     6,     6,     6,     6,     6,     6])
First label caption: รูปเรือหลากสีแปะอยู่บนไม้ไผ่สีน้ําตาลที่วางเรียงกันมีหลังคาเป็นใบไม้


In [None]:
from transformers import AutoTokenizer, AutoImageProcessor
from PIL import Image

def predict(image_path):
    image = Image.open(image_path).convert("RGB")

    inputs = feature_extractor(images=image, return_tensors="pt")
    pixel_values = inputs["pixel_values"].to(device)

    output = model.generate(pixel_values)

    caption = tokenizer.decode(output[0], skip_special_tokens=True)

    return caption

image_path = "/content/drive/My Drive/RAI2/test/test/"
predicted_caption = predict(image_path)
print(f"Predicted Caption: {predicted_caption}")


Predicted Caption: พระพุทธรูปสีทองตั้งอยู่ข้างพระพุทธรูปสีทององค์เล็ก


In [None]:
import os
import pandas as pd
from PIL import Image
from transformers import AutoTokenizer, AutoImageProcessor

def predict(image_path):
    image = Image.open(image_path).convert("RGB")

    inputs = feature_extractor(images=image, return_tensors="pt")
    pixel_values = inputs["pixel_values"].to(device)

    output = model.generate(pixel_values)

    caption = tokenizer.decode(output[0], skip_special_tokens=True)

    return caption

folder_path = "/content/drive/My Drive/RAI2/test/test/"

results = []

for i in range(2000):
    filename = f"{i:05d}.jpg"
    image_path = os.path.join(folder_path, filename)
    print(i)
    if os.path.exists(image_path):
        predicted_caption = predict(image_path)

        results.append({"id": i, "caption": predicted_caption})
    else:
        print(f"Warning: File {image_path} not found.")

df = pd.DataFrame(results)

csv_file_path = '/content/drive/MyDrive/RAI2/predicted_captions.csv'
df.to_csv(csv_file_path, index=False)

print(f"Results saved to {csv_file_path}")


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [None]:
import pandas as pd

csv_file_path = '/content/drive/MyDrive/RAI2/predicted_captions.csv'
df = pd.read_csv(csv_file_path)

df['caption'].fillna('ไม่มี', inplace=True)

df['caption'] = df['caption'].str.replace(" ", "", regex=False)

updated_csv_file_path = '/content/drive/MyDrive/RAI2/predicted_captions_no_spaces.csv'
df.to_csv(updated_csv_file_path, index=False)

print(f"Updated results saved to {updated_csv_file_path}")


Updated results saved to /content/drive/MyDrive/RAI2/predicted_captions_no_spaces.csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['caption'].fillna('ไม่มี', inplace=True)


In [None]:
import pandas as pd

sample_submission_path = '/content/drive/MyDrive/RAI2/sample_submission.csv'
df_submission = pd.read_csv(sample_submission_path)

predicted_captions_path = '/content/drive/MyDrive/RAI2/predicted_captions_no_spaces.csv'
df_predictions = pd.read_csv(predicted_captions_path)

print(df_submission.head())
print(df_predictions.head())

caption_dict = dict(zip(df_predictions['id'], df_predictions['caption']))

df_submission['caption'] = df_submission['image_id'].map(caption_dict)

df_submission['caption'].fillna('ไม่มี', inplace=True)

sorted_submission_path = '/content/drive/MyDrive/RAI2/final_sample_submission.csv'
df_submission.to_csv(sorted_submission_path, index=False)

print(f"Updated submission saved to {sorted_submission_path}")


   image_id                                            caption
0      1354  ภาพถ่ายระยะใกล้ของวัตถุทรงกลมสีขาวที่มีคราบสีด...
1      1413  นกตัวหนึ่งที่กําลังเกาะอยู่บนกิ่งไม้อันหนึ่งที...
2      1802  บ้านที่อยู่ติดกับบริเวณริมชายหาดทะเลและมีต้นไม...
3      1243                                                NaN
4       693                                                NaN
   id                                            caption
0   0  วัวสีน้ําตาลตัวหนึ่งกําลังยืนอยู่ข้างถังน้ําสีฟ้า
1   1  ถ้วยสีขาวมีน้ําพริกกะปิใส่อยู่ในถ้วยด้านข้างมี...
2   2  ไม้สีน้ําตาลตั้งอยู่บนพื้นสีน้ําตาลด้านข้างมีต...
3   3  วัวตัวสีน้ําตาลมีเขาสีดํากําลังกินหญ้าอยู่ข้าง...
4   4  ขนมที่เป็นรูปหมีสีน้ําตาลและสีขาวตั้งอยู่ในถ้ว...
Updated submission saved to /content/drive/MyDrive/RAI2/final_sample_submission.csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_submission['caption'].fillna('ไม่มี', inplace=True)


In [None]:
import pandas as pd
from PIL import Image
from transformers import AutoTokenizer, AutoImageProcessor

def predict(image_path):
    image = Image.open(image_path).convert("RGB")

    inputs = feature_extractor(images=image, return_tensors="pt")
    pixel_values = inputs["pixel_values"].to(device)

    output = model.generate(pixel_values)

    caption = tokenizer.decode(output[0], skip_special_tokens=True)

    return caption

sample_submission_path = '/content/drive/MyDrive/RAI2/final_sample_submission.csv'
df_submission = pd.read_csv(sample_submission_path)

print("Before Update:")
print(df_submission[df_submission['caption'] == 'ไม่มี'])

missing_caption_ids = df_submission[df_submission['caption'] == 'ไม่มี']['image_id']

predictions = {}

for img_id in missing_caption_ids:
    image_path = f"/content/drive/My Drive/RAI2/test/test/{img_id:05d}.jpg"

    if os.path.exists(image_path):
        caption = predict(image_path)
        predictions[img_id] = caption
    else:
        print(f"Warning: File {image_path} not found.")

df_submission.loc[df_submission['caption'] == 'ไม่มี', 'caption'] = df_submission['image_id'].map(predictions)

print("After Update:")
print(df_submission[df_submission['caption'] == 'ไม่มี'])

updated_submission_path = '/content/drive/MyDrive/RAI2/final_sample_submission.csv'
df_submission.to_csv(updated_submission_path, index=False)

print(f"Updated submission saved to {updated_submission_path}")


Before Update:
      image_id caption
24         966   ไม่มี
26         197   ไม่มี
38         867   ไม่มี
273        170   ไม่มี
322       1690   ไม่มี
532        770   ไม่มี
886       1149   ไม่มี
1020       616   ไม่มี
1067      1403   ไม่มี
1078       448   ไม่มี
1079       123   ไม่มี
1136      1441   ไม่มี
1156      1023   ไม่มี
1293       998   ไม่มี
1415       106   ไม่มี
1447       358   ไม่มี
1455         5   ไม่มี
1495       205   ไม่มี
1816        14   ไม่มี
1915       602   ไม่มี
After Update:
Empty DataFrame
Columns: [image_id, caption]
Index: []
Updated submission saved to /content/drive/MyDrive/RAI2/final_sample_submission.csv
