In [1]:
# List the attached Kaggle dataset folder to confirm which CSV files and sub-folders are available.
!ls '/kaggle/input/shopee-product-images'

data_products_id_small.csv  data_products_id_tiny.csv  images


In [2]:
!pip install -q transformers==4.28.0 datasets gdown accelerate rouge_score

[0m

In [3]:
import pandas as pd
import urllib.request
import io
import os
import sys
import requests
import PIL
import datasets


from PIL import Image
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, TrainingArguments, Trainer
from datasets import Dataset, load_dataset, load_from_disk, DatasetDict, load_metric
from transformers import TFVisionEncoderDecoderModel, AutoImageProcessor, AutoTokenizer, VisionEncoderDecoderModel
# from tensorflow.keras.optimizers import Adam
from tqdm import tqdm



In [4]:
# Dataset locations: the CSV metadata is read from the dataset root, the
# product images from the images/download sub-folder below it.
text_root_folder = '/kaggle/input/shopee-product-images/'
img_root_folder = text_root_folder + 'images/download'

In [5]:
# Load the product metadata and show how many rows each main category has.
products = pd.read_csv(os.path.join(text_root_folder, 'data_products_id_small.csv'))
print(products['main_category'].value_counts().to_string())

# Keep a single category. The explicit .copy() turns the filtered selection
# into an independent frame, so adding columns to it later does not raise
# pandas' SettingWithCopyWarning.
products = products.loc[products['main_category'] == 'Pakaian Pria'].copy()

Olahraga & Outdoor           13050
Elektronik                   11510
Perawatan & Kecantikan       10479
Pakaian Wanita                9333
Perlengkapan Rumah            8932
Aksesoris Fashion             8912
Pakaian Pria                  7888
Ibu & Bayi                    7474
Komputer & Aksesoris          7365
Kesehatan                     6969
Otomotif                      6843
Hobi & Koleksi                6592
Buku & Alat Tulis             6512
Handphone & Aksesoris         5983
Fashion Bayi & Anak           5940
Tas Wanita                    5926
Sepatu Wanita                 5911
Fashion Muslim                5500
Makanan & Minuman             5498
Sepatu Pria                   4500
Tas Pria                      4189
Jam Tangan                    2491
Souvenir & Party Supplies     2477
Fotografi                     1931


In [None]:
def generate_img_path(product_id, image, main_category, sub_category):
    """Return the path of a product image below ``img_root_folder``.

    The path has the form
    ``<img_root_folder>/<main_category>/<sub_category>/<image>_tn-<product_id>.jpeg``.
    """
    file_name = "{}_tn-{}.jpeg".format(image, product_id)
    return "{}/{}/{}/{}".format(img_root_folder, main_category, sub_category, file_name)

# Attach the image path to every product, then keep only the columns that are
# still needed once the path has been built.
products['image_path'] = products.apply(lambda x: generate_img_path(x.product_id, x.image, x.main_category, x.sub_category), axis=1)
products = products.drop(['product_id', 'image', 'shop_name', 'shopid', 'main_category', 'sub_category'], axis=1)

# Check every image once. dtype=bool keeps the mask usable even for an empty frame.
image_found = pd.Series(
    [os.path.exists(path) for path in tqdm(products['image_path'], desc='Check Images')],
    index=products.index,
    dtype=bool,
)

# Report which files are missing (not just that something is missing) and drop
# those rows: a missing file would otherwise only surface as a crash when the
# dataset tries to open it during training.
missing_paths = products.loc[~image_found, 'image_path']
print(f"Missing images: {len(missing_paths)} of {len(products)}")
for path in missing_paths.head(10):
    print("Not Found:", path)
products = products.loc[image_found]

In [None]:
from sklearn.model_selection import train_test_split

# First hold out 20% of the products as the test set, then take 10% of the
# remainder as the validation set; both splits use the same fixed seed.
train_val_df, test_df = train_test_split(products, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.1, random_state=42)

# Renumber every split from 0 and discard the old index.
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [None]:
import torch
from torch.utils.data import Dataset
from PIL import Image

class CaptioningDataset(Dataset):
    """Map-style dataset pairing product images with their tokenized titles.

    Args:
        df: DataFrame with an ``image_path`` column (path of the image file)
            and a ``name`` column (the title used as caption).
        tokenizer: Callable returning an object with ``input_ids``; must
            expose ``pad_token_id``.
        img_processor: Callable that, given an image and
            ``return_tensors="pt"``, returns an object with ``pixel_values``.
        max_target_length: Length every title is padded/truncated to.
    """

    def __init__(self, df, tokenizer, img_processor, max_target_length=100):
        self.df = df
        self.tokenizer = tokenizer
        self.img_processor = img_processor
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{"pixel_values", "labels"}`` for the row at position ``idx``."""
        # Positional access: works for any index the frame carries, not only
        # for a freshly reset 0..n-1 index.
        img_path = self.df['image_path'].iloc[idx]
        title = self.df['name'].iloc[idx].lower()

        # The context manager closes the file handle right away; convert()
        # returns an independent in-memory image.
        with Image.open(img_path) as img_file:
            image = img_file.convert("RGB")
        pixel_values = self.img_processor(image, return_tensors="pt").pixel_values

        encoded_title = self.tokenizer(title, padding="max_length", max_length=self.max_target_length, truncation=True)

        # Padding positions become -100 so they can be told apart from real tokens.
        pad_id = self.tokenizer.pad_token_id
        labels = [token if token != pad_id else -100 for token in encoded_title.input_ids]

        # Drop only the leading batch dimension added by the processor; a bare
        # squeeze() would also remove any other dimension of size 1.
        return {"pixel_values": pixel_values.squeeze(0), "labels": torch.tensor(labels)}

In [None]:
# def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
#     outputs = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
#     return outputs

# GPT2Tokenizer.build_inputs_with_special_tokens = build_inputs_with_special_tokens

In [None]:
# Names of the pretrained checkpoints used for the image encoder and the text decoder.
encoder_pretrained = 'google/vit-base-patch16-224'
decoder_pretrained = 'indolem/indobert-base-uncased'

# Preprocessing objects that belong to those checkpoints: the image processor
# comes from the encoder checkpoint, the tokenizer from the decoder checkpoint.
image_processor = AutoImageProcessor.from_pretrained(encoder_pretrained)
tokenizer = AutoTokenizer.from_pretrained(decoder_pretrained)

# GPT-2 decoder variant (disabled): reuse the unknown token as padding token.
# tokenizer.pad_token = tokenizer.unk_token

# BERT decoder variant: no tokenizer adjustment is made here.

In [None]:
# Wrap each split in a CaptioningDataset that shares the same tokenizer and
# image processor (order: train, validation, test).
train_dataset, eval_dataset, test_dataset = (
    CaptioningDataset(df=split_df, tokenizer=tokenizer, img_processor=image_processor)
    for split_df in (train_df, val_df, test_df)
)

In [None]:
# Sanity check of the datasets: sizes first, then one encoded training example.
print("Number of training examples:", len(train_dataset))
print("Number of validation examples:", len(eval_dataset))

# Show the shape of every tensor in the first training example.
encoding = train_dataset[0]
for k,v in encoding.items():
  print(k, v.shape)

labels = encoding['labels']
print(labels)

# Put the pad token back where the dataset wrote -100 so the ids can be
# decoded into text again (this edits the `labels` tensor in place).
labels[labels == -100] = tokenizer.pad_token_id
label_str = tokenizer.decode(labels, skip_special_tokens=True)
print('Decoded Label:', label_str)

In [None]:
# Display the first training image together with its original (non-lowercased) title.
image = Image.open(train_df['image_path'][0]).convert("RGB")
print('Label: '+train_df['name'][0])
# Last expression of the cell: rendered as the cell output.
image

In [None]:
# ROUGE metric object, loaded once at module level through datasets.load_metric.
rouge = load_metric("rouge")

def compute_metrics(pred):
    """Compute ROUGE-2 scores between generated and reference titles.

    Args:
        pred: Object exposing ``predictions`` and ``label_ids`` as integer
            arrays of token ids, where ``-100`` marks padding.

    Returns:
        Dict with ``rouge2_precision``, ``rouge2_recall`` and
        ``rouge2_fmeasure``, each rounded to 4 decimals.
    """
    pad_id = tokenizer.pad_token_id

    # Work on copies so the arrays owned by the caller are left untouched.
    pred_ids = pred.predictions.copy()
    labels_ids = pred.label_ids.copy()

    # -100 is not a valid token id: map it back to the pad token before
    # decoding, for the predictions as well as for the labels.
    pred_ids[pred_ids == -100] = pad_id
    labels_ids[labels_ids == -100] = pad_id

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

In [None]:
# Combine the pretrained image encoder and text decoder into one captioning model.
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(encoder_pretrained, decoder_pretrained)

# GPT-2 decoder variant (disabled)
# model.config.decoder_start_token_id = tokenizer.bos_token_id
# model.config.pad_token_id = tokenizer.pad_token_id
# model.config.vocab_size = model.config.decoder.vocab_size
# model.config.eos_token_id = tokenizer.eos_token_id

# BERT decoder variant: [CLS] starts a sequence and [SEP] ends it.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size
# Without an end-of-sequence id generation cannot stop at [SEP]: every caption
# would run to max_length and `early_stopping` below would have no effect.
model.config.eos_token_id = tokenizer.sep_token_id

# Generation settings (beam search)
model.config.max_length = 100
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4

In [None]:
from transformers import default_data_collator, EarlyStoppingCallback
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Early-stopping callback configured with a patience of 3.
early_stop = EarlyStoppingCallback(early_stopping_patience=3)

training_args = Seq2SeqTrainingArguments(
    # Output location and checkpoint retention
    output_dir=".",
    overwrite_output_dir=True,
    save_total_limit=1,
    report_to="none",
    # Training budget and batch sizes
    num_train_epochs=100,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    fp16=True,
    # Evaluate, log and save once per epoch
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    # Evaluation generates text; the best checkpoint is chosen by ROUGE-2 F-measure
    predict_with_generate=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge2_fmeasure",
)

# The image processor is handed over through the `tokenizer` argument.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=default_data_collator,
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
    callbacks=[early_stop],
)

trainer.train()

In [None]:
# The model is trained on the 'Pakaian Pria' rows only, so the export directory
# is named after that category. The tokenizer and the image processor are
# stored next to the weights so the directory can be reloaded on its own.
model_dir = 'pakaian-pria'
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)
image_processor.save_pretrained(model_dir)

In [None]:
# Run the trainer's prediction loop on the held-out test split; as the last
# expression of the cell, the returned object is shown as the cell output.
trainer.predict(test_dataset=test_dataset)

In [None]:
import random

# Pick one random test example and load its image and title.
p_id = random.randrange(0, len(test_df))
image = Image.open(test_df['image_path'][p_id]).convert("RGB")
title = test_df['name'][p_id]

# Move the inputs to whatever device the model is on instead of assuming CUDA,
# so the cell also works when the model sits on the CPU.
pixel_values = image_processor(image, return_tensors="pt").pixel_values.to(model.device)
labels = tokenizer(title, return_tensors="pt").input_ids.to(model.device)

print(title)
# Last expression of the cell: rendered as the cell output.
image

In [None]:
# Generate three candidate captions for the sampled image and decode them to text.
generated_ids = model.generate(pixel_values, num_return_sequences=3)
generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

# Print each caption followed by an empty line.
for caption in generated_text:
    print(caption, end="\n\n")