In [42]:
from transformers import LayoutLMv3Config, LayoutLMv3Model

# Initializing a LayoutLMv3 microsoft/layoutlmv3-base style configuration
configuration = LayoutLMv3Config()

# Initializing a model (with random weights) from the microsoft/layoutlmv3-base style configuration
model = LayoutLMv3Model(configuration)

# Accessing the model configuration
configuration = model.config

In [1]:
import os
import time
import random

import torch
import pandas as pd
import numpy as np
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score

from dotenv import load_dotenv
from datetime import datetime
from zoneinfo import ZoneInfo
import wandb

from transformers import LayoutLMv3ForSequenceClassification, LayoutLMv3Processor
import pytesseract

import matplotlib.pyplot as plt
from PIL import Image

### Tesseract OCR 설치
**Windows**

1. Tesseract 설치 파일 다운로드.
2. 설치 파일을 실행하여 Tesseract를 설치.
3. 설치 경로를 환경 변수에 추가 (예: C:\Program Files\Tesseract-OCR).

4. 링크 : https://github.com/UB-Mannheim/tesseract/wiki

In [2]:
# %pip install tesseract

In [3]:
#%pip install pytesseract

In [4]:
# wandb 연동
load_dotenv()
api_key = os.getenv('WANDB_API_KEY')

wandb.login(key=api_key)

train_time = datetime.fromtimestamp(time.time(), tz=ZoneInfo("Asia/Seoul")).strftime("%Y%m%d-%H%M%S")
wandb.init(project="competition2-cv", name=f"layoutlmv3-{train_time}")

print(train_time)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: alvlalvl92 (alvlalvl). Use `wandb login --relogin` to force relogin


20240803-170004


In [5]:
# 시드를 고정합니다.
SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.benchmark = True

In [6]:
# 데이터셋 클래스를 정의합니다.
class ImageDataset(Dataset):
    def __init__(self, csv, path, processor, transform=None): 
        self.df = pd.read_csv(csv).values
        self.path = path
        self.processor = processor
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        name, target = self.df[idx]
        image = Image.open(os.path.join(self.path, name)).convert("RGB")
        if self.transform:
            image = self.transform(image=image)["image"]

        encoded_inputs = self.processor(image, return_tensors="pt", padding="max_length", truncation=True)
        input_ids = encoded_inputs["input_ids"].squeeze()
        attention_mask = encoded_inputs["attention_mask"].squeeze()
        bbox = encoded_inputs["bbox"].squeeze()

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "bbox": bbox,
            "labels": torch.tensor(target, dtype=torch.long)
        }


In [7]:
# one epoch 학습을 위한 함수입니다.
def train_one_epoch(loader, model, optimizer, loss_fn, device):
    model.train()
    train_loss = 0
    preds_list = []
    targets_list = []

    pbar = tqdm(loader)
    for batch in pbar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        bbox = batch["bbox"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, bbox=bbox, labels=labels)
        loss = outputs.loss
        preds = outputs.logits

        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        preds_list.extend(preds.argmax(dim=1).detach().cpu().numpy())
        targets_list.extend(labels.detach().cpu().numpy())

        pbar.set_description(f"Loss: {loss.item():.4f}")

    train_loss /= len(loader)
    train_acc = accuracy_score(targets_list, preds_list)
    train_f1 = f1_score(targets_list, preds_list, average='macro')

    ret = {
        "train_loss": train_loss,
        "train_acc": train_acc,
        "train_f1": train_f1,
    }

    wandb.log(ret)
    
    return ret

In [8]:
# device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# data config
data_path = '../data/'

# model config
model_name = 'microsoft/layoutlmv3-base'

# training config
LR = 1e-3
EPOCHS = 1
BATCH_SIZE = 4  # LayoutLMv3는 메모리 사용량이 많으므로 작은 배치 크기를 사용
num_workers = 0

wandb.config.update({
    "learning_rate": LR,
    "architecture": model_name,
    "dataset": "custom-dataset",
    "epochs": EPOCHS,
    "batch_size": BATCH_SIZE,
})


In [9]:
device

device(type='cuda')

In [10]:
# processor 정의
processor = LayoutLMv3Processor.from_pretrained(model_name)

In [11]:
# Dataset 정의
trn_dataset = ImageDataset(
    f"{data_path}train.csv",
    f"{data_path}train/",
    processor=processor
)

tst_dataset = ImageDataset(
    f"{data_path}sample_submission.csv",
    f"{data_path}test/",
    processor=processor
)

print(len(trn_dataset), len(tst_dataset))

1570 3140


In [12]:
# DataLoader 정의
trn_loader = DataLoader(
    trn_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=True,
    drop_last=False
)
tst_loader = DataLoader(
    tst_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,
    pin_memory=True
)


In [13]:
# model 및 optimizer 정의
model = LayoutLMv3ForSequenceClassification.from_pretrained(model_name, num_labels=17).to(device)
optimizer = Adam(model.parameters(), lr=LR)

Some weights of LayoutLMv3ForSequenceClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# 학습 및 평가 루프
for epoch in range(EPOCHS):
    ret = train_one_epoch(trn_loader, model, optimizer, loss_fn=None, device=device)
    ret['epoch'] = epoch

    log = ""
    for k, v in ret.items():
        log += f"{k}: {v+1:.4f}\n"
    print(log)

Loss: 2.9840: 100%|██████████| 393/393 [09:55<00:00,  1.52s/it]


train_loss: 3.9642
train_acc: 1.0764
train_f1: 1.0660
epoch: 1.0000



In [13]:
preds_list = []

model.eval()
for batch in tqdm(tst_loader):
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    bbox = batch["bbox"].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, bbox=bbox)
        preds = outputs.logits

    preds_list.extend(preds.argmax(dim=1).detach().cpu().numpy())

pred_df = pd.DataFrame(tst_dataset.df, columns=['ID', 'target'])
pred_df['target'] = preds_list

sample_submission_df = pd.read_csv(f"{data_path}sample_submission.csv")
assert (sample_submission_df['ID'] == pred_df['ID']).all()

pred_df.to_csv("pred.csv", index=False)

pred_df.head()

# wandb 실행 종료
wandb.finish()


100%|██████████| 785/785 [12:09<00:00,  1.08it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_acc,▁▄▆▇█
train_f1,▁▄▅▇█
train_loss,█▅▃▂▁

0,1
train_acc,0.54395
train_f1,0.5169
train_loss,1.42295


In [15]:
# 예측 결과와 이미지 경로를 포함한 데이터프레임 생성
# pred_df에 'image_path' 열이 있어야 한다고 가정합니다.
pred_df['image_path'] = pred_df['ID'].apply(lambda x: f"{data_path}/images/{x}.jpg")  # 이미지 경로를 수정하세요

# 예측된 클래스와 함께 이미지 시각화
def show_image_with_prediction(image_path, prediction):
    image = Image.open(image_path)
    plt.figure(figsize=(10, 10))
    plt.imshow(image)
    plt.title(f'Predicted Class: {prediction}')
    plt.axis('off')
    plt.show()

# 데이터프레임에서 몇 개의 이미지를 선택하여 시각화
for i in range(min(5, len(pred_df))):  # 예를 들어, 상위 5개 이미지를 표시
    row = pred_df.iloc[i]
    show_image_with_prediction(row['image_path'], row['target'])


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\ej_ja\\OneDrive\\Desktop\\ai_lab\\aistages_2_CV\\upstage-cv-classification-cv11\\data\\images\\0008fdb22ddce0ce.jpg.jpg'