# OCR

In [1]:
import gc
import warnings
import torch

warnings.filterwarnings("ignore", category=FutureWarning)

gc.collect()
torch.cuda.empty_cache()

# T5 Tokenizer 적용
- 적용할지 안할지 결정 

In [3]:
import pandas as pd
import ast
import torch
from transformers import MT5Tokenizer, MT5ForConditionalGeneration
from tqdm import tqdm

# mT5 모델과 토크나이저 불러오기
model_name = "google/mt5-small"
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

# GPU 사용 가능 시 GPU 사용
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 텍스트 교정 함수
def correct_text(text):
    input_text = "correct: " + text
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True, padding="max_length").to(device)
    
    outputs = model.generate(**inputs, max_length=512, num_beams=4, early_stopping=True)
    corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return corrected_text

# CSV 파일 읽기
df = pd.read_csv("./image_texts.csv")

# 교정된 텍스트를 저장할 새로운 컬럼 추가
df['corrected_texts'] = ''

# 텍스트 교정
for index, row in tqdm(df.iterrows(), total=len(df), desc="Correcting texts"):
    try:
        texts = ast.literal_eval(row['texts'])  # 문자열을 리스트로 변환
        corrected_texts = [correct_text(text) for text in texts]
        df.at[index, 'corrected_texts'] = str(corrected_texts)  # 리스트를 문자열로 저장
    except Exception as e:
        print(f"Error correcting text for index {index}: {e}")

# 교정된 텍스트 출력
print(df[['id', 'image', 'corrected_texts']])

# 교정된 결과를 CSV 파일로 저장
df.to_csv("corrected_train_texts.csv", index=False, encoding='utf-8-sig')

Correcting texts: 100%|██████████| 1570/1570 [36:54<00:00,  1.41s/it] 

        id                 image  \
0        0  5eb8d197d228609e.jpg   
1        1  716c2fced083a6a6.jpg   
2        2  37f9414beea68229.jpg   
3        3  c8ef1b2fdb8dbace.jpg   
4        4  4606a9ccbc65f3e5.jpg   
...    ...                   ...   
1565  1565  d9e230e42838eb4f.jpg   
1566  1566  713c484c86197d5b.jpg   
1567  1567  9dcfef27d51c3e5e.jpg   
1568  1568  43965cc70d7d14e0.jpg   
1569  1569  b1dad2db5c2de2da.jpg   

                                        corrected_texts  
0                      ['<extra_id_0>', '<extra_id_0>']  
1     ['<extra_id_0>', '<extra_id_0>', '<extra_id_0>...  
2     ['<extra_id_0>', '<extra_id_0>', '<extra_id_0>...  
3     ['<extra_id_0>', '<extra_id_0>', '<extra_id_0>...  
4     ['<extra_id_0>', '<extra_id_0>', '<extra_id_0>...  
...                                                 ...  
1565  ['<extra_id_0>', '<extra_id_0>', '<extra_id_0>...  
1566                   ['<extra_id_0>', '<extra_id_0>']  
1567  ['<extra_id_0>', '<extra_id_0>', '<extr




# OCR + Swin T

In [4]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from transformers import AutoTokenizer, AutoModel
from timm import create_model
import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import easyocr
from PIL import Image
from textblob import TextBlob
from functools import lru_cache
from tqdm import tqdm
import gc
import os

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

gc.collect()
torch.cuda.empty_cache()

# os.environ['TESSDATA_PREFIX'] = '../'
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'  # CUDA_LAUNCH_BLOCKING 설정


In [5]:
# CSV 파일에서 텍스트 데이터를 로드하는 함수
def load_text_from_csv(csv_path):
    df = pd.read_csv(csv_path)
    return {row['image']: ' '.join(eval(row['texts'])) for _, row in df.iterrows()}

# 멀티모달 데이터셋 클래스
class MultimodalDataset(Dataset):
    def __init__(self, df, image_dir, csv_text_data, transform=None, tokenizer=None, max_len=512):
        self.df = df
        self.image_dir = image_dir
        self.transform = transform
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.csv_text_data = csv_text_data

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_name = self.df.iloc[idx, 0]
        img_path = f"{self.image_dir}/{img_name}"
        
        try:
            image = Image.open(img_path).convert('L')
            
            if self.transform:
                image = self.transform(image)
            
            text = self.csv_text_data.get(img_name, "")
            
            encoding = self.tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                max_length=self.max_len,
                return_token_type_ids=False,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt',
            )
            
            return {
                'image': image,
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'labels': torch.tensor(self.df.iloc[idx, 1], dtype=torch.long) if 'target' in self.df.columns else torch.tensor(0),
                'image_id': img_name,
                'extracted_text': text
            }
        except Exception as e:
            print(f"Error processing item {idx}: {e}")
            # 오류 발생 시 더미 데이터 반환
            return {
                'image': torch.zeros((1, 224, 224)),
                'input_ids': torch.zeros(self.max_len, dtype=torch.long),
                'attention_mask': torch.zeros(self.max_len, dtype=torch.long),
                'labels': torch.tensor(0, dtype=torch.long),
                'image_id': img_name,
                'extracted_text': ""
            }

# 멀티모달 모델 클래스
class MultimodalModel(nn.Module):
    def __init__(self, num_classes):
        super(MultimodalModel, self).__init__()
        self.swin_b = create_model('swin_base_patch4_window7_224', pretrained=True, num_classes=0, in_chans=1)
        self.bert = AutoModel.from_pretrained('klue/bert-base')
        
        self.image_proj = nn.Linear(self.swin_b.num_features, 512)
        self.text_proj = nn.Linear(self.bert.config.hidden_size, 512)
        self.attention = nn.MultiheadAttention(embed_dim=512, num_heads=8)
        self.fc = nn.Linear(1024, num_classes)
        self.dropout = nn.Dropout(0.1)
                        
    def to(self, device):
        super().to(device)
        self.swin_b = self.swin_b.to(device)
        self.bert = self.bert.to(device)
        return self
        
    def forward(self, image, input_ids, attention_mask):
        image_features = self.swin_b(image)
        image_features = self.image_proj(image_features)

        text_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_features = text_output.last_hidden_state[:, 0, :]
        text_features = self.text_proj(text_features)
        
        text_length = attention_mask.sum(dim=1).float() / attention_mask.shape[1]
        text_weight = text_length.unsqueeze(1)
        
        weighted_text_features = text_features * text_weight
        
        attended_features, _ = self.attention(image_features.unsqueeze(0), 
                                              weighted_text_features.unsqueeze(0), 
                                              weighted_text_features.unsqueeze(0))
        attended_features = attended_features.squeeze(0)
        
        combined_features = torch.cat((image_features, attended_features), dim=1)
        combined_features = self.dropout(combined_features)
        
        output = self.fc(combined_features)
        return output

# 학습 함수
def train_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for batch in tqdm(loader, desc="Training"):
        optimizer.zero_grad(set_to_none=True)  # 메모리 사용량 최적화
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)
        images = batch['image'].to(device, non_blocking=True)
        labels = batch['labels'].to(device, non_blocking=True)
        
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            outputs = model(images, input_ids, attention_mask)
            loss = criterion(outputs, labels)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    return total_loss / len(loader)

# 평가 함수
def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss = 0
    all_predictions = []
    all_labels = []
    
    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device, non_blocking=True)
            attention_mask = batch['attention_mask'].to(device, non_blocking=True)
            images = batch['image'].to(device, non_blocking=True)
            labels = batch['labels'].to(device, non_blocking=True)
            
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                outputs = model(images, input_ids, attention_mask)
                loss = criterion(outputs, labels)
            
            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    avg_loss = total_loss / len(loader)
    f1 = f1_score(all_labels, all_predictions, average='weighted')
    
    return avg_loss, f1

class EarlyStopping:
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func
        self.best_f1 = -np.Inf

    def __call__(self, val_loss, f1_score, model):
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, f1_score, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, f1_score, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, f1_score, model):
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). '
                            f'F1 score: {f1_score:.6f}. Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss
        self.best_f1 = f1_score

In [7]:
if __name__ == "__main__":
    #torch.multiprocessing.set_start_method('spawn')
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # 데이터 경로 설정
    data_path = '../data/'
    
    # PaddleOCR로 추출한 텍스트 데이터 로드
    csv_path = './image_texts.csv'
    csv_text_data = load_text_from_csv(csv_path)

    # 데이터 로드 및 분할
    df = pd.read_csv(f"{data_path}/train_correct_labeling.csv")
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['target'])
    test_df = pd.read_csv(f"{data_path}/sample_submission.csv")

    # 토크나이저 및 변환 준비
    tokenizer = AutoTokenizer.from_pretrained('klue/bert-base')
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485], std=[0.229])  # 그레이스케일 이미지에 맞는 값 사용
    ])

    # 데이터셋 및 데이터로더 준비
    train_dataset = MultimodalDataset(train_df, f"{data_path}/train_preprocessed", csv_text_data, transform, tokenizer)
    val_dataset = MultimodalDataset(val_df, f"{data_path}/train_preprocessed", csv_text_data, transform, tokenizer)
    test_dataset = MultimodalDataset(test_df, f"{data_path}/test_preprocessed", csv_text_data, transform, tokenizer)

    BATCH_SIZE = 32
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, persistent_workers=True, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, persistent_workers=True, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, persistent_workers=True, pin_memory=True)

    # 모델, 손실 함수, 옵티마이저 준비
    num_classes = len(df['target'].unique())
    model = MultimodalModel(num_classes).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=30)
    
    # 조기 종료 설정
    early_stopping = EarlyStopping(patience=7, verbose=True, delta=0.001, path='best_model.pth')

    num_epochs = 50
    best_f1 = 0
    for epoch in range(num_epochs):
        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        val_loss, val_f1 = evaluate(model, val_loader, criterion, device)
        
        scheduler.step()  # 학습률 조정
        
        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"Train Loss: {train_loss:.4f}")
        print(f"Val Loss: {val_loss:.4f}, Val F1 Score: {val_f1:.4f}")
        
        # 조기 종료 체크 (validation 에러 기준)
        early_stopping(val_loss, val_f1, model)
        if early_stopping.early_stop:
            print(f"Early stopping. Best validation loss: {early_stopping.val_loss_min:.6f}, "
                  f"Best F1 score: {early_stopping.best_f1:.6f}")
            break
        
        # 최고의 F1 스코어 업데이트 (별도로 추적)
        if val_f1 > best_f1:
            best_f1 = val_f1
            print(f"New best F1 score: {best_f1:.4f}")

    # 모델 저장
    model.load_state_dict(torch.load("best_model.pth"))
    model.eval()

    print("Training completed.")

Using device: cuda


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
Training: 100%|██████████| 40/40 [00:15<00:00,  2.54it/s]
Evaluating: 100%|██████████| 10/10 [00:01<00:00,  7.26it/s]


Epoch 1/50
Train Loss: 2.0785
Val Loss: 0.9926, Val F1 Score: 0.6583
Validation loss decreased (inf --> 0.992551). F1 score: 0.658349. Saving model ...
New best F1 score: 0.6583


Training: 100%|██████████| 40/40 [00:15<00:00,  2.64it/s]
Evaluating: 100%|██████████| 10/10 [00:01<00:00,  8.15it/s]


Epoch 2/50
Train Loss: 0.6815
Val Loss: 0.3326, Val F1 Score: 0.8652
Validation loss decreased (0.992551 --> 0.332634). F1 score: 0.865195. Saving model ...
New best F1 score: 0.8652


Training: 100%|██████████| 40/40 [00:15<00:00,  2.64it/s]
Evaluating: 100%|██████████| 10/10 [00:01<00:00,  8.18it/s]


Epoch 3/50
Train Loss: 0.3374
Val Loss: 0.2425, Val F1 Score: 0.9067
Validation loss decreased (0.332634 --> 0.242477). F1 score: 0.906682. Saving model ...
New best F1 score: 0.9067


Training: 100%|██████████| 40/40 [00:15<00:00,  2.63it/s]
Evaluating: 100%|██████████| 10/10 [00:01<00:00,  8.09it/s]


Epoch 4/50
Train Loss: 0.2082
Val Loss: 0.2088, Val F1 Score: 0.9091
Validation loss decreased (0.242477 --> 0.208772). F1 score: 0.909058. Saving model ...
New best F1 score: 0.9091


Training: 100%|██████████| 40/40 [00:15<00:00,  2.62it/s]
Evaluating: 100%|██████████| 10/10 [00:01<00:00,  8.20it/s]


Epoch 5/50
Train Loss: 0.1403
Val Loss: 0.1681, Val F1 Score: 0.9244
Validation loss decreased (0.208772 --> 0.168056). F1 score: 0.924386. Saving model ...
New best F1 score: 0.9244


Training: 100%|██████████| 40/40 [00:15<00:00,  2.63it/s]
Evaluating: 100%|██████████| 10/10 [00:01<00:00,  8.12it/s]


Epoch 6/50
Train Loss: 0.0982
Val Loss: 0.1830, Val F1 Score: 0.9185
EarlyStopping counter: 1 out of 7


Training: 100%|██████████| 40/40 [00:15<00:00,  2.62it/s]
Evaluating: 100%|██████████| 10/10 [00:01<00:00,  8.15it/s]


Epoch 7/50
Train Loss: 0.0708
Val Loss: 0.1287, Val F1 Score: 0.9427
Validation loss decreased (0.168056 --> 0.128667). F1 score: 0.942688. Saving model ...
New best F1 score: 0.9427


Training: 100%|██████████| 40/40 [00:15<00:00,  2.63it/s]
Evaluating: 100%|██████████| 10/10 [00:01<00:00,  8.16it/s]


Epoch 8/50
Train Loss: 0.0458
Val Loss: 0.1461, Val F1 Score: 0.9408
EarlyStopping counter: 1 out of 7


Training: 100%|██████████| 40/40 [00:15<00:00,  2.63it/s]
Evaluating: 100%|██████████| 10/10 [00:01<00:00,  8.07it/s]


Epoch 9/50
Train Loss: 0.0250
Val Loss: 0.1553, Val F1 Score: 0.9355
EarlyStopping counter: 2 out of 7


Training: 100%|██████████| 40/40 [00:15<00:00,  2.63it/s]
Evaluating: 100%|██████████| 10/10 [00:01<00:00,  7.87it/s]


Epoch 10/50
Train Loss: 0.0182
Val Loss: 0.1593, Val F1 Score: 0.9435
EarlyStopping counter: 3 out of 7
New best F1 score: 0.9435


Training: 100%|██████████| 40/40 [00:15<00:00,  2.63it/s]
Evaluating: 100%|██████████| 10/10 [00:01<00:00,  7.84it/s]


Epoch 11/50
Train Loss: 0.0140
Val Loss: 0.1450, Val F1 Score: 0.9450
EarlyStopping counter: 4 out of 7
New best F1 score: 0.9450


Training: 100%|██████████| 40/40 [00:15<00:00,  2.63it/s]
Evaluating: 100%|██████████| 10/10 [00:01<00:00,  7.81it/s]


Epoch 12/50
Train Loss: 0.0114
Val Loss: 0.1523, Val F1 Score: 0.9394
EarlyStopping counter: 5 out of 7


Training: 100%|██████████| 40/40 [00:15<00:00,  2.63it/s]
Evaluating: 100%|██████████| 10/10 [00:01<00:00,  8.09it/s]


Epoch 13/50
Train Loss: 0.0097
Val Loss: 0.1537, Val F1 Score: 0.9568
EarlyStopping counter: 6 out of 7
New best F1 score: 0.9568


Training: 100%|██████████| 40/40 [00:15<00:00,  2.63it/s]
Evaluating: 100%|██████████| 10/10 [00:01<00:00,  8.11it/s]


Epoch 14/50
Train Loss: 0.0088
Val Loss: 0.1605, Val F1 Score: 0.9351
EarlyStopping counter: 7 out of 7
Early stopping. Best validation loss: 0.128667, Best F1 score: 0.942688
Training completed.


In [37]:
# PaddleOCR로 추출한 테스트 데이터의 텍스트 데이터 로드
csv_path = './test_texts.csv'
csv_text_data = load_text_from_csv(csv_path)

# 테스트 데이터 로드
test_df = pd.read_csv(f"{data_path}/sample_submission.csv")

# 토크나이저 및 변환 준비
tokenizer = AutoTokenizer.from_pretrained('klue/bert-base')
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485], std=[0.229])  # 그레이스케일 이미지에 맞는 값 사용
])

# 테스트 데이터셋 및 데이터로더 준비
test_dataset = MultimodalDataset(test_df, f"{data_path}/test_preprocessed", csv_text_data, transform, tokenizer)

BATCH_SIZE = 32
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, persistent_workers=True, pin_memory=True)

# 모델 준비
num_classes = len(pd.read_csv(f"{data_path}/train_correct_labeling.csv")['target'].unique())
model = MultimodalModel(num_classes).to(device)

# 저장된 모델 불러오기
model.load_state_dict(torch.load("best_model.pth"))
model.eval()

print("Model loaded. Starting test data prediction...")

# 테스트 데이터 추론
test_predictions = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting test data"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        images = batch['image'].to(device)
        
        outputs = model(images, input_ids, attention_mask)
        _, predicted = torch.max(outputs, 1)
        test_predictions.extend(predicted.cpu().numpy())

# 결과 저장
submission_df = pd.DataFrame({'ID': test_df['ID'], 'target': test_predictions})
submission_df.to_csv("multimodal_pred.csv", index=False)
print("Test predictions saved to multimodal_pred.csv")

  model.load_state_dict(torch.load("best_model.pth"))
100%|██████████| 197/197 [01:48<00:00,  1.82it/s]

Prediction completed and saved to pred.csv



