### AI 허브에서 차량 번호판 데이터를 받아와서 구현
### Training데이터에서 원천데이터와 라벨링데이터를 다운로드 받아서 사용하기

In [1]:
!pip install torch torchvision torchaudio matplotlib Pillow scikit-learn paddlepaddle paddleocr

Collecting torchaudio
  Downloading torchaudio-2.4.1-cp310-cp310-win_amd64.whl.metadata (6.5 kB)
INFO: pip is looking at multiple versions of torchaudio to determine which version is compatible with other requirements. This could take a while.
  Downloading torchaudio-2.4.0-cp310-cp310-win_amd64.whl.metadata (6.4 kB)
  Downloading torchaudio-2.3.1-cp310-cp310-win_amd64.whl.metadata (6.4 kB)
Downloading torchaudio-2.3.1-cp310-cp310-win_amd64.whl (2.4 MB)
   ---------------------------------------- 0.0/2.4 MB ? eta -:--:--
   ----------------- ---------------------- 1.0/2.4 MB 4.6 MB/s eta 0:00:01
   ---------------------------------------  2.4/2.4 MB 5.2 MB/s eta 0:00:01
   ---------------------------------------- 2.4/2.4 MB 5.0 MB/s eta 0:00:00
Installing collected packages: torchaudio
Successfully installed torchaudio-2.3.1


In [2]:
import os
import json
import torch
import random
import matplotlib.pyplot as plt
import torch.nn as nn
import torchvision.transforms as transforms
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from paddleocr import PaddleOCR, draw_ocr

In [3]:
# PaddleOCR 초기화
ocr = PaddleOCR(use_angle_cls=True, lang='korean')

[2024/09/23 21:22:53] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\jjm98/.paddleocr/whl\\det\\ml\\Multilingual_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\jjm98/.paddleocr/whl\\rec\\korean\\korean_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 32

In [6]:
img_dir = 'C:/Users/jjm98/OneDrive/바탕 화면/Task/train'
label_dir = 'C:/Users/jjm98/OneDrive/바탕 화면/Task/valid'

In [7]:
imgs = [f for f in os.listdir(img_dir) if f.endswith('.jpg')]
labels = [f for f in os.listdir(label_dir) if f.endswith('.json')]

In [8]:
train_imgs, val_test_imgs, train_labels, val_test_labels = train_test_split(imgs, labels, test_size=0.2, random_state=2024)
val_imgs, test_imgs, val_labels, test_labels = train_test_split(val_test_imgs, val_test_labels, test_size=0.5, random_state=2024)

In [9]:
print(f"Train set: {len(train_imgs)} imgs")
print(f"Validation set: {len(val_imgs)} imgs")
print(f"Test set: {len(test_imgs)} imgs")

Train set: 58876 imgs
Validation set: 7360 imgs
Test set: 7360 imgs


In [10]:
def owt(img_path):
    img = Image.open(img_path)
    text = ''.join([line[1][0] for line in result[0]])
    return text.strip()

In [11]:
class NPD(Dataset):
    def __init__(self, img_dir, label_dir, img_files, label_files, transform=None):
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.img_files = img_files
        self.label_files = label_files
        self.transform = transform

    def __len__(self):
        return len(self.img_files)

    def __getitem__(self, idx):
        img_name = self.img_files[idx]
        label_name = self.label_files[idx]

        img_path = os.path.join(self.img_dir, img_name)
        label_path = os.path.join(self.label_dir, label_name)

        img = Image.open(img_path).convert('L')
        with open(label_path, 'r', encoding='utf-8') as f:
            label_data = json.load(f)
            label = label_data['value']

        if self.transform:
            img = self.transform(img)

        return img, label

transform = transforms.Compose([
    transforms.Resize((32, 100)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

In [12]:
train_dataset = NPD(img_dir, label_dir, train_imgs, train_labels, transform=transform)
val_dataset = NPD(img_dir, label_dir, val_imgs, val_labels, transform=transform)
test_dataset = NPD(img_dir, label_dir, test_imgs, test_labels, transform=transform)

In [13]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [14]:
class CRNN(nn.Module):
    def __init__(self, imgH, nc, nclass, nh):
        super(CRNN, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(nc, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, (2, 1)),
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, (2, 1)),
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, (2, 1)),
            nn.MaxPool2d(2, 1)
        )
        self.rnn = nn.LSTM(256, nh, bidirectional=True)
        self.fc = nn.Linear(nh * 2, nclass)

    def forward(self, x):
        conv = self.cnn(x)
        b, c, h, w = conv.size()
        assert h == 1, "the height of conv must be 1"
        conv = conv.squeeze(2)
        conv = conv.permute(2, 0, 1)
        output, _ = self.rnn(conv)
        output = self.fc(output)
        return output

In [15]:
nc = 1
nh = 256
nclass = len("0123456789가나다라마바사아자차카타파하") + 1

In [16]:
model = CRNN(imgH=32, nc=nc, nclass=nclass, nh=nh)

In [17]:
# 가중치 초기화 추가
def weights_init(m):
    if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
    if isinstance(m, nn.LSTM):
        for name, param in m.named_parameters():
            if 'weight' in name:
                torch.nn.init.xavier_uniform_(param)
model.apply(weights_init)

# def weights_init(m):
#     if isinstance(m, nn.Conv2d):
#         nn.init.kaiming_normal_(m.weight)
#         if m.bias is not None:
#             nn.init.constant_(m.bias, 0)
#     elif isinstance(m, nn.Linear):
#         nn.init.xavier_normal_(m.weight)
#         if m.bias is not None:
#             nn.init.constant_(m.bias, 0)

CRNN(
  (cnn): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): MaxPool2d(kernel_size=2, stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
    (9): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (10): ReLU()
    (11): MaxPool2d(kernel_size=2, stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
    (12): MaxPool2d(kernel_size=2, stride=1, padding=0, dilation=1, ceil_mode=False)
  )
  (rnn): LSTM(256, 256, bidirectional=True)
  (fc): Linear(in_features=512, out_features=25, bias=True)
)

In [18]:
model.apply(weights_init)

CRNN(
  (cnn): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): MaxPool2d(kernel_size=2, stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
    (9): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (10): ReLU()
    (11): MaxPool2d(kernel_size=2, stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
    (12): MaxPool2d(kernel_size=2, stride=1, padding=0, dilation=1, ceil_mode=False)
  )
  (rnn): LSTM(256, 256, bidirectional=True)
  (fc): Linear(in_features=512, out_features=25, bias=True)
)

In [19]:
criterion = nn.CTCLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)

In [20]:
epochs = 1
for epoch in range(epochs):
    model.train()
    for imgs, labels in train_loader:
        if torch.isnan(imgs).any() or torch.isinf(imgs).any():
            print(f"NaN or Inf detected in batch! Skipping this batch.")
            continue  # 문제가 있는 배치는 건너뜀
        
        optimizer.zero_grad()
        outputs = model(imgs)

        input_lengths = torch.full(size=(outputs.size(1),), fill_value=outputs.size(0), dtype=torch.long)
        target_lengths = torch.tensor([len(label) for label in labels])

        targets = torch.tensor([ord(char) - ord('0') for label in labels for char in label])

        loss = criterion(outputs, targets, input_lengths, target_lengths)
        loss.backward()

        # Gradient Clipping 추가
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
        optimizer.step()

    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

Epoch 1/1, Loss: nan


In [21]:
torch.save(model.state_dict(), 'crnn_model.pth')

In [22]:
model.load_state_dict(torch.load('crnn_model.pth'))
model.eval()

CRNN(
  (cnn): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): MaxPool2d(kernel_size=2, stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
    (9): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (10): ReLU()
    (11): MaxPool2d(kernel_size=2, stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
    (12): MaxPool2d(kernel_size=2, stride=1, padding=0, dilation=1, ceil_mode=False)
  )
  (rnn): LSTM(256, 256, bidirectional=True)
  (fc): Linear(in_features=512, out_features=25, bias=True)
)

In [23]:
def vp(img, label, prediction):
    plt.imshow(img.squeeze(), cmap='gray')
    plt.title(f'True: {label}, Predicted: {prediction}')
    plt.show()

In [None]:
for imgs, labels in test_loader:
    
    with torch.no_grad():
        outputs = model(imgs)
        predictions = []
        for output in outputs:
            prediction = ""
            for char_tensor in output:
                # 각 char_tensor에서 가장 큰 값의 인덱스를 사용
                char_value = char_tensor.argmax(dim=-1).item()
                prediction += chr(char_value)
            predictions.append(prediction)
        
        for i in range(len(imgs)):
            vp(imgs[i], labels[i], predictions[i])