In [None]:
# Cell 1: Setup & Environment Installation
!pip install opendatasets torch torchvision pandas matplotlib scikit-learn --quiet

import os
import glob
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import cv2
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

print("Setup Complete. Libraries are ready.")


In [None]:
# Cell 2: Data Acquisition & Preparation
import opendatasets as od

# Download the CAPTCHA dataset from Kaggle.
dataset_url = 'https://www.kaggle.com/datasets/fournierp/captcha-version-2-images'
od.download(dataset_url)

DATA_DIR = './captcha-version-2-images/samples/'

# Build a DataFrame from the file names: the label of each image is its
# file name with the directory and the extension stripped.
image_files = glob.glob(os.path.join(DATA_DIR, '*.png'))
labels = [
    os.path.splitext(file_name)[0]
    for file_name in map(os.path.basename, image_files)
]

df = pd.DataFrame({'image_path': image_files, 'label': labels})

print(f"Found {len(df)} images.")
df.head()


In [None]:
# Cell 3: Parameters & Character Encoding

# --- Parameters ---
IMG_HEIGHT = 50
IMG_WIDTH = 200
MAX_LENGTH = 5  # maximum length of the text in a CAPTCHA
BATCH_SIZE = 64
EPOCHS = 20  # a real task may need more than this
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# --- Character Encoding ---
# Collect every distinct character that occurs in the labels.
all_chars = sorted(set("".join(df['label'])))
# Real characters get indices 1..N; index 0 is reserved for the
# 'blank' token required by CTC Loss.
char_to_int = dict(zip(all_chars, range(1, len(all_chars) + 1)))
char_to_int['-'] = 0  # CTC blank token
int_to_char = {index: char for char, index in char_to_int.items()}

NUM_CLASSES = len(int_to_char)

print(f"Using device: {DEVICE}")
print(f"Number of classes (including blank token): {NUM_CLASSES}")
print("Character set:", "".join(all_chars))


In [None]:
# Cell 4: Custom PyTorch Dataset for OCR

class CaptchaDataset(Dataset):
    """Dataset yielding (image, encoded label, label length) for CAPTCHA OCR.

    Args:
        df: DataFrame with an 'image_path' column and a 'label' column.
        char_map: Mapping from each label character to its integer class index.
        max_length: Stored on the instance; not otherwise used by this class.
    """

    def __init__(self, df, char_map, max_length):
        self.df = df
        self.char_map = char_map
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load and encode the sample at positional index ``idx``.

        Returns:
            A tuple ``(image, label_tensor, label_length)`` where ``image`` is
            a float32 tensor of shape (1, IMG_HEIGHT, IMG_WIDTH) scaled to
            [0, 1], ``label_tensor`` is a long tensor holding one class index
            per label character, and ``label_length`` is a 0-d long tensor
            with the number of characters in the label.

        Raises:
            FileNotFoundError: If OpenCV cannot read the image file.
        """
        row = self.df.iloc[idx]
        image_path = row['image_path']
        label = row['label']

        # --- Image Processing ---
        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        # cv2.imread signals a missing/unreadable file by returning None
        # instead of raising; fail here with the offending path rather than
        # with an opaque error inside cv2.resize.
        if image is None:
            raise FileNotFoundError(f"Could not read image: {image_path}")
        image = cv2.resize(image, (IMG_WIDTH, IMG_HEIGHT))
        image = image / 255.0  # Normalize to [0, 1]
        image = torch.tensor(image, dtype=torch.float32).unsqueeze(0)  # (1, H, W)

        # --- Label Encoding ---
        encoded_label = [self.char_map[char] for char in label]
        label_tensor = torch.tensor(encoded_label, dtype=torch.long)
        label_length = torch.tensor(len(label), dtype=torch.long)

        return image, label_tensor, label_length

# Split the data: 80% for training, 20% for validation (fixed seed).
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Wrap each split in a Dataset.
train_dataset = CaptchaDataset(
    df=train_df,
    char_map=char_to_int,
    max_length=MAX_LENGTH,
)
val_dataset = CaptchaDataset(
    df=val_df,
    char_map=char_to_int,
    max_length=MAX_LENGTH,
)

print("Datasets created.")


In [None]:
# Cell 5: The CRNN Model Architecture

class CRNN(nn.Module):
    """Convolutional-recurrent network producing per-timestep class scores for CTC.

    A CNN extracts a feature map, its height is collapsed to 1 so that every
    remaining column becomes one timestep, a 2-layer bidirectional LSTM models
    the column sequence, and a linear layer scores each timestep.

    Args:
        num_classes: Number of output classes, including the CTC blank.
    """

    def __init__(self, num_classes):
        super(CRNN, self).__init__()

        # --- Part 1: Convolutional Layers (CNN) ---
        # Shape comments below are for a (B, 1, 50, 200) input.
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # (50, 200) -> (25, 100)
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # (25, 100) -> (12, 50)
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 1)),  # halves H only: (12, 50) -> (6, 50)
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
        )

        # The CNN leaves a height of 6 (not 1) for a 50-pixel-high input, so
        # the height is averaged down to 1 explicitly; `None` keeps the width
        # (the sequence length) untouched. This layer has no parameters.
        self.pool_height = nn.AdaptiveAvgPool2d((1, None))

        # --- Part 2: Recurrent Layers (RNN) ---
        # Input per timestep is the 512-channel column left after pooling.
        self.rnn = nn.LSTM(
            input_size=512,  # feature size coming from the CNN
            hidden_size=256,
            num_layers=2,
            bidirectional=True,  # run the LSTM both forward and backward
            batch_first=True
        )

        # --- Part 3: Classifier ---
        self.classifier = nn.Linear(
            in_features=512,  # 256 (hidden) * 2 (bidirectional)
            out_features=num_classes
        )

    def forward(self, x):
        """Compute CTC-ready scores.

        Args:
            x: Image batch of shape (B, 1, H, W).

        Returns:
            Unnormalized scores of shape (SeqLen, B, num_classes), where
            SeqLen is the width of the CNN feature map (50 for W = 200).
        """
        # Pass through CNN
        features = self.cnn(x)  # -> (B, 512, H', W') e.g., (B, 512, 6, 50)

        # Collapse the height so each column is a single feature vector
        features = self.pool_height(features)  # -> (B, 512, 1, W')

        # Reshape for RNN
        features = features.squeeze(2)  # -> (B, C, W) e.g., (B, 512, 50)
        features = features.permute(0, 2, 1)  # -> (B, W, C) e.g., (B, 50, 512)

        # Pass through RNN
        rnn_output, _ = self.rnn(features)  # -> (B, SeqLen, 512)

        # Pass through Classifier
        output = self.classifier(rnn_output)  # -> (B, SeqLen, NumClasses)

        # Reshape for CTC Loss
        output = output.permute(1, 0, 2)  # -> (SeqLen, B, NumClasses) REQUIRED BY CTC

        return output



In [None]:
# Cell 6: Collate Function, DataLoaders, and Initialization

def collate_fn(batch):
    """Merge a list of (image, label, label_length) samples into batch tensors.

    Images and label lengths are stacked along a new leading dimension;
    labels are right-padded with 0 to the longest label in the batch.

    Returns:
        Tuple ``(images, labels, label_lengths)``.
    """
    image_seq, label_seq, length_seq = zip(*batch)
    padded_labels = nn.utils.rnn.pad_sequence(
        label_seq,
        batch_first=True,
        padding_value=0,
    )
    return (
        torch.stack(image_seq, 0),
        padded_labels,
        torch.stack(length_seq, 0),
    )

# Build the DataLoaders; only the training loader shuffles.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)

# --- Initialization ---
model = CRNN(num_classes=NUM_CLASSES).to(DEVICE)
# blank=0 is the index of the CTC blank token.
loss_fn = nn.CTCLoss(blank=0, zero_infinity=True)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

print("Model, Loss, and Optimizer are ready.")


In [None]:
# Cell 7: CTC Decode Function

def ctc_decode(preds, int_to_char_map):
    """Greedy CTC decoding of a batch of predictions.

    Args:
        preds: Scores of shape (SeqLen, Batch, NumClasses).
        int_to_char_map: Mapping from class index to character.

    Returns:
        One decoded string per batch element: the per-timestep argmax path
        with consecutive repeats merged and index 0 (blank) dropped.
    """
    # (SeqLen, Batch, NumClasses) -> (Batch, SeqLen) best class per timestep,
    # converted once to nested Python lists.
    best_paths = preds.permute(1, 0, 2).argmax(dim=2).tolist()

    decoded_texts = []
    for path in best_paths:
        chars = []
        previous = -1
        for current in path:
            # Keep a class only when it is not the blank and differs from
            # the class at the previous timestep.
            if current != 0 and current != previous:
                chars.append(int_to_char_map[current])
            previous = current
        decoded_texts.append("".join(chars))

    return decoded_texts



In [None]:
# Cell 8: The Training Loop

for epoch in range(EPOCHS):
    print(f"\n--- Epoch {epoch+1}/{EPOCHS} ---")

    # --- Training Phase ---
    model.train()
    train_loss = 0
    for images, labels, label_lengths in tqdm(train_loader, desc="Training"):
        images, labels = images.to(DEVICE), labels.to(DEVICE)

        optimizer.zero_grad()
        preds = model(images)  # -> (SeqLen, Batch, NumClasses)

        # CTC needs the input length of every batch element; the model
        # emits the same number of timesteps for all of them.
        pred_lengths = torch.full(
            (preds.size(1),),
            preds.size(0),
            dtype=torch.long,
            device=DEVICE,
        )

        log_probs = preds.log_softmax(2)
        loss = loss_fn(log_probs, labels, pred_lengths, label_lengths)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # --- Validation Phase ---
    model.eval()
    val_correct = 0
    with torch.no_grad():
        for images, labels, label_lengths in tqdm(val_loader, desc="Validating"):
            preds = model(images.to(DEVICE))
            decoded_preds = ctc_decode(preds, int_to_char)

            # CTC loss gives no accuracy directly, so the padded label
            # tensors are turned back into text (dropping the 0 padding)
            # and compared with the decoded predictions as whole strings.
            true_labels = [
                "".join(int_to_char[code.item()] for code in encoded if code != 0)
                for encoded in labels
            ]
            val_correct += sum(
                predicted == expected
                for predicted, expected in zip(decoded_preds, true_labels)
            )

    avg_train_loss = train_loss / len(train_loader)
    val_acc = val_correct / len(val_dataset)

    print(f"Train Loss: {avg_train_loss:.4f} | Validation Accuracy: {val_acc:.4f}")

print("\nFinished Training!")


In [None]:
# Cell 9: Visualize Predictions & Generate Submission

model.eval()

# Show 5 example images picked at random from the validation set.
for sample_no in range(5):
    idx = np.random.randint(len(val_dataset))
    image, label_tensor, _ = val_dataset[idx]
    true_label = "".join(int_to_char[code] for code in label_tensor.tolist())

    with torch.no_grad():
        pred = model(image.unsqueeze(0).to(DEVICE))
        decoded_pred = ctc_decode(pred, int_to_char)[0]

    plt.figure(figsize=(6, 2))
    plt.imshow(image.squeeze().cpu(), cmap='gray')
    plt.title(f"True: {true_label} | Pred: {decoded_pred}")
    plt.axis('off')
    plt.show()

# --- Build the submission file (simulated using val_df) ---
all_preds = []
# shuffle=False keeps predictions in val_df row order; collate_fn matches the
# other loaders so labels of differing length can still be batched.
test_loader_sub = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)

with torch.no_grad():
    for images, _, _ in tqdm(test_loader_sub, desc="Generating Submission"):
        images = images.to(DEVICE)
        preds = model(images)
        decoded_preds = ctc_decode(preds, int_to_char)
        all_preds.extend(decoded_preds)

# assign() builds a new frame, so no column is written into a slice of val_df.
submission_df = val_df.assign(
    image_path=val_df['image_path'].apply(os.path.basename),
    predicted_label=all_preds,
)[['image_path', 'label', 'predicted_label']]

submission_df.to_csv('submission.csv', index=False)
print("\nsubmission.csv created successfully!")
display(submission_df.head())
