In [3]:
!pip install opencv-python

Collecting opencv-python
  Downloading opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl.metadata (20 kB)
Downloading opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl (39.5 MB)
   ---------------------------------------- 0.0/39.5 MB ? eta -:--:--
    --------------------------------------- 0.5/39.5 MB 3.4 MB/s eta 0:00:12
   - -------------------------------------- 1.8/39.5 MB 5.6 MB/s eta 0:00:07
   -- ------------------------------------- 2.6/39.5 MB 5.6 MB/s eta 0:00:07
   --- ------------------------------------ 3.4/39.5 MB 4.7 MB/s eta 0:00:08
   --- ------------------------------------ 3.9/39.5 MB 4.0 MB/s eta 0:00:09
   ---- ----------------------------------- 4.2/39.5 MB 3.6 MB/s eta 0:00:10
   ---- ----------------------------------- 4.5/39.5 MB 3.1 MB/s eta 0:00:12
   ---- ----------------------------------- 4.5/39.5 MB 3.1 MB/s eta 0:00:12
   ---- ----------------------------------- 4.7/39.5 MB 2.7 MB/s eta 0:00:13
   ----- ---------------------------------- 5.0/39.5 MB 2.4 MB/


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import cv2
import os
import numpy as np

# Hyperparameters
IMG_HEIGHT, IMG_WIDTH = 300, 300
BATCH_SIZE = 16
EPOCHS = 10
LEARNING_RATE = 1e-4
MAX_TEXT_LENGTH = 10  # Adjust based on dataset

# Character set
CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
CHAR2IDX = {c: i + 1 for i, c in enumerate(CHARS)}  # Leave 0 for blank token
IDX2CHAR = {i: c for c, i in CHAR2IDX.items()}

# Dataset
class CaptchaDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.data = []
        for word in os.listdir(root_dir):
            word_path = os.path.join(root_dir, word)
            easy = 0
            hard_hollow = 0
            hard_normal = 0
            green = 0
            red = 0
            if os.path.isdir(word_path):
                for img_file in os.listdir(word_path):
                    if img_file.startswith("easy"):
                        easy += 1
                        if easy > 1:
                            continue
                    elif img_file.startswith("hardhollow"):
                        hard_hollow += 1
                        if hard_hollow > 20:
                            continue
                    elif img_file.startswith("hardnormal"):
                        hard_normal += 1
                        if hard_normal > 20:
                            continue
                    elif img_file.startswith("green"):
                        green += 1
                        if green > 20:
                            continue
                    elif img_file.startswith("red"):
                        red += 1
                        if red > 40:
                            continue
                    self.data.append((os.path.join(word_path, img_file), word))
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        img_path, label = self.data[idx]
        image = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        image = cv2.resize(image, (IMG_WIDTH, IMG_HEIGHT))
        image = np.expand_dims(image, axis=0) / 255.0  # Normalize
        label_encoded = [CHAR2IDX[c] for c in label]
        return torch.FloatTensor(image), torch.LongTensor(label_encoded)

# Model
class CRNN(nn.Module):
    def __init__(self, num_classes=len(CHARS) + 1):
        super(CRNN, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1), nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1), nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1), nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1), nn.ReLU(),
            nn.MaxPool2d((2, 2)),
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1), nn.ReLU(),
            nn.MaxPool2d((2, 2))
        )
        self.rnn = nn.LSTM(512 * (IMG_WIDTH // 16), 256, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(512, num_classes)

    def forward(self, x):
        x = self.cnn(x)
        x = x.permute(0, 2, 3, 1).contiguous()
        x = x.view(x.size(0), x.size(1), -1)
        x, _ = self.rnn(x)
        x = self.fc(x)
        return x

# Loss and Training
criterion = nn.CTCLoss(blank=0, zero_infinity=True)

def collate_fn(batch):
    images, labels = zip(*batch)
    images = torch.stack(images)  # Stack images normally
    labels = pad_sequence(labels, batch_first=True, padding_value=0)  # Pad labels
    return images, labels

def train_model():
    dataset = CaptchaDataset("dataset", transform=None)
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    model = CRNN().cuda()
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    
    for epoch in range(EPOCHS):
        total_loss = 0
        for images, labels in dataloader:
            images, labels = images.cuda(), labels.cuda()
            
            optimizer.zero_grad()
            logits = model(images)
            
            input_lengths = torch.full((logits.size(0),), logits.size(1), dtype=torch.long).cuda()
            target_lengths = torch.tensor([len(lbl) for lbl in labels], dtype=torch.long, device=logits.device)
            if logits.size(0) > 0:
                loss = criterion(logits.log_softmax(2).permute(1, 0, 2), labels, input_lengths, target_lengths)
                loss.backward()
                optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss / len(dataloader):.4f}")
    
    torch.save(model.state_dict(), "captcha_model.pth")
    print("Model saved!")

In [31]:
train_model()

Epoch 1/10, Loss: 2.4748
Epoch 2/10, Loss: 2.0842
Epoch 3/10, Loss: 2.0067
Epoch 4/10, Loss: 1.8880
Epoch 5/10, Loss: 1.7055
Epoch 6/10, Loss: 1.4720
Epoch 7/10, Loss: 1.1944
Epoch 8/10, Loss: 0.8905
Epoch 9/10, Loss: 0.5833
Epoch 10/10, Loss: 0.3103
Model saved!


In [10]:
import torch

# Initialize the model
model = CRNN().cuda() if torch.cuda.is_available() else CRNN()
model.load_state_dict(torch.load("captcha_model.pth", map_location=torch.device("cuda" if torch.cuda.is_available() else "cpu")))
model.eval()  # Set to evaluation mode

CRNN(
  (cnn): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU()
    (10): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): ReLU()
    (13): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  )
  (rnn): LSTM(9216, 256, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=63, bias=True)
)

In [None]:
import cv2
import numpy as np

IMG_HEIGHT, IMG_WIDTH = 300, 300  # Same as training

def preprocess_image(image_path):
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)  # Read as grayscale
    image = cv2.resize(image, (IMG_WIDTH, IMG_HEIGHT))  # Resize
    image = np.expand_dims(image, axis=0) / 255.0  # Normalize and add channel dimension
    return torch.FloatTensor(image).unsqueeze(0).cuda() if torch.cuda.is_available() else torch.FloatTensor(image).unsqueeze(0)



In [32]:
import itertools
image_path = "easy_test.png"
image_tensor = preprocess_image(image_path)
with torch.no_grad():
    logits = model(image_tensor)

logits = logits.log_softmax(2) 
predicted_indices = torch.argmax(logits, dim=2).squeeze(1)
predicted_indices = predicted_indices.squeeze(0)  # Removes batch dimension, making it (18,)
predicted_text = "".join([IDX2CHAR[idx.item()] for idx in predicted_indices if idx.item() in IDX2CHAR])
print("Predicted CAPTCHA text:", predicted_text)


def clean_prediction(text):
    return "".join(ch for ch, _ in itertools.groupby(text))
final_predicted_text = clean_prediction(predicted_text)
print("Final Predicted CAPTCHA:", final_predicted_text)

Predicted CAPTCHA text: tctot
Final Predicted CAPTCHA: tctot


In [None]:
image_path = "green_test.png"
image_tensor = preprocess_image(image_path)
with torch.no_grad():
    logits = model(image_tensor)

logits = logits.log_softmax(2) 
predicted_indices = torch.argmax(logits, dim=2).squeeze(1)
predicted_indices = predicted_indices.squeeze(0)  # Removes batch dimension, making it (18,)
predicted_text = "".join([IDX2CHAR[idx.item()] for idx in predicted_indices if idx.item() in IDX2CHAR])
print("Predicted CAPTCHA text:", predicted_text)


def clean_prediction(text):
    return "".join(ch for ch, _ in itertools.groupby(text))
final_predicted_text = clean_prediction(predicted_text)
print("Final Predicted CAPTCHA:", final_predicted_text)

Predicted CAPTCHA text: tcror
Final Predicted CAPTCHA: tcror


In [None]:
image_path = "red_test.png"
image_tensor = preprocess_image(image_path)
with torch.no_grad():
    logits = model(image_tensor)

logits = logits.log_softmax(2) 
predicted_indices = torch.argmax(logits, dim=2).squeeze(1)
predicted_indices = predicted_indices.squeeze(0)  # Removes batch dimension, making it (18,)
predicted_text = "".join([IDX2CHAR[idx.item()] for idx in predicted_indices if idx.item() in IDX2CHAR])
print("Predicted CAPTCHA text:", predicted_text)


def clean_prediction(text):
    return "".join(ch for ch, _ in itertools.groupby(text))
final_predicted_text = clean_prediction(predicted_text)
print("Final Predicted CAPTCHA:", final_predicted_text)

Predicted CAPTCHA text: aerol
Final Predicted CAPTCHA: aerol


In [None]:
image_path = "easy_test.png"
image_tensor = preprocess_image(image_path)
with torch.no_grad():
    logits = model(image_tensor)

logits = logits.log_softmax(2) 
predicted_indices = torch.argmax(logits, dim=2).squeeze(1)
predicted_indices = predicted_indices.squeeze(0)  # Removes batch dimension, making it (18,)
predicted_text = "".join([IDX2CHAR[idx.item()] for idx in predicted_indices if idx.item() in IDX2CHAR])
print("Predicted CAPTCHA text:", predicted_text)


def clean_prediction(text):
    return "".join(ch for ch, _ in itertools.groupby(text))
final_predicted_text = clean_prediction(predicted_text)
print("Final Predicted CAPTCHA:", final_predicted_text)

Predicted CAPTCHA text: tctot
Final Predicted CAPTCHA: tctot


In [36]:
image_path = "hardhollow_test.png"
image_tensor = preprocess_image(image_path)
with torch.no_grad():
    logits = model(image_tensor)

logits = logits.log_softmax(2) 
predicted_indices = torch.argmax(logits, dim=2).squeeze(1)
predicted_indices = predicted_indices.squeeze(0)  # Removes batch dimension, making it (18,)
predicted_text = "".join([IDX2CHAR[idx.item()] for idx in predicted_indices if idx.item() in IDX2CHAR])
print("Predicted CAPTCHA text:", predicted_text)


def clean_prediction(text):
    return "".join(ch for ch, _ in itertools.groupby(text))
final_predicted_text = clean_prediction(predicted_text)
print("Final Predicted CAPTCHA:", final_predicted_text)

Predicted CAPTCHA text: scoor
Final Predicted CAPTCHA: scor


In [37]:
image_path = "hardnormal_test.png"
image_tensor = preprocess_image(image_path)
with torch.no_grad():
    logits = model(image_tensor)

logits = logits.log_softmax(2) 
predicted_indices = torch.argmax(logits, dim=2).squeeze(1)
predicted_indices = predicted_indices.squeeze(0)  # Removes batch dimension, making it (18,)
predicted_text = "".join([IDX2CHAR[idx.item()] for idx in predicted_indices if idx.item() in IDX2CHAR])
print("Predicted CAPTCHA text:", predicted_text)


def clean_prediction(text):
    return "".join(ch for ch, _ in itertools.groupby(text))
final_predicted_text = clean_prediction(predicted_text)
print("Final Predicted CAPTCHA:", final_predicted_text)

Predicted CAPTCHA text: totot
Final Predicted CAPTCHA: totot
