In [1]:
!pip install -q kaggle

# Upload your Kaggle API key
from google.colab import files
files.upload()  # Select kaggle.json when prompted

# Set up Kaggle API key
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download the dataset
!kaggle datasets download -d fournierp/captcha-version-2-images
!unzip -q captcha-version-2-images.zip -d captcha_dataset


Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/fournierp/captcha-version-2-images
License(s): other


In [2]:
from transformers import ViTImageProcessor, ViTForImageClassification

# The image processor and the classifier are both restored from the same
# Hugging Face checkpoint, so its identifier is written once and shared.
VIT_CHECKPOINT = "google/vit-base-patch16-224"
processor = ViTImageProcessor.from_pretrained(VIT_CHECKPOINT)
vit_model = ViTForImageClassification.from_pretrained(VIT_CHECKPOINT)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

In [3]:
pip install pytesseract

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision.datasets import ImageFolder
import pytesseract
from transformers import ViTForImageClassification, ViTImageProcessor
from PIL import Image
import numpy as np

# Define image transformations
from torchvision import transforms

# Preprocessing applied to every image served by `dataset`.
transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=3),  # Grayscale, replicated to 3 channels (ViT needs 3 channels)
    transforms.Resize((224, 224)),  # Resize for ViT
    transforms.ToTensor(),  # Convert to tensor (values in [0,1])
])

# Load dataset
# NOTE(review): ImageFolder labels each image by the sub-directory of `root` it
# is found under, not by its file name. Elsewhere in this notebook the images
# are read from `<root>/samples/samples` and the expected text is taken from
# the file name, so these labels are presumably NOT the CAPTCHA text -- confirm
# before trusting any training metric computed from them.
dataset_path = "/content/captcha_dataset"
dataset = ImageFolder(root=dataset_path, transform=transform)

# Split dataset into training (80%) and validation (20%).
# A dedicated, seeded generator makes the split identical on every run; without
# it the membership of the two subsets changes each time the cell executes.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)


In [5]:
def extract_text_ocr(image_path):
    """Read the text of a CAPTCHA image with Tesseract OCR.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The text reported by Tesseract, with leading and trailing whitespace
        removed.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    # "--psm 6" selects Tesseract's page segmentation mode 6.
    recognised = pytesseract.image_to_string(rgb_image, config="--psm 6")
    return recognised.strip()


In [6]:
# Class-index -> character lookup: the 26 lowercase letters followed by the 10 digits.
char_map = "abcdefghijklmnopqrstuvwxyz" "0123456789"

In [7]:
def predict_cnn(image_path):
    """Predict CAPTCHA text for a single image with the CNN model.

    The image is loaded as RGB, run through the module-level `transform`,
    and classified by `cnn_model`; each predicted class index is mapped to a
    character through `char_map`.

    Args:
        image_path: Path of the image file to classify.

    Returns:
        The decoded characters joined into a string (one character per row of
        the batch, and the batch built here holds exactly one image).

    NOTE(review): because one class is predicted per image, this can only
    return a single character -- confirm that is what callers expect.
    """
    image = Image.open(image_path).convert("RGB")
    batch = transform(image).unsqueeze(0).to(device)

    # Inference has to run in eval mode: in training mode BatchNorm normalises
    # with the statistics of this one-image batch and also updates its running
    # averages. The previous mode is restored afterwards so an ongoing training
    # session is not disturbed.
    was_training = cnn_model.training
    cnn_model.eval()
    try:
        with torch.no_grad():
            logits = cnn_model(batch)
            # argmax of the raw logits equals argmax of their softmax, so no
            # softmax is needed to pick the class.
            pred_indices = logits.argmax(dim=1).cpu().numpy()
            cnn_text = "".join(char_map[i] for i in pred_indices)
    finally:
        cnn_model.train(was_training)

    return cnn_text.strip()

In [8]:
def extract_text_vit(image_path):
    """Return the text read from the image at `image_path`.

    Despite the name, no ViT model is invoked here: the call is forwarded
    unchanged to `extract_text_ocr` and its result is returned as-is.
    """
    ocr_text = extract_text_ocr(image_path)
    return ocr_text

In [9]:
class CNN_CAPTCHA(nn.Module):
    """ResNet-18 classifier whose final layer is resized to `num_classes` outputs."""

    def __init__(self, num_classes):
        """Build the network.

        Args:
            num_classes: Number of output units of the replacement `fc` layer.
        """
        super().__init__()
        # `pretrained=True` is deprecated in torchvision (since 0.13); the
        # weights enum below selects the same ImageNet checkpoint explicitly.
        self.cnn = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
        # Swap the stock classification head for one with `num_classes` outputs.
        self.cnn.fc = nn.Linear(self.cnn.fc.in_features, num_classes)

    def forward(self, x):
        """Return the backbone's output for the input batch `x`."""
        return self.cnn(x)


In [10]:
# Define models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cnn_model = CNN_CAPTCHA(num_classes=36).to(device)  # 26 letters + 10 digits
vit_model.to(device)

# Define optimizer & loss function (one optimizer per model, shared loss)
criterion = nn.CrossEntropyLoss()
optimizer_cnn = optim.AdamW(cnn_model.parameters(), lr=2e-5)
optimizer_vit = optim.AdamW(vit_model.parameters(), lr=2e-5)

# Training loop: both models are trained on the same batches, each with its
# own forward/backward/step.
for epoch in range(5):  # Change as needed
    cnn_model.train()
    vit_model.train()

    # Per-epoch running totals used for the summary printed below.
    cnn_correct, cnn_total, vit_correct, vit_total = 0, 0, 0, 0
    cnn_total_loss, vit_total_loss = 0, 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        # CNN forward pass
        optimizer_cnn.zero_grad()
        cnn_outputs = cnn_model(images)
        cnn_loss = criterion(cnn_outputs, labels)
        cnn_loss.backward()
        optimizer_cnn.step()

        cnn_total_loss += cnn_loss.item()
        cnn_correct += (cnn_outputs.argmax(1) == labels).sum().item()
        cnn_total += labels.size(0)

        # ViT forward pass
        # The processor is fed channel-last numpy images; do_rescale=False
        # because ToTensor already scaled the pixel values to [0, 1].
        inputs = processor(images=[img.permute(1, 2, 0).cpu().numpy() for img in images], return_tensors="pt", do_rescale=False)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        optimizer_vit.zero_grad()
        vit_outputs = vit_model(**inputs)
        vit_loss = criterion(vit_outputs.logits, labels)
        vit_loss.backward()
        optimizer_vit.step()

        vit_total_loss += vit_loss.item()
        vit_correct += (vit_outputs.logits.argmax(1) == labels).sum().item()
        vit_total += labels.size(0)

    # Calculate accuracies (training-set accuracy, in percent)
    cnn_acc = cnn_correct / cnn_total * 100
    vit_acc = vit_correct / vit_total * 100

    print(f"Epoch {epoch+1}:")
    print(f"  CNN  -> Loss: {cnn_total_loss / len(train_loader):.4f}, Accuracy: {cnn_acc:.2f}%")
    print(f"  ViT  -> Loss: {vit_total_loss / len(train_loader):.4f}, Accuracy: {vit_acc:.2f}%")

# Training is finished: leave both models in inference mode so that any
# prediction made later in the session does not run with training-time
# BatchNorm/dropout behaviour.
cnn_model.eval()
vit_model.eval()

# Save models
torch.save(cnn_model.state_dict(), "cnn_captcha.pth")
torch.save(vit_model.state_dict(), "vit_captcha.pth")

print("Training complete. Models saved.")


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 183MB/s]


Epoch 1:
  CNN  -> Loss: 2.9588, Accuracy: 28.39%
  ViT  -> Loss: 0.8091, Accuracy: 89.43%
Epoch 2:
  CNN  -> Loss: 1.0747, Accuracy: 99.47%
  ViT  -> Loss: 0.0006, Accuracy: 100.00%
Epoch 3:
  CNN  -> Loss: 0.3067, Accuracy: 100.00%
  ViT  -> Loss: 0.0003, Accuracy: 100.00%
Epoch 4:
  CNN  -> Loss: 0.1490, Accuracy: 100.00%
  ViT  -> Loss: 0.0002, Accuracy: 100.00%
Epoch 5:
  CNN  -> Loss: 0.0960, Accuracy: 100.00%
  ViT  -> Loss: 0.0002, Accuracy: 100.00%
Training complete. Models saved.


In [11]:
def predict_captcha(image_path):
    """Predict CAPTCHA using CNN, ViT, and OCR as a fallback.

    Args:
        image_path: Path of the CAPTCHA image.

    Returns:
        - the OCR text when both the CNN and the ViT-path results are empty;
        - the shared value when the CNN and ViT-path results agree;
        - otherwise the ViT-path result.
    """
    cnn_text = predict_cnn(image_path)
    vit_text = extract_text_vit(image_path)

    # The fallback has to be tested before the agreement check: every branch
    # below returns, so a check placed after them could never execute. OCR is
    # only run when it is actually needed.
    if cnn_text == "" and vit_text == "":
        return extract_text_ocr(image_path)

    # Agreement: either value is the answer. Disagreement: prefer the ViT path.
    if cnn_text == vit_text:
        return cnn_text
    return vit_text

In [21]:
import os

# Run the predictor over every sample image and report the ones it gets right.
pat = '/content/captcha_dataset/samples/samples'
# os.listdir returns entries in arbitrary order; sorting keeps the printed
# output the same from run to run.
for image in sorted(os.listdir(pat)):
    image_path = os.path.join(pat, image)
    prediction = predict_captcha(image_path)
    # The expected text is the file name without its extension; splitext strips
    # only the final extension, so a name containing extra dots is not truncated.
    if os.path.splitext(image)[0] == prediction:
        print(f"Predicted CAPTCHA for {image}: {prediction}")

Predicted CAPTCHA for 34b84.png: 34b84
Predicted CAPTCHA for 76nxn.png: 76nxn
Predicted CAPTCHA for 87d4c.png: 87d4c


In [14]:
from flask import Flask, request, render_template_string
from flask_ngrok import run_with_ngrok
import torch
import torchvision.transforms as transforms
from PIL import Image
from transformers import ViTImageProcessor, ViTForImageClassification
import pytesseract
import os
from google.colab import files

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Point pytesseract at the Tesseract executable (Linux location).
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

# Create the Flask application and enable ngrok for it (for use on Colab).
app = Flask(__name__)
run_with_ngrok(app)

# Simple HTML Interface
# Template string passed to render_template_string by `home()`. It contains a
# multipart POST form whose file input is named "captcha", and a result heading
# that is rendered only when the `prediction` template variable is truthy.
HTML_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
    <title>CAPTCHA Solver</title>
</head>
<body>
    <h1>Upload CAPTCHA Image</h1>
    <form method="POST" enctype="multipart/form-data">
        <input type="file" name="captcha">
        <input type="submit" value="Predict">
    </form>
    {% if prediction %}
        <h2>Predicted CAPTCHA: {{ prediction }}</h2>
    {% endif %}
</body>
</html>
"""

@app.route("/", methods=["GET", "POST"])
def home():
    """Serve the upload form and, on POST, the prediction for the uploaded image.

    GET renders the empty form. POST reads the file sent in the "captcha" form
    field, runs `predict_captcha` on it and renders the form together with
    the result. A POST without a file renders the form with no prediction.

    Returns:
        The rendered HTML page.
    """
    # Local import: tempfile is only needed by this handler.
    import tempfile

    prediction = None
    if request.method == "POST":
        upload = request.files.get("captcha")
        # The field is missing when the form is bypassed, and browsers send an
        # empty filename when no file was chosen; neither is a usable image.
        if upload is not None and upload.filename:
            # A per-request temporary file keeps simultaneous uploads from
            # overwriting each other and is removed once the prediction is done.
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
                image_path = tmp.name
            try:
                upload.save(image_path)
                prediction = predict_captcha(image_path)
            finally:
                os.remove(image_path)
    return render_template_string(HTML_TEMPLATE, prediction=prediction)

# Start the server only when this code runs as the main program.
if __name__ == "__main__":
    app.run()


ModuleNotFoundError: No module named 'flask_ngrok'