<a href="https://colab.research.google.com/github/PrajwalChopade/Captcha_Dataset/blob/main/model_captcha.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from google.colab import drive
drive.mount('/content/drive')

# ======================================================
# 2️⃣ Clone your public GitHub repository
# ======================================================
!git clone https://github.com/PrajwalChopade/Captcha_Dataset.git

# ======================================================
# 3️⃣ Move into the dataset directory
# ======================================================
%cd /content/Captcha_Dataset

# Check that all folders (Digits, Lowercase, etc.) are present
!ls

MessageError: Error: credential propagation was unsuccessful

In [2]:
# Mount your Google Drive (optional if dataset is on Drive)
from google.colab import drive
drive.mount('/content/drive')

# Change to your dataset directory
%cd https://drive.google.com/drive/folders/1pbF7Tk_RdrBJwgEyBJjtSCyGBhXXIT3V?usp=drive_link

%cd /content/drive/MyDrive/captcha_dataset/CAPTCHACHAOS
# # Install required packages
# !pip install timm easyocr pytesseract opencv-python-headless scikit-learn pandas tqdm

# # Install tesseract OCR binary
# !apt install tesseract-ocr -y

MessageError: Error: credential propagation was unsuccessful

In [None]:
import os
import cv2
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
import timm

import pytesseract
import easyocr

from sklearn.model_selection import train_test_split

# Check TPU / fallback to GPU or CPU
try:
    import torch_xla
    import torch_xla.core.xla_model as xm
    DEVICE = xm.xla_device()
    print("Using TPU:", DEVICE)
except:
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    print("Using device:", DEVICE)


Using device: cpu


In [None]:
# Define Dataset Loader (for your directory structure)
class CaptchaCharDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.samples = []
        self.transform = transform
        subdirs = ["Digits", "Lowercase Alphabets", "Uppercase Alphabets", "Special Characters"]
        for sub in subdirs:
            sub_path = os.path.join(root_dir, sub)
            if not os.path.exists(sub_path):
                continue
            for label_folder in os.listdir(sub_path):
                label_path = os.path.join(sub_path, label_folder)
                if not os.path.isdir(label_path): continue
                for img_name in os.listdir(label_path):
                    if img_name.lower().endswith((".png", ".jpg", ".jpeg")):
                        self.samples.append((os.path.join(label_path, img_name), label_folder))
        print(f"✅ Loaded {len(self.samples)} images across {len(set([s[1] for s in self.samples]))} classes")

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        path, label = self.samples[idx]
        image = Image.open(path).convert("RGB")
        if self.transform:
            image = self.transform(image)
        return image, label, path


transform = T.Compose([
    T.Resize((64, 64)),
    T.ToTensor(),
    T.Normalize([0.5]*3, [0.5]*3)
])


In [None]:
# STEP 4: OCR Ensemble + Difficulty Scoring Functions
# Initialize EasyOCR once
reader = easyocr.Reader(['en'], gpu=True)

def ocr_predict(path):
    """Run OCR ensemble (Tesseract + EasyOCR)"""
    img = Image.open(path).convert('RGB')
    np_img = np.array(img)

    tess = pytesseract.image_to_string(img, config='--psm 10').strip()
    easy = reader.readtext(np_img, detail=0)
    easy = easy[0].strip() if len(easy) > 0 else ""

    preds = [tess[0] if tess else "", easy[0] if easy else ""]
    return preds

def compute_difficulty(preds, gt):
    """Compute difficulty = 1 - success_rate"""
    correct = sum([p == gt for p in preds if p])
    total = len([p for p in preds if p])
    return 1 - (correct / total) if total > 0 else 1




In [None]:
# STEP 5: Feature Extractor + MLP Model
def get_feature_extractor(model_name="resnet18"):
    model = timm.create_model(model_name, pretrained=True, num_classes=0, global_pool="avg")
    model.eval().to(DEVICE)
    return model

class DifficultyRegressor(nn.Module):
    def __init__(self, input_dim, hidden=[512, 128]):
        super().__init__()
        layers = []
        d = input_dim
        for h in hidden:
            layers += [nn.Linear(d, h), nn.ReLU(), nn.Dropout(0.3)]
            d = h
        layers += [nn.Linear(d, 1), nn.Sigmoid()]  # output [0,1]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x).squeeze(-1)


In [None]:
def run_phase2(data_root, out_dir, threshold=0.8):
    os.makedirs(out_dir, exist_ok=True)
    dataset = CaptchaCharDataset(data_root, transform)
    loader = DataLoader(dataset, batch_size=32, shuffle=False)

    print("🔹 Step 1: Running OCR ensemble and computing difficulty scores...")
    records = []
    for _, label, path in tqdm(loader):
        for pth, lbl in zip(path, label):
            preds = ocr_predict(pth)
            diff = compute_difficulty(preds, lbl)
            records.append({"img_path": pth, "label": lbl, "difficulty": diff})
    df = pd.DataFrame(records)
    df.to_csv(os.path.join(out_dir, "ocr_difficulty_raw.csv"), index=False)

    print("🔹 Step 2: Extracting image embeddings...")
    extractor = get_feature_extractor("resnet18")
    feats, labels, paths = [], [], []
    with torch.no_grad():
        for imgs, lbls, img_paths in tqdm(DataLoader(dataset, batch_size=64)):
            imgs = imgs.to(DEVICE)
            emb = extractor(imgs).cpu().numpy()
            feats.append(emb)
            labels.extend(lbls)
            paths.extend(img_paths)
    feats = np.vstack(feats)
    df["embedding"] = [list(f) for f in feats]

    X = feats
    y = df["difficulty"].values
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=42)

    print("🔹 Step 3: Training difficulty predictor model...")
    model = DifficultyRegressor(input_dim=X.shape[1]).to(DEVICE)
    opt = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.MSELoss()

    for epoch in range(20):
        model.train()
        xb = torch.tensor(X_train, dtype=torch.float32).to(DEVICE)
        yb = torch.tensor(y_train, dtype=torch.float32).to(DEVICE)
        pred = model(xb)
        loss = loss_fn(pred, yb)
        opt.zero_grad(); loss.backward(); opt.step()

        model.eval()
        with torch.no_grad():
            val_loss = loss_fn(model(torch.tensor(X_val, dtype=torch.float32).to(DEVICE)),
                               torch.tensor(y_val, dtype=torch.float32).to(DEVICE)).item()
        print(f"Epoch {epoch+1}/20 | Train Loss: {loss.item():.4f} | Val Loss: {val_loss:.4f}")

    torch.save(model.state_dict(), os.path.join(out_dir, "difficulty_predictor.pth"))

    print("🔹 Step 4: Predicting difficulty for all images...")
    model.eval()
    with torch.no_grad():
        preds = model(torch.tensor(X, dtype=torch.float32).to(DEVICE)).cpu().numpy()
    df["predicted_difficulty"] = preds
    df.to_csv(os.path.join(out_dir, "ocr_difficulty_with_pred.csv"), index=False)

    hard_df = df[df["predicted_difficulty"] >= threshold]
    hard_dir = os.path.join(out_dir, "hard_dataset")
    os.makedirs(hard_dir, exist_ok=True)

    print(f"🔹 Step 5: Exporting {len(hard_df)} hard images...")
    import shutil
    for p in hard_df["img_path"]:
        shutil.copy(p, os.path.join(hard_dir, os.path.basename(p)))

    print("✅ Phase 2 complete. Results saved in:", out_dir)
    return df, hard_df


In [None]:
DATA_ROOT = "/content/drive/MyDrive/captcha_dataset/CAPTCHACHAOS"  # change to your path
OUT_DIR = "/content/drive/MyDrive/CAPTCHA_PHASE2_OUTPUT"

df, hard_df = run_phase2(DATA_ROOT, OUT_DIR, threshold=0.8)


✅ Loaded 9875 images across 79 classes
🔹 Step 1: Running OCR ensemble and computing difficulty scores...


  3%|▎         | 10/309 [03:18<2:02:58, 24.68s/it]

In [None]:
import pandas as pd

res = pd.read_csv(f"{OUT_DIR}/ocr_difficulty_with_pred.csv")
print(res.head())

# Check hardest images (difficulty ≥ 0.8)
print("Top 5 Hard Images:")
print(res.sort_values("predicted_difficulty", ascending=False).head()[["img_path","predicted_difficulty"]])
