In [None]:
!pip install easyocr --quiet

In [None]:
import zipfile
import os

# Archive to unpack and the directory that receives its contents.
zip_path = "/content/drive/MyDrive/arabic_images_3.zip"  # adjust to the actual archive name
extract_dir = "/content/unzipped_data"

# Ensure the destination exists before unpacking (no error if already present).
os.makedirs(extract_dir, exist_ok=True)

# Unpack every member; the archive's internal folder layout is kept as-is.
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path=extract_dir)

print("✅ Extraction completed.")


In [None]:
import os
import re
import shutil
from PIL import Image, UnidentifiedImageError
import easyocr
from tqdm import tqdm

# === Folder Setup ===
# INPUT_FOLDER  : root of the unzipped image tree that gets scanned.
# OUTPUT_FOLDER : parent of the two result folders created below.
#                 (An alternative location, /content/drive/MyDrive/SOLVERSE,
#                 was noted here previously.)
INPUT_FOLDER = "/content/unzipped_data"
OUTPUT_FOLDER = "/content/drive/MyDrive/arabic_image_filtered"

# Images are copied into one of these two subfolders depending on the OCR check.
PASSED_FOLDER, FAILED_FOLDER = (
    os.path.join(OUTPUT_FOLDER, label) for label in ("passed", "failed")
)

# Create both result folders up front; existing folders are left untouched.
for _result_folder in (PASSED_FOLDER, FAILED_FOLDER):
    os.makedirs(_result_folder, exist_ok=True)

# === Regex Setup ===
# Character-class fragments shared by the two compiled patterns below.
_ARABIC_LETTERS = r'\u0621-\u064A\u066E\u066F\u0671-\u06D3\u06FA-\u06FC'  # Arabic letters
_ARABIC_INDIC_DIGITS = r'\u0660-\u0669'                                   # Arabic-Indic digits
_ASCII_DIGITS = r'0-9'                                                    # ASCII digits
_PUNCT_AND_SPACE = r'\s\.,؛؟!\'\"\(\)\[\]\{\}%٪:-'                        # Punctuation & spaces
# NOTE(review): the Arabic comma (U+060C) and Arabic diacritics (U+064B-U+0652)
# are not part of the allowed set, so text containing them is rejected —
# confirm this is intended.

# Matches a single Arabic letter anywhere in the text.
arabic_letter_pattern = re.compile('[' + _ARABIC_LETTERS + ']')

# Matches text made up exclusively of the allowed characters.
allowed_chars_pattern = re.compile(
    '^['
    + _ARABIC_LETTERS
    + _ARABIC_INDIC_DIGITS
    + _ASCII_DIGITS
    + _PUNCT_AND_SPACE
    + ']+$'
)


def is_valid_arabic_text(text: str) -> bool:
    """Decide whether ``text`` counts as clean Arabic text.

    After stripping surrounding whitespace, the text is accepted only when it
    is non-empty, contains at least one Arabic letter, and consists solely of
    characters from the allowed set (Arabic letters, Arabic-Indic and ASCII
    digits, whitespace, and the listed punctuation).

    Args:
        text: The string to check (e.g. OCR output).

    Returns:
        True if all three conditions hold, otherwise False.
    """
    stripped = text.strip()
    return (
        bool(stripped)
        and arabic_letter_pattern.search(stripped) is not None
        and allowed_chars_pattern.fullmatch(stripped) is not None
    )

# === OCR Reader ===
# A single Arabic EasyOCR reader is created once and reused for every image.
reader = easyocr.Reader(['ar'], gpu=True)

# File extensions (compared case-insensitively) that are treated as images.
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")


def _find_image_paths(root_dir):
    """Return the paths of all files under ``root_dir`` with an image extension."""
    return [
        os.path.join(dirpath, fname)
        for dirpath, _, fnames in os.walk(root_dir)
        for fname in fnames
        if fname.lower().endswith(IMAGE_EXTENSIONS)
    ]


def _is_readable_image(path):
    """Return True if Pillow can open and fully decode the file at ``path``.

    The file is opened in a ``with`` block so its handle is always released.
    Truncated or corrupt files raise ``OSError`` during decoding, so that is
    handled in addition to ``UnidentifiedImageError``.
    """
    try:
        with Image.open(path) as img:
            img.convert("RGB")
    except (UnidentifiedImageError, OSError):
        return False
    return True


def _flat_dest_name(src_path, root_dir):
    """Build a collision-free flat file name from the path relative to ``root_dir``.

    Images come from a nested tree but are copied into flat folders; joining
    the relative path components with ``__`` keeps same-named files from
    different subfolders from overwriting each other. Files that sit directly
    in ``root_dir`` keep their original name.
    """
    return os.path.relpath(src_path, root_dir).replace(os.sep, "__")


# === Process Images ===
count_passed = 0
count_failed = 0
count_skipped = 0   # unreadable images and images whose OCR/copy raised
total_images = 0

# Collect every candidate first so one progress bar covers the whole run
# instead of one bar per directory.
image_paths = _find_image_paths(INPUT_FOLDER)

for src_path in tqdm(image_paths, desc="🔎 Processing images"):
    total_images += 1
    fname = os.path.basename(src_path)

    # Skip files Pillow cannot decode, but keep track of them.
    if not _is_readable_image(src_path):
        count_skipped += 1
        continue

    try:
        # The OCR result is joined below as a list of strings.
        result = reader.readtext(src_path, detail=0, paragraph=True)
        text = " ".join(result).strip()

        dest_name = _flat_dest_name(src_path, INPUT_FOLDER)
        if is_valid_arabic_text(text):
            shutil.copy2(src_path, os.path.join(PASSED_FOLDER, dest_name))
            count_passed += 1
        else:
            shutil.copy2(src_path, os.path.join(FAILED_FOLDER, dest_name))
            count_failed += 1
    except Exception as e:
        # Best effort: report the problem and continue with the next image.
        print(f"Error with {fname}: {e}")
        count_skipped += 1
        continue

print(f"\n✅ DONE: Passed = {count_passed}, Failed = {count_failed}, Total = {total_images}")
if count_skipped:
    # Explains any gap between Passed + Failed and Total.
    print(f"⚠️ Skipped (unreadable image or processing error) = {count_skipped}")
