In [3]:
import os
import cv2
import numpy as np
import torch
from ultralytics import YOLO
import matplotlib.pyplot as plt
from IPython.display import display
import random
import pytesseract
from pathlib import Path
import pandas as pd

In [4]:
torch.cuda.is_available()

True

In [5]:
import kagglehub
#dataset_path = kagglehub.dataset_download("jessicali9530/lfw-dataset")
dataset_path = kagglehub.dataset_download("chiragsaipanuganti/morph")



In [6]:
dataset_path

'/root/.cache/kagglehub/datasets/chiragsaipanuganti/morph/versions/2'

In [7]:
image_paths = []
for root, dirs, files in os.walk(dataset_path):
    for file in files:
        if file.lower().endswith(('.jpg')):
            image_paths.append(os.path.join(root, file))

In [8]:
def populate_whitecard(image_paths, naming, height=1540, width=1754, im_width=250, im_height=250, spacing=80):
    whitecard = np.ones((height, width, 3), dtype=np.uint8) * 255
    x = 10
    y = 10

    diagonal_size = int(np.sqrt(im_width**2 + im_height**2)) + 10

    for i, name in zip(image_paths, naming):
        img = cv2.imread(i)
        img = cv2.resize(img, (im_width, im_height))
        print(i)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        rot_canvas = np.ones((diagonal_size, diagonal_size, 3), dtype=np.uint8) * 255
        start_x = (diagonal_size - im_width) // 2
        start_y = (diagonal_size - im_height) // 2
        rot_canvas[start_y:start_y + im_height, start_x:start_x + im_width] = img

        rot_angle = random.uniform(-180, 180)
        center = (diagonal_size // 2, diagonal_size // 2)
        rot_matrix = cv2.getRotationMatrix2D(center, rot_angle, 1.0)
        rot_img = cv2.warpAffine(rot_canvas, rot_matrix, (diagonal_size, diagonal_size), borderValue=(255, 255, 255))

        whitecard[y:y+diagonal_size, x:x+diagonal_size] = rot_img

        cv2.putText(whitecard, str(name), (x + diagonal_size // 2, y + diagonal_size + (spacing // 2)), 
                   cv2.FONT_HERSHEY_SCRIPT_COMPLEX, 1.3, (0, 0, 0), 2, cv2.LINE_AA)
        
        x += diagonal_size + 40
        if x > (width-diagonal_size):
            x = 10
            y += diagonal_size + spacing
        if y > (height-diagonal_size):
            return whitecard
    return whitecard

In [9]:
def get_cards(image_paths, num, im_width, im_height):
    cards = []
    random.shuffle(image_paths)
    for i in range(num):
        card = populate_whitecard(image_paths[i*20:(i+1)*20], [f"{i*20+j+1}" for j in range(20)], im_height=im_height, im_width=im_width)
        cards.append(card)
    return cards

In [10]:
cards = get_cards(image_paths, 10, im_width=200, im_height=200)

output_path = Path("cards")
output_path.mkdir(exist_ok=True)

for i, card in enumerate(cards):
    card_bgr = cv2.cvtColor(card, cv2.COLOR_RGB2BGR)
    cv2.imwrite(f'cards/card_{i+1}.jpg', card_bgr)

/root/.cache/kagglehub/datasets/chiragsaipanuganti/morph/versions/2/Dataset/Images/Test/109931_0M32.JPG
/root/.cache/kagglehub/datasets/chiragsaipanuganti/morph/versions/2/Dataset/Images/Train/01505_05M28.JPG
/root/.cache/kagglehub/datasets/chiragsaipanuganti/morph/versions/2/Dataset/Images/Validation/276845_01M20.JPG
/root/.cache/kagglehub/datasets/chiragsaipanuganti/morph/versions/2/Dataset/Images/Train/328195_00M16.JPG
/root/.cache/kagglehub/datasets/chiragsaipanuganti/morph/versions/2/Dataset/Images/Train/307848_01M25.JPG
/root/.cache/kagglehub/datasets/chiragsaipanuganti/morph/versions/2/Dataset/Images/Train/27366_02F19.JPG
/root/.cache/kagglehub/datasets/chiragsaipanuganti/morph/versions/2/Dataset/Images/Train/137577_4M34.JPG
/root/.cache/kagglehub/datasets/chiragsaipanuganti/morph/versions/2/Dataset/Images/Train/39447_03F24.JPG
/root/.cache/kagglehub/datasets/chiragsaipanuganti/morph/versions/2/Dataset/Images/Train/17019_01M29.JPG
/root/.cache/kagglehub/datasets/chiragsaipanugan

In [11]:
model = YOLO('yolo11n.pt')  # nano
model = YOLO('yolo11s.pt')  # small
model = YOLO('yolo11m.pt')  # medium
model = YOLO('yolo11l.pt')  # large
model = YOLO('yolo11x.pt')  # extra large

In [12]:
model = YOLO('yolo11n-pose.pt')  # nano
model = YOLO('yolo11s-pose.pt')  # small
model = YOLO('yolo11m-pose.pt')  # medium
model = YOLO('yolo11l-pose.pt')  # large
model = YOLO('yolo11x-pose.pt')  # extra large

In [None]:
class CardProcessor:
    def __init__(self, detection_model="yolo11x.pt", pose_model="yolo11x-pose.pt"):
        self.detection_model = YOLO(detection_model)
        self.pose_model = YOLO(pose_model)
    
    def process_card(self, card_path):
        card = cv2.imread(card_path)

        image_regions = self.detect_image_regions(card)

        print(f"Found {len(image_regions)} image regions.")

        results = []
        valid_count = 0
        for i, region in enumerate(image_regions):
            crop = self.extract_crop(card, region)
            if crop.shape[0] < 50 or crop.shape[1] < 50:
                continue
        
            rotated_crop, angle = self.correct_orientation(crop)
        
            is_valid = self.validate_photo(rotated_crop)

            if is_valid:
                valid_count += 1
                id = self.find_id(card, region)

                results.append({
                    'id': id,
                    'rotated_crop': rotated_crop,
                    'angle': angle,
                    'source_card': Path(card_path).name,
                    'region_idx': i,
                    'region': region
                })
        print(f"Valid faces found: {valid_count}/{len(image_regions)}")
        return results
    
    def detect_image_regions(self, card):
        #gray = cv2.cvtColor(card, cv2.COLOR_BGR2GRAY)
#
        #regions = []
#
        #edges1 = cv2.Canny(gray, 30, 100, apertureSize=3)
        #contours1, _ = cv2.findContours(edges1, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        #regions.extend(self.extract_regions(contours1, card.shape))
#
        #print(len(regions), "regions found after Canny edge detection.")
#
        #adaptive2 = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
        #edges2 = cv2.Canny(adaptive2, 50, 150)
        #contours2, _ = cv2.findContours(edges2, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        #regions.extend(self.extract_regions(contours2, card.shape))
#
        #print(len(regions), "regions found after adaptive thresholding and Canny edge detection.")
#
        #kernel3 = cv2.getStructuringElement(cv2.MORPH_RECT, (5,5))
        #morph3 = cv2.morphologyEx(gray, cv2.MORPH_CLOSE, kernel3)
        #edges3 = cv2.Canny(morph3, 50, 150)
        #contours3, _ = cv2.findContours(edges3, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        #regions.extend(self.extract_regions(contours3, card.shape))
#
        #print(len(regions), "regions found after morphological operations and Canny edge detection.")

        results = self.detection_model(card, classes=[0], verbose=False)

        regions = []

        for result in results:
            if result.boxes is not None:
                for box in result.boxes:
                    if box.conf[0] > 0.35:
                        x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
                        x, y, w, h = int(x1), int(y1), int(x2 - x1), int(y2 - y1)
                        regions.append((x, y, w, h))

        print(f"Regions detected by YOLO: {len(regions)}")  

        print(f"Total regions before deduplication: {len(regions)}")
        unique_regions = self.remove_duplicate_regions(regions)
        print(f"Unique regions after deduplication: {len(unique_regions)}")
        #valid_regions = self.filter_valid_regions(unique_regions, card.shape)
        #print(f"Valid regions after filtering: {len(valid_regions)}")

        final_regions = sorted(unique_regions, key=lambda r: (r[1], r[0]))

        return final_regions
    
    def extract_regions(self, contours, card_shape):
        regions = []
        card_area = card_shape[0] * card_shape[1]

        for contour in contours:
            area = cv2.contourArea(contour)

            if area < card_area * 0.002 or area > card_area * 0.5:
                continue

            epsilon = 0.02 * cv2.arcLength(contour, True)
            approx = cv2.approxPolyDP(contour, epsilon, True)

            if len(approx) >= 4:
                x, y, w, h = cv2.boundingRect(approx)

                ratio = w / h if h > 0 else 0

                if 0.3 < ratio < 3.0:
                    regions.append((x, y, w, h))

        return regions
    
    def remove_duplicate_regions(self, regions):
        if not regions:
            return []
        
        unique_regions = []

        for region in regions:
            is_duplicate = False
            x1, y1, w1, h1 = region

            for registered in unique_regions:
                x2, y2, w2, h2 = registered

                iou = self.calculate_iou((x1, y1, w1, h1), (x2, y2, w2, h2))

                if iou > 0.7:
                    is_duplicate = True
                    break

            if not is_duplicate:
                unique_regions.append(region)

        return unique_regions
    
    def calculate_iou(self, region1, region2):
        x1, y1, w1, h1 = region1
        x2, y2, w2, h2 = region2

        x_intersect = max(x1, x2)
        y_intersect = max(y1, y2)
        w_intersect = min(x1 + w1, x2 + w2)
        h_intersect = min(y1 + h1, y2 + h2)

        if w_intersect <= x_intersect or h_intersect <= y_intersect:
            return 0
        
        intersection = (w_intersect - x_intersect) * (h_intersect - y_intersect)

        area1 = w1 * h1
        area2 = w2 * h2
        union = area1 + area2 - intersection

        return intersection / union if union > 0 else 0
    
    def filter_valid_regions(self, regions, card_shape):
        valid_regions = []
        card_h, card_w = card_shape[:2]
        min_size = 100
        max_size = min(card_h, card_w) // 3

        for x, y, w, h in regions:
            aspect_ratio = max(w, h) / min(w, h)
            if aspect_ratio > 2.0:
                continue
            if min_size <= min(w, h) and max(w, h) <= max_size:
                margin = 5
                if (margin < x < card_w - w - margin and
                    margin < y < card_h - h - margin):
                    valid_regions.append((x, y, w, h))

        return valid_regions
    
    def find_id(self, card, region):
        x, y, w, h = region

        search_areas = []

        below_y = y + h + 10
        below_h = min(120, card.shape[0] - below_y)

        if below_y + below_h <= card.shape[0]:
            search_x = max(0, x - 20)
            search_w = min(card.shape[1] - search_x, w + 40)
            search_areas.append(('below', (search_x, below_y, search_w, below_h)))
        
        #above_h = min(60, y)
        #if above_h > 10:
        #    search_areas.append(('above', (x, max(0, y - above_h), w, above_h)))

        #right_x = x + w + 2
        #right_w = min(100, card.shape[1] - right_x)
        #if right_x + right_w <= card.shape[1]:
        #    search_areas.append(('right', (right_x, y, right_w, h)))
        
        #left_w = min(100, x)
        #if left_w > 10:
        #    search_areas.append(('left', (max(0, x - left_w), y, left_w, h)))

        for location, (sx, sy, sw, sh) in search_areas:
            id = self.extract_id(card, (sx, sy, sw, sh))

            if id not in ["ID_NOT_FOUND", "ID_NOT_DETECTED", "OCR_ERROR"]:
                return id
            
        return "ID_NOT_DETECTED"
    
    def extract_id(self, card, region):
        x, y, w, h = region

        x = max(0, x)
        y = max(0, y)
        w = min(w, card.shape[1] - x)
        h = min(h, card.shape[0] - y)

        if w <= 0 or h <= 0:
            print(f"Debug - Invalid crop dimensions for OCR: w={w}, h={h}")
            return "ID_NOT_FOUND"
        
        id_crop = card[y:y+h, x:x+w]

        try:
            gray = cv2.cvtColor(id_crop, cv2.COLOR_BGR2GRAY)
            
            all_confident_digits = []

            preproc_methods = {
                'simple_binary': lambda g: cv2.threshold(g, 127, 255, cv2.THRESH_BINARY)[1],
                'otsu': lambda g: cv2.threshold(g, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1],
                'adaptive_gaussian': lambda g: cv2.adaptiveThreshold(g, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 5)
            }

            tesseract_configs = [
                r'--oem 3 --psm 6 -c tessedit_char_whitelist=0123456789',
                r'--oem 3 --psm 7 -c tessedit_char_whitelist=0123456789',
                r'--oem 3 --psm 8 -c tessedit_char_whitelist=0123456789',
                r'--oem 3 --psm 13 -c tessedit_char_whitelist=0123456789'
            ]
            
            min_crop_dim_for_ocr = 10
            if gray.shape[0] < min_crop_dim_for_ocr or gray.shape[1] < min_crop_dim_for_ocr:
                print(f"Debug - OCR crop too small: {gray.shape}")
                return "ID_NOT_DETECTED"

            for method_name, preproc_func in preproc_methods.items():
                try:
                    thresh_img = preproc_func(gray.copy())
                except cv2.error as e:
                    print(f"Debug - OpenCV error in {method_name} preprocessing: {e}")
                    continue

                for scale_factor in [2, 3]:
                    scaled_h, scaled_w = thresh_img.shape[0] * scale_factor, thresh_img.shape[1] * scale_factor
                    if scaled_h == 0 or scaled_w == 0:
                        continue
                    scaled_img = cv2.resize(thresh_img, (scaled_w, scaled_h), interpolation=cv2.INTER_CUBIC)

                    for config_str in tesseract_configs:
                        try:
                            data = pytesseract.image_to_data(scaled_img, config=config_str, output_type=pytesseract.Output.DICT)
                            page_text_segments = []
                            for i in range(len(data['text'])):
                                text_segment = data['text'][i].strip()
                                confidence = int(data['conf'][i])
                                if confidence > 50 and text_segment:
                                    digits_in_segment = ''.join(filter(str.isdigit, text_segment))
                                    if digits_in_segment:
                                        page_text_segments.append(digits_in_segment)
                            if page_text_segments:
                                combined_digits = "".join(page_text_segments)
                                if combined_digits:
                                     all_confident_digits.append(combined_digits)
                        except RuntimeError:
                            pass
                        except Exception:
                            pass

            if all_confident_digits:
                from collections import Counter
                
                if len(all_confident_digits) > 5:
                    lengths = [len(s) for s in all_confident_digits]
                    median_len = sorted(lengths)[len(lengths)//2]
                    reasonable_results = [r for r in all_confident_digits if abs(len(r) - median_len) <= 1 or len(r) == median_len]
                    if not reasonable_results:
                        reasonable_results = all_confident_digits
                else:
                    reasonable_results = all_confident_digits

                if not reasonable_results:
                     print(f"Debug - No reasonable_results from all_confident_digits: {all_confident_digits}")
                     return "ID_NOT_DETECTED"

                counts = Counter(reasonable_results)
                sorted_by_freq = counts.most_common()

                if not sorted_by_freq:
                    print(f"Debug - No results after Counter on reasonable_results: {reasonable_results}")
                    return "ID_NOT_DETECTED"

                chosen_id = sorted_by_freq[0][0] 

                if len(sorted_by_freq) > 1:
                    val1, c1 = sorted_by_freq[0]
                    val2, c2 = sorted_by_freq[1]
                    
                    # Apply suffix heuristic if counts are close (c1 is not much larger than c2)
                    if c1 <= c2 * 1.5 : # e.g., 8 vs 6 is true (8 <= 9), 10 vs 6 is false (10 <= 9)
                        # Check if val2 (second most common) is shorter by one char and a suffix of val1 (most common)
                        if len(val1) == len(val2) + 1 and val1.endswith(val2):
                            chosen_id = val2
                            print(f"Debug - Suffix Heuristic Applied: Chose '{val2}' (count {c2}) over '{val1}' (count {c1})")
                        # Check if val1 (most common) is shorter by one char and a suffix of val2 (second most common)
                        # This case is less likely if val1 is already the most_common, but good for completeness
                        elif len(val2) == len(val1) + 1 and val2.endswith(val1):
                            # chosen_id is already val1, so no change, but we can log it
                            print(f"Debug - Suffix Heuristic Confirmed: '{val1}' (count {c1}) is shorter suffix of '{val2}' (count {c2})")
                
                print(f"Debug - All confident digits: {all_confident_digits}, Filtered (reasonable): {reasonable_results}, Chosen: {chosen_id}")
                return chosen_id
            else:
                print(f"Debug - No confident digits found after all attempts.")
                return "ID_NOT_DETECTED"
        
        except Exception as e:
            print(f"Debug - OCR_ERROR exception in extract_id: {e}")
            return "OCR_ERROR"
        
    def correct_orientation(self, crop):
        best_crop = crop.copy()
        best_angle = 0
        best_score = 0

        angles = list(range(0, 360, 15))

        for angle in angles:
            if angle == 0:
                rotated = crop
            else:
                rotated = self.rotate_image(crop, angle)

            orientation_data = self.analyze_face_orientation(rotated)

            if orientation_data['has_face']:
                score = self.calculate_orientation_score(orientation_data)
                if score > best_score:
                    best_score = score
                    best_crop = rotated.copy()
                    best_angle = angle

        if best_angle > 0:
            angles = list(range(best_angle - 10, best_angle + 11, 1))

            for angle in angles:
                rotated = self.rotate_image(crop, angle)
                orientation_data = self.analyze_face_orientation(rotated)

                if orientation_data['has_face']:
                    score = self.calculate_orientation_score(orientation_data)
                    if score > best_score:
                        best_score = score
                        best_crop = rotated.copy()
                        best_angle = angle

        return best_crop, best_angle

    def analyze_face_orientation(self, image):

        try:
            results = self.pose_model(image, verbose=False)

            orientation_data = {
                'has_face': False,
                'confidence': 0,
                'face_upright': False,
                'keypoints': {},
            }

            for result in results:
                keypoints = result.keypoints
                boxes = result.boxes

                if keypoints is not None and boxes is not None and len(boxes) > 0:
                    best_idx = torch.argmax(boxes.conf).item()
                    kp = keypoints[best_idx]
                    box_conf = boxes.conf[best_idx].item()

                    nose = kp.xy[0][0]
                    left_eye = kp.xy[0][1]
                    right_eye = kp.xy[0][2]

                    valid_keypoints = 0
                    if len(nose) == 2 and nose[0] > 0 and nose[1] > 0:
                        orientation_data['keypoints']['nose'] = nose.tolist()
                        valid_keypoints += 1

                    if len(left_eye) == 2 and left_eye[0] > 0 and left_eye[1] > 0:
                        orientation_data['keypoints']['left_eye'] = left_eye.tolist()
                        valid_keypoints += 1

                    if len(right_eye) == 2 and right_eye[0] > 0 and right_eye[1] > 0:
                        orientation_data['keypoints']['right_eye'] = right_eye.tolist()
                        valid_keypoints += 1

                    if valid_keypoints >= 2:
                        orientation_data['has_face'] = True
                        orientation_data['confidence'] = box_conf

                        if 'nose' in orientation_data['keypoints']:
                            nose_y = orientation_data['keypoints']['nose'][1]
                            eye_above = True

                            for eye in ['left_eye', 'right_eye']:
                                if eye in orientation_data['keypoints']:
                                    if orientation_data['keypoints'][eye][1] >= nose_y:
                                        eye_above = False

                            orientation_data['face_upright'] = eye_above

                        break
            return orientation_data
        
        except Exception as e:
            return {'has_face': False, 'confidence': 0, 'face_upright': False, 'keypoints': {}}
        
    def calculate_orientation_score(self, orientation_data):
        if not orientation_data['has_face']:
            return 0
        
        score = orientation_data['confidence'] * 100

        if orientation_data['face_upright']:
            score += 50

        keypoint_count = len(orientation_data['keypoints'])
        score += keypoint_count * 5

        return score
    
    def rotate_image(self, image, angle):
        h, w = image.shape[:2]
        center = (w // 2, h // 2)

        rotation_matrix = cv2.getRotationMatrix2D(center, angle, 1.0)

        cos_val = abs(rotation_matrix[0, 0])
        sin_val = abs(rotation_matrix[0, 1])
        new_w = int((h * sin_val) + (w * cos_val))
        new_h = int((h * cos_val) + (w * sin_val))

        rotation_matrix[0, 2] += (new_w / 2) - center[0]
        rotation_matrix[1, 2] += (new_h / 2) - center[1]

        rotated = cv2.warpAffine(image, rotation_matrix, (new_w, new_h), borderMode=cv2.BORDER_CONSTANT, borderValue=(255, 255, 255))

        return rotated
    
    def extract_crop(self, card, region):
        x, y, w, h = region
        padding = 5
        x = max(0, x - padding)
        y = max(0, y - padding)
        w = min(card.shape[1] - x, w + 2 * padding)
        h = min(card.shape[0] - y, h + 2 * padding)

        crop = card[y:y+h, x:x+w]
        return crop
    
    def validate_photo(self, crop):
        try:
            results = self.detection_model(crop, classes=[0], verbose=False)

            for result in results:
                if result.boxes is not None and len(result.boxes) > 0:
                    max_conf = max([box.conf[0].item() for box in result.boxes])
                    return max_conf > 0.1
                
            return False
        except:
            return False
        

def process_cards(input_dir, output_dir):
    processor = CardProcessor()

    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True)

    all_images_info = []

    for image_file in input_path.glob('*.jpg'):
        print(f"Processing {image_file.name}...")
        results = processor.process_card(str(image_file))

        for result in results:
            id = result['id']
            if id in ["ID_NOT_FOUND", "ID_NOT_DETECTED", "OCR_ERROR"]:
                filename = f"unknown_{result['source_card']}_{result['region_idx']}.jpg"
            else:
                filename = f"{id}.jpg"

            output_file = output_path / filename

            counter = 1
            while output_file.exists():
                if id in ["ID_NOT_FOUND", "ID_NOT_DETECTED", "OCR_ERROR"]:
                    filename = f"unknown_{result['source_card']}_{result['region_idx']}_{counter}.jpg"
                else:
                    filename = f"{id}_{counter}.jpg"
                output_file = output_path / filename
                counter += 1
            
            cv2.imwrite(str(output_file), result['rotated_crop'])

            all_images_info.append({
                'id': result['id'],
                'filename': filename,
                'source_card': result['source_card'],
                'region_idx': result['region_idx'],
                'angle': result['angle'],
                'region_x': result['region'][0],
                'region_y': result['region'][1],
                'region_w': result['region'][2],
                'region_h': result['region'][3],
            })

            print(f"  Saved: {filename} (ID: {result['id']}, rotated: {result['angle']}°)")

    if all_images_info:
        df = pd.DataFrame(all_images_info)
        df.to_csv(output_path / 'images_info.csv', index=False)

        print(f"\nProcessing complete!")
        print(f"Total images saved: {len(all_images_info)}")
        print(f"All images saved to: {output_path}")
        print(f"Image info saved to: {output_path / 'all_images.csv'}")

    return all_images_info

In [22]:
results = process_cards('cards', 'extracted')

Processing card_2.jpg...


RuntimeError: CUDA error: unknown error
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
