OCR images using Docling

In [None]:
import docling

Facial recognition using MTCNN?

In [1]:
import os
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from facenet_pytorch import MTCNN, InceptionResnetV1
import numpy as np
from tqdm import tqdm
import re
from itertools import permutations

class NameMatcher:
    """Find known person names inside image filenames.

    The set of valid names is read from a text file, compiled into
    case-insensitive regular expressions, and then searched for in filenames
    via :meth:`find_name`.
    """

    def __init__(self, names_file_path):
        """Load the valid names and compile the matching patterns.

        File format (one entry per line):

        * blank lines and lines starting with ``#`` are skipped;
        * ``first: NAME``, ``last: NAME`` or ``full: NAME`` adds NAME to the
          corresponding set (entries with any other prefix are ignored);
        * a line without ``:`` is a full name if it contains a space,
          otherwise it is added as both a first and a last name.

        All names are stored lower-cased.

        Args:
            names_file_path: Path to the names file (read as UTF-8).

        Raises:
            FileNotFoundError: If ``names_file_path`` does not exist.
        """
        print(f"Loading names from {names_file_path}...")
        self.first_names = set()
        self.last_names = set()
        self.full_names = set()

        try:
            # Explicit encoding so non-ASCII names do not depend on the
            # platform locale; "utf-8-sig" also drops a leading BOM, which
            # would otherwise corrupt the first entry of the file.
            names_file = open(names_file_path, 'r', encoding='utf-8-sig')
        except FileNotFoundError as err:
            raise FileNotFoundError(f"Names file not found: {names_file_path}") from err

        with names_file:
            for line in names_file:
                self._add_entry(line)

        print(f"Loaded {len(self.first_names)} first names")
        print(f"Loaded {len(self.last_names)} last names")
        print(f"Loaded {len(self.full_names)} full names")

        self._create_patterns()

    def _add_entry(self, line):
        """Parse one line of the names file into the name sets."""
        line = line.strip()
        if not line or line.startswith('#'):
            return

        if ':' in line:
            name_type, name = line.split(':', 1)
            name = name.strip().lower()
            if not name:
                # "first:" with nothing after it; an empty name would turn
                # into an empty regex alternative that matches everywhere.
                return
            typed_sets = {
                'first': self.first_names,
                'last': self.last_names,
                'full': self.full_names,
            }
            target = typed_sets.get(name_type.strip().lower())
            if target is not None:
                target.add(name)
            return

        # No type given: decide by whether the entry contains a space.
        name = line.lower()
        if ' ' in name:
            self.full_names.add(name)
        else:
            self.first_names.add(name)
            self.last_names.add(name)

    @staticmethod
    def _alternation(names):
        """Return a regex alternation of ``names``, longest first.

        Longest-first ordering makes the regex prefer e.g. "johnathan" over
        "john" when both are known names.
        """
        ordered = sorted(names, key=lambda n: (-len(n), n))
        return '|'.join(map(re.escape, ordered))

    def _create_patterns(self):
        """Compile ``self.patterns``, in order of preference.

        Order: full names, first+last, last+first, single first name, single
        last name. A pattern is only built when the name sets it needs are
        non-empty, so no pattern contains an empty group.
        """
        full_alt = self._alternation(self.full_names)
        first_alt = self._alternation(self.first_names)
        last_alt = self._alternation(self.last_names)

        patterns = []
        if full_alt:
            patterns.append(re.compile(full_alt, re.IGNORECASE))
        if first_alt and last_alt:
            # Any run of non-letters (spaces, digits, punctuation) may
            # separate the two parts, including none at all.
            patterns.append(
                re.compile(f"({first_alt})[^a-zA-Z]*({last_alt})", re.IGNORECASE)
            )
            patterns.append(
                re.compile(f"({last_alt})[^a-zA-Z]*({first_alt})", re.IGNORECASE)
            )
        if first_alt:
            patterns.append(re.compile(f"({first_alt})", re.IGNORECASE))
        if last_alt:
            patterns.append(re.compile(f"({last_alt})", re.IGNORECASE))

        self.patterns = patterns

    def find_name(self, filename):
        """Find the longest known name contained in ``filename``.

        The extension is removed and underscores/hyphens are treated as
        spaces before matching. For two-part patterns the matched parts are
        joined with a single space, in the order they appear in the filename.

        Args:
            filename: File name (not a full path) to inspect.

        Returns:
            ``(True, name)`` with the lower-cased longest match, or
            ``(False, None)`` when no known name is found.
        """
        base_name = re.sub(r'\.[^.]+$', '', filename)
        base_name = re.sub(r'[_-]', ' ', base_name)
        haystack = base_name.lower()  # hoisted: identical for every pattern

        best_match = None
        best_match_length = 0

        for pattern in self.patterns:
            for match in pattern.finditer(haystack):
                if pattern.groups > 1:
                    matched_name = ' '.join(g for g in match.groups() if g)
                else:
                    matched_name = match.group(0)

                # Strict ">" keeps the earliest (most preferred) pattern on ties.
                if len(matched_name) > best_match_length:
                    best_match = matched_name
                    best_match_length = len(matched_name)

        if best_match:
            return True, best_match
        return False, None

class FaceDataset(Dataset):
    """Dataset of single-face images labelled by the name in their filename.

    On construction the image directory is scanned once: a file is kept only
    if it has a supported extension, its filename contains a known name
    (via ``NameMatcher``), and MTCNN detects exactly one face in it.
    Items are ``(face, label_index)`` pairs, where ``face`` is whatever
    ``self.mtcnn(image)`` returns, or a zero tensor of shape ``(3, 160, 160)``
    when extraction fails.
    """

    IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.webp')

    def __init__(self, image_dir, names_file_path):
        """Scan ``image_dir`` and build the image/label lists.

        Args:
            image_dir: Directory containing the image files (not recursive).
            names_file_path: Path to the names file passed to ``NameMatcher``.
        """
        self.image_paths = []
        self.labels = []
        self.name_matcher = NameMatcher(names_file_path)

        # Detector used during the scan to count faces per image.
        self.mtcnn = self._build_mtcnn(keep_all=True)

        print("Scanning and filtering images...")
        skipped_no_name = 0
        skipped_multiple_faces = 0
        found_names = {}

        # Sorted so that label indices do not depend on the arbitrary order
        # in which the OS lists the directory.
        for filename in tqdm(sorted(os.listdir(image_dir))):
            if not filename.lower().endswith(self.IMAGE_EXTENSIONS):
                continue
            image_path = os.path.join(image_dir, filename)

            has_name, extracted_name = self.name_matcher.find_name(filename)
            if not has_name:
                skipped_no_name += 1
                # Only report the first few to keep the output readable.
                if skipped_no_name <= 5:
                    print(f"Skipping {filename}: No known name found")
                continue

            if not self._has_single_face(image_path):
                skipped_multiple_faces += 1
                if skipped_multiple_faces <= 5:
                    print(f"Skipping {filename}: Multiple faces detected or no face found")
                continue

            self.image_paths.append(image_path)
            self.labels.append(extracted_name)
            found_names[extracted_name] = found_names.get(extracted_name, 0) + 1

        self._print_summary(found_names, skipped_no_name, skipped_multiple_faces)

        # Map each distinct name to an integer class index (first-seen order).
        self.name_to_idx = {name: idx for idx, name in enumerate(found_names)}
        self.labels = [self.name_to_idx[name] for name in self.labels]

        # From here on only one face crop per image is needed.
        self.mtcnn = self._build_mtcnn(keep_all=False)

    @staticmethod
    def _build_mtcnn(keep_all):
        """Create an MTCNN detector with the settings shared by both phases."""
        return MTCNN(
            image_size=160,
            margin=10,
            keep_all=keep_all,
            min_face_size=20,
            device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        )

    @staticmethod
    def _load_rgb(image_path):
        """Open an image, convert it to RGB and release the file handle.

        Converting avoids handing RGBA, palette or grayscale images
        (common for PNG/WebP) to the detector, and the ``with`` block closes
        the file instead of leaving that to garbage collection.
        """
        with Image.open(image_path) as img:
            return img.convert('RGB')

    def _print_summary(self, found_names, skipped_no_name, skipped_multiple_faces):
        """Print how many images were kept/skipped and the five most common names."""
        print(f"\nDataset Summary:")
        print(f"Found {len(self.image_paths)} valid images with single faces and valid names")
        print(f"Skipped {skipped_no_name} images with no matching names")
        print(f"Skipped {skipped_multiple_faces} images with multiple faces or no faces")
        print(f"Unique names found: {len(found_names)}")

        if found_names:
            print("\nMost common names found:")
            for name, count in sorted(found_names.items(), key=lambda x: x[1], reverse=True)[:5]:
                print(f"  {name}: {count} images")

    def _has_single_face(self, image_path):
        """Return True if exactly one face is detected in the image.

        Any error while reading or processing the image is printed and
        treated as "not a single face", so one bad file does not abort the scan.
        """
        try:
            image = self._load_rgb(image_path)
            boxes, _ = self.mtcnn.detect(image)
            return boxes is not None and len(boxes) == 1
        except Exception as e:
            print(f"Error processing {image_path}: {str(e)}")
            return False

    def __len__(self):
        """Return the number of images that passed the filters."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(face, label_index)`` for the image at ``idx``.

        Best-effort: if the image cannot be read, no face is found, or the
        detector raises, a zero tensor of shape ``(3, 160, 160)`` is returned
        with the real label so that batches keep a uniform shape.
        """
        label = self.labels[idx]
        try:
            # Opening is inside the try so an unreadable file takes the same
            # fallback path as a failed detection.
            image = self._load_rgb(self.image_paths[idx])
            face = self.mtcnn(image)
        except Exception as e:
            print(f"Error processing {self.image_paths[idx]}: {str(e)}")
            return torch.zeros((3, 160, 160)), label

        if face is None:
            return torch.zeros((3, 160, 160)), label
        return face, label

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def train_model(image_dir, names_file_path, batch_size=32, num_epochs=10, learning_rate=0.001,
                num_workers=4):
    """
    Train the facial recognition model.

    A pretrained InceptionResnetV1 (VGGFace2 weights) is built in
    classification mode with one output per name found in the dataset, then
    trained with cross-entropy loss and Adam.

    Args:
        image_dir (str): Directory containing face images
        names_file_path (str): Path to the file containing valid names
        batch_size (int): Batch size for training
        num_epochs (int): Number of training epochs
        learning_rate (float): Learning rate for optimization
        num_workers (int): Number of DataLoader worker processes. The dataset
            runs its face detector inside ``__getitem__``; if that detector
            lives on the GPU, worker processes may be unable to use it, so
            pass 0 in that case (NOTE(review): confirm on the target platform).

    Returns:
        model: Trained model
        dataset: The dataset object for reference

    Raises:
        ValueError: If no valid images are found.
    """
    dataset = FaceDataset(image_dir, names_file_path)
    if len(dataset) == 0:
        raise ValueError("No valid images found in the dataset")

    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers
    )

    # Build the network with its own classification head. Replacing
    # `last_linear` with Linear(512, num_classes) does not work: that layer
    # consumes the pooled backbone features (not a 512-d vector) and is
    # followed by a 512-feature batch norm. With classify=True the model adds
    # a `logits` layer of size num_classes on top of the 512-d embedding.
    num_classes = len(dataset.name_to_idx)
    model = InceptionResnetV1(
        pretrained='vggface2',
        classify=True,
        num_classes=num_classes
    )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    if device.type == 'cuda':
        print("Using GPU for training")

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    print(f"\nStarting training with {num_epochs} epochs...")
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0

        progress_bar = tqdm(dataloader, desc=f'Epoch {epoch+1}/{num_epochs}')
        for i, (faces, labels) in enumerate(progress_bar):
            faces = faces.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            outputs = model(faces)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            # Running statistics for the progress bar and epoch summary.
            running_loss += loss.item()
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            progress_bar.set_postfix({
                'loss': f'{running_loss/(i+1):.3f}',
                'acc': f'{100*correct/total:.2f}%'
            })

        epoch_loss = running_loss / len(dataloader)
        epoch_acc = 100 * correct / total
        print(f'Epoch {epoch+1} - Loss: {epoch_loss:.3f}, Accuracy: {epoch_acc:.2f}%')

    print("Training completed!")
    return model, dataset

def save_model(model, save_path):
    """Save the trained model's weights (its ``state_dict``) to ``save_path``.

    Only the weights are stored; the name-to-index mapping of the dataset is
    not part of the file.
    """
    torch.save(model.state_dict(), save_path)
    print(f"Model saved to {save_path}")

def load_model(model_path, num_classes):
    """Load a trained model.

    Rebuilds the same architecture that ``train_model`` trains (classification
    mode with ``num_classes`` outputs, no pretrained download) and loads the
    saved weights into it.

    Args:
        model_path (str): Path to a file written by ``save_model``
        num_classes (int): Number of classes the model was trained with

    Returns:
        model: The model with loaded weights, on the CPU. Call ``.eval()``
        before using it for inference.
    """
    model = InceptionResnetV1(pretrained=None, classify=True, num_classes=num_classes)
    # map_location lets weights saved from a GPU run load on a CPU-only machine.
    model.load_state_dict(torch.load(model_path, map_location='cpu'))
    return model

# Example usage: train on a folder of images, persist the weights, then
# reload them into a fresh model.
if __name__ == "__main__":
    # The first two arguments are image_dir and names_file_path.
    model, dataset = train_model(
        "C:/Users/Mikae/Documents/MEGA/Nedladdade från webb etc/1",
        "path/to/names.txt",
        num_epochs=10,
        batch_size=32,
    )

    # Write the trained weights to disk.
    save_model(model, "facial_recognition_model.pth")

    # Reload them; the class count must match the one used for training.
    loaded_model = load_model(
        "facial_recognition_model.pth",
        len(dataset.name_to_idx),
    )