In [2]:
# ==================================================================================#
#                                   Import Required Libraries                       #
# ==================================================================================#
import cv2
import numpy as np
import os
from skimage import measure
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
import joblib
# ==================================================================================#
#                                 Signature Detection Function                      #
# ==================================================================================#

def detect_signature(image, top_crop_pct=45, bottom_crop_pct=20, side_crop_pct=10):
    """
    Detect the signature in an image and return the extracted signature.

    The image is first cropped to the band where a signature is expected,
    binarized with Otsu's threshold, dilated to merge nearby strokes, and
    split into connected components. Components that are both large
    (at least 4x the average component area) and wide (aspect ratio > 1.5)
    are kept; the bounding box of the largest remaining contour is cut out
    of the original image.

    Parameters:
        image (numpy.ndarray): The input image in which to detect the signature.
            Colour input is converted with cv2.COLOR_BGR2GRAY; a single-channel
            image is used as-is.
        top_crop_pct (float): Percentage of the height discarded from the top.
        bottom_crop_pct (float): Percentage of the height discarded from the bottom.
        side_crop_pct (float): Percentage of the width discarded from each side.

    Returns:
        Cropped image containing the signature if detected, otherwise None.
    """
    # Guard against missing or empty input before handing anything to OpenCV
    if image is None or getattr(image, "size", 0) == 0:
        print("No signature detected in the image.")
        return None

    # shape[:2] works for both colour (H, W, C) and grayscale (H, W) arrays
    height, width = image.shape[:2]

    # Calculate pixel values to crop
    top_cut = int((top_crop_pct / 100) * height)
    bottom_cut = int((bottom_crop_pct / 100) * height)
    side_cut = int((side_crop_pct / 100) * width)

    # Apply cropping to focus on the area where the signature is likely to be
    cropped_image = image[top_cut:height - bottom_cut, side_cut:width - side_cut]
    if cropped_image.size == 0:
        # Very small images (or aggressive percentages) leave nothing to analyse
        print("No signature detected in the image.")
        return None

    # Convert to grayscale unless the input is already single-channel
    if cropped_image.ndim == 2 or cropped_image.shape[2] == 1:
        gray = np.ascontiguousarray(cropped_image.reshape(cropped_image.shape[:2]))
    else:
        gray = cv2.cvtColor(cropped_image, cv2.COLOR_BGR2GRAY)

    # Binarize the image using Otsu's thresholding (ink becomes foreground)
    _, binary_image = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Dilate with a wide kernel so neighbouring strokes merge into one component
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 3))
    dilated_image = cv2.dilate(binary_image, kernel, iterations=2)

    # Label connected regions
    blobs_labels = measure.label(dilated_image, connectivity=2, background=0)
    regions = measure.regionprops(blobs_labels)

    # Components smaller than 4x the average area are treated as noise/text
    average_area = sum(region.area for region in regions) / len(regions) if regions else 0
    small_size_threshold = 4 * average_area

    # Collect the labels of components that are large enough and wide enough
    kept_labels = []
    for region in regions:
        minr, minc, maxr, maxc = region.bbox
        aspect_ratio = (maxc - minc) / (maxr - minr)
        if region.area >= small_size_threshold and aspect_ratio > 1.5:
            kept_labels.append(region.label)

    if not kept_labels:
        print("No signature detected in the image.")
        return None

    # Build the mask in one pass instead of one full-image comparison per region
    mask = np.isin(blobs_labels, kept_labels).astype(np.uint8) * 255

    # Find contours in the mask
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        print("No signature detected in the image.")
        return None

    # The largest contour is probably the signature
    largest_contour = max(contours, key=cv2.contourArea)
    x, y, w, h = cv2.boundingRect(largest_contour)

    # Contour coordinates are relative to the crop; shift back to the original image
    x_original = x + side_cut
    y_original = y + top_cut

    # Extract the signature region
    return image[y_original:y_original + h, x_original:x_original + w]
# ==================================================================================#
#                 Function to extract SIFT features from an image                   #
# ==================================================================================#
def extract_sift_features(image):
    """
    Compute SIFT keypoints and descriptors for an image.

    Parameters:
        image: Image passed to cv2.cvtColor with cv2.COLOR_BGR2GRAY.

    Returns:
        tuple: (keypoints, descriptors) as returned by SIFT's detectAndCompute.
    """
    # SIFT is run on the grayscale version of the image, with no mask
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    detector = cv2.SIFT_create()
    found_keypoints, found_descriptors = detector.detectAndCompute(grayscale, None)
    return found_keypoints, found_descriptors
# ==================================================================================#
#                        Function to upscale small images                           #
# ==================================================================================#

def upscale_image(image, scale_factor=3):
    """
    Resize an image by a constant factor using bicubic interpolation.

    Parameters:
        image: Image array; only its first two shape entries (height, width) are read.
        scale_factor: Multiplier applied to both dimensions (results truncated to int).

    Returns:
        The resized image produced by cv2.resize.
    """
    src_height, src_width = image.shape[:2]
    # cv2.resize expects the target size as (width, height)
    target_size = (int(src_width * scale_factor), int(src_height * scale_factor))
    return cv2.resize(image, target_size, interpolation=cv2.INTER_CUBIC)
# ==================================================================================#
#                             Loading Signature Database                            #
# ==================================================================================#

# Root folder of the signature database; each sub-folder name is used as a label
signature_database_path = "signature_database"  # Replace with your actual path

# One record (keypoints, descriptors, label, image) per usable reference signature
database_signatures = []

# Walk the database in sorted order so the resulting list is deterministic
for folder in sorted(os.listdir(signature_database_path)):
    employee_folder_path = os.path.join(signature_database_path, folder)
    if not os.path.isdir(employee_folder_path):
        continue

    for file in sorted(os.listdir(employee_folder_path)):
        # Only image files with a known extension are considered
        if not file.lower().endswith((".png", ".jpg", ".jpeg", ".tif")):
            continue

        image_path = os.path.join(employee_folder_path, file)
        signature_image = cv2.imread(image_path)
        if signature_image is None:
            print(f"Could not load image: {image_path}")
            continue

        # Upscale if the image is too small
        height, width = signature_image.shape[:2]
        if height < 100 or width < 100:
            signature_image = upscale_image(signature_image, scale_factor=3)

        # Extract SIFT features; skip images that yield none
        keypoints, descriptors = extract_sift_features(signature_image)
        if descriptors is None or len(keypoints) == 0:
            print(f"No descriptors found in image: {image_path}")
            continue

        database_signatures.append({
            'keypoints': keypoints,
            'descriptors': descriptors,
            'label': folder,
            'image': signature_image
        })

print(f"SIFT features extracted for {len(database_signatures)} database signatures.")
# ==================================================================================#
#                             Train Isolation Forest Model                          #
# ==================================================================================#

def train_isolation_forest(database_signatures, *, contamination=0.25,
                           n_estimators=150, random_state=42):
    """
    Train an Isolation Forest to detect signatures that do not belong to the database.

    Every SIFT descriptor row from every database signature is treated as one
    training sample.

    Parameters:
        database_signatures: List of dicts, each with a 'descriptors' entry
            (array of SIFT descriptors, or None to be skipped).
        contamination: Passed to IsolationForest (keyword-only).
        n_estimators: Passed to IsolationForest (keyword-only).
        random_state: Passed to IsolationForest (keyword-only).

    Returns:
        IsolationForest: Trained Isolation Forest model.

    Raises:
        ValueError: If the database contains no descriptors to train on.
    """
    # Collect the per-signature descriptor arrays, ignoring signatures without any
    descriptor_blocks = [
        data['descriptors']
        for data in database_signatures
        if data['descriptors'] is not None
    ]
    if not descriptor_blocks:
        # np.vstack would fail here with an unrelated-looking message
        raise ValueError(
            "Cannot train Isolation Forest: no SIFT descriptors in the signature database."
        )

    # Stack into a single (total_descriptors, descriptor_length) array
    all_descriptors = np.vstack(descriptor_blocks)

    # Initialize and train the Isolation Forest model
    iso_forest = IsolationForest(
        contamination=contamination,
        n_estimators=n_estimators,
        max_samples='auto',
        random_state=random_state,
    )
    iso_forest.fit(all_descriptors)

    return iso_forest

# Train the Isolation Forest using the existing database signatures.
# This runs when the module is executed; the resulting model is read as a
# module-level global by verify_signature_with_isolation_forest.
iso_forest_model = train_isolation_forest(database_signatures)

# ==================================================================================#
#   Function to verify the signature against the database using Isolation Forest    #
# ==================================================================================#

def verify_signature_with_isolation_forest(signature_image):
    """
    Verify if the extracted signature matches any in the database using SIFT and Isolation Forest.

    The query descriptors are first screened by the module-level
    ``iso_forest_model``; if the majority are flagged as outliers the signature
    is rejected. Otherwise the descriptors are matched against every entry of
    the module-level ``database_signatures`` and the entry with the most
    ratio-test matches wins, provided it stands out from the rest.

    Parameters:
        signature_image: The cropped signature image.

    Returns:
        str: The label of the best match, or 'None' if no good match is found.
    """
    # Extract SIFT features from the signature image
    keypoints_query, descriptors_query = extract_sift_features(signature_image)
    if descriptors_query is None or len(keypoints_query) == 0:
        print("No descriptors found in the signature image.")
        return "None"

    # predict() yields +1 (inlier) / -1 (outlier) per descriptor; a negative
    # sum means outliers are in the majority
    if iso_forest_model.predict(descriptors_query).sum() < 0:
        print("The signature is considered an anomaly and does not match the database.")
        return "None"

    # Nothing to compare against: max() below would fail on an empty list
    if not database_signatures:
        print("The signature does not match any in the database.")
        return "None"

    bf = cv2.BFMatcher(cv2.NORM_L2, crossCheck=False)

    match_counts = []  # Number of good matches for each database signature
    labels = []        # Corresponding labels

    for data in database_signatures:
        # Match descriptors using k-NN
        matches = bf.knnMatch(descriptors_query, data['descriptors'], k=2)

        # Apply Lowe's ratio test. knnMatch can return fewer than two
        # neighbours (e.g. a reference with a single descriptor), so pairs
        # that are not complete are skipped instead of being unpacked.
        num_good_matches = sum(
            1
            for pair in matches
            if len(pair) == 2 and pair[0].distance < 0.65 * pair[1].distance
        )

        match_counts.append(num_good_matches)
        labels.append(data['label'])

    # Identify the best match and the runner-up
    best_match_count = max(match_counts)
    best_label = labels[match_counts.index(best_match_count)]
    match_counts_sorted = sorted(match_counts, reverse=True)
    # NOTE(review): the runner-up may carry the same label as the best match
    # when a person has several reference signatures - confirm this is intended.
    second_best_match_count = match_counts_sorted[1] if len(match_counts_sorted) > 1 else 0

    # Decision criteria based on match distribution: the winner must sit
    # above mean + 0.75 * std and beat the runner-up by at least 5 matches
    distribution_threshold = np.mean(match_counts) + 0.75 * np.std(match_counts)
    significant_difference = best_match_count - second_best_match_count >= 5

    if best_match_count >= distribution_threshold and significant_difference:
        print(f"The signature belongs to: {best_label} with {best_match_count} good matches.")
        return best_label

    print("The signature does not match any in the database.")
    return "None"
# ==================================================================================#
#      Function to process a document and verify its signature                      #
# ==================================================================================#

def process_document(document_image_path):
    """
    Load a document image, locate its signature and identify the signer.

    Parameters:
        document_image_path (str): The file path to the document image.

    Returns:
        str: The predicted label, or the string "None" when the image cannot
        be loaded, no signature is detected, or no match is found.
    """
    document_image = cv2.imread(document_image_path)
    if document_image is None:
        print("Could not load the document image.")
        return "None"

    signature_image = detect_signature(document_image)
    if signature_image is None:
        print("Signature not detected in the document.")
        return "None"

    # Small crops are enlarged before feature extraction
    sig_height, sig_width = signature_image.shape[:2]
    if min(sig_height, sig_width) < 100:
        signature_image = upscale_image(signature_image, scale_factor=1.5)

    # Verify the signature using Isolation Forest
    predicted_label = verify_signature_with_isolation_forest(signature_image)

    if predicted_label != "None":
        print(f"The signature belongs to: {predicted_label}")
    else:
        print("The signature does not match any in the database.")
    return predicted_label
# ==================================================================================#
#                           Evaluation and Performance                              #
# ==================================================================================#

if __name__ == "__main__":
    # Inputs: a folder of test documents and a CSV with 'filename' / 'true_label' columns
    test_documents_folder = 'test_documents'  # Replace with the actual path to your test documents folder
    csv_path = 'test_document_labels.csv'  # Adjust path if needed

    # Read the ground truth; missing labels become the string 'None'
    df = pd.read_csv(csv_path)
    df['true_label'] = df['true_label'].fillna('None').astype(str)

    # File name -> true label lookup
    true_label_map = dict(zip(df['filename'], df['true_label']))

    # Parallel lists of expected and predicted labels, one entry per test file
    y_true, y_pred = [], []

    for file_name in sorted(os.listdir(test_documents_folder)):
        print(f"Processing file: {file_name}")
        predicted_label = process_document(os.path.join(test_documents_folder, file_name))
        print()  # Blank line between documents for readability

        # Files absent from the CSV are treated as having no known signer
        y_true.append(true_label_map.get(file_name, 'None'))
        y_pred.append(predicted_label)

    # Use the union of seen labels so both metrics share the same label order
    unique_labels = sorted(set(y_true).union(y_pred), key=str)
    conf_matrix = confusion_matrix(y_true, y_pred, labels=unique_labels)
    classification_rep = classification_report(
        y_true, y_pred, labels=unique_labels, zero_division=0
    )

    # Display the results
    print("Confusion Matrix:")
    print(conf_matrix)
    print("\nClassification Report:")
    print(classification_rep)


SIFT features extracted for 18 database signatures.
Processing file: Employee_1.tif
The signature belongs to: J.R Idle with 21 good matches.
The signature belongs to: J.R Idle

Processing file: Employee_10.tif
The signature belongs to: Christopher J with 110 good matches.
The signature belongs to: Christopher J

Processing file: Employee_11.tif
The signature belongs to: Vincent F with 58 good matches.
The signature belongs to: Vincent F

Processing file: Employee_12.tif
The signature belongs to: Vigo G Nielsen with 32 good matches.
The signature belongs to: Vigo G Nielsen

Processing file: Employee_13.tif
The signature belongs to: H. Thomas with 199 good matches.
The signature belongs to: H. Thomas

Processing file: Employee_14.tif
The signature belongs to: O. Stuhl with 37 good matches.
The signature belongs to: O. Stuhl

Processing file: Employee_15.tif
The signature belongs to: Eppin Pipp with 160 good matches.
The signature belongs to: Eppin Pipp

Processing file: Employee_16.tif
T